From 732d2c95e83f050876c4f2ddae590b1bd3584455 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 28 Jul 2025 11:07:54 +0530 Subject: [PATCH] API to synthesize text --- src/calibre/utils/tts/piper.cpp | 219 ++++++++++++++++++++++++++++++-- 1 file changed, 208 insertions(+), 11 deletions(-) diff --git a/src/calibre/utils/tts/piper.cpp b/src/calibre/utils/tts/piper.cpp index de363d26a4..f4ee41459b 100644 --- a/src/calibre/utils/tts/piper.cpp +++ b/src/calibre/utils/tts/piper.cpp @@ -12,6 +12,7 @@ #include <map> #include <vector> #include <memory> +#include <queue> #define CLAUSE_INTONATION_FULL_STOP 0x00000000 #define CLAUSE_INTONATION_COMMA 0x00001000 @@ -33,8 +34,12 @@ typedef char32_t Phoneme; typedef int64_t PhonemeId; typedef int64_t SpeakerId; typedef std::map<Phoneme, std::vector<PhonemeId>> PhonemeIdMap; +const PhonemeId ID_PAD = 0; // interleaved +const PhonemeId ID_BOS = 1; // beginning of sentence +const PhonemeId ID_EOS = 2; // end of sentence static bool initialized = false, voice_set = false; +PyObject *normalize_func = NULL; static char espeak_data_dir[512] = {0}; static PhonemeIdMap current_phoneme_id_map; static int current_sample_rate = 0; @@ -43,6 +48,11 @@ static float current_length_scale = 1; static float current_noise_scale = 1; static float current_noise_w = 1; std::unique_ptr<Ort::Session> session; +std::queue<std::vector<PhonemeId>> phoneme_id_queue; +std::vector<float> chunk_samples; +static struct { + PyObject *func, *args; +} normalize_data = {0}; static PyObject* initialize(PyObject *self, PyObject *args) { @@ -53,8 +63,17 @@ initialize(PyObject *self, PyObject *args) { PyErr_Format(PyExc_ValueError, "Could not initialize espeak-ng with datadir: %s", path ? 
path : ""); return NULL; } + Py_CLEAR(normalize_data.func); Py_CLEAR(normalize_data.args); initialized = true; snprintf(espeak_data_dir, sizeof(espeak_data_dir), "%s", path); + PyObject *unicodedata = PyImport_ImportModule("unicodedata"); + if (!unicodedata) return NULL; + normalize_data.func = PyObject_GetAttrString(unicodedata, "normalize"); + Py_CLEAR(unicodedata); + if (!normalize_data.func) return NULL; + normalize_data.args = Py_BuildValue("(ss)", "NFD", ""); + if (!normalize_data.args) return NULL; + // unicodedata was already released by Py_CLEAR() above; an extra Py_DECREF here would operate on NULL and crash } Py_RETURN_NONE; } @@ -67,9 +86,25 @@ set_espeak_voice_by_name(PyObject *self, PyObject *pyname) { PyErr_Format(PyExc_ValueError, "failed to set espeak voice: %U", pyname); return NULL; } + voice_set = true; Py_RETURN_NONE; } +static const char* +categorize_terminator(int terminator) { + const char *terminator_str = ""; + terminator &= 0x000FFFFF; + switch(terminator) { + case CLAUSE_PERIOD: terminator_str = "."; break; + case CLAUSE_QUESTION: terminator_str = "?"; break; + case CLAUSE_EXCLAMATION: terminator_str = "!"; break; + case CLAUSE_COMMA: terminator_str = ","; break; + case CLAUSE_COLON: terminator_str = ":"; break; + case CLAUSE_SEMICOLON: terminator_str = ";"; break; + } + return terminator_str; +} + static PyObject* phonemize(PyObject *self, PyObject *pytext) { if (!PyUnicode_Check(pytext)) { PyErr_SetString(PyExc_TypeError, "text must be a unicode string"); return NULL; } @@ -81,21 +116,13 @@ phonemize(PyObject *self, PyObject *pytext) { while (text != NULL) { int terminator = 0; - const char *terminator_str = "", *phonemes; + const char *phonemes; Py_BEGIN_ALLOW_THREADS; phonemes = espeak_TextToPhonemesWithTerminator( (const void **)&text, espeakCHARS_UTF8, espeakPHONEMES_IPA, &terminator); Py_END_ALLOW_THREADS; // Categorize terminator - terminator &= 0x000FFFFF; - switch(terminator) { - case CLAUSE_PERIOD: terminator_str = "."; break; - case CLAUSE_QUESTION: terminator_str = "?"; break; - case CLAUSE_EXCLAMATION: 
terminator_str = "!"; break; - case CLAUSE_COMMA: terminator_str = ","; break; - case CLAUSE_COLON: terminator_str = ":"; break; - case CLAUSE_SEMICOLON: terminator_str = ";"; break; - } + const char *terminator_str = categorize_terminator(terminator); PyObject *item = Py_BuildValue("(ssO)", phonemes, terminator_str, (terminator & CLAUSE_TYPE_SENTENCE) != 0 ? Py_True : Py_False); if (item == NULL) { Py_CLEAR(phonemes_and_terminators); return NULL; } int ret = PyList_Append(phonemes_and_terminators, item); @@ -146,6 +173,7 @@ set_voice(PyObject *self, PyObject *args) { Py_CLEAR(map); // Load onnx model + Py_BEGIN_ALLOW_THREADS; Ort::SessionOptions opts; opts.DisableCpuMemArena(); opts.DisableMemPattern(); @@ -153,19 +181,187 @@ set_voice(PyObject *self, PyObject *args) { Ort::Env ort_env{ORT_LOGGING_LEVEL_WARNING, "piper"}; session.reset(); session = std::make_unique<Ort::Session>(Ort::Session(ort_env, model_path, opts)); + Py_END_ALLOW_THREADS; Py_RETURN_NONE; } +static PyObject* +normalize(const char *text) { + PyObject *t = PyUnicode_FromString(text); + if (!t) return NULL; + if (PyTuple_SetItem(normalize_data.args, 1, t) != 0) { + // no Py_DECREF(t) here: PyTuple_SetItem steals the reference to t even on failure, so an extra decref would be a double-free + return NULL; + } + return PyObject_CallObject(normalize_data.func, normalize_data.args); +} + +static PyObject* +start(PyObject *self, PyObject *args) { + const char *text; + if (!PyArg_ParseTuple(args, "s", &text)) return NULL; + if (!voice_set || session.get() == NULL) { PyErr_SetString(PyExc_Exception, "must call set_voice() first"); return NULL; } + // Clear state + while (!phoneme_id_queue.empty()) phoneme_id_queue.pop(); + chunk_samples.clear(); + + // Convert to phonemes + std::vector<std::string> sentence_phonemes{""}; + Py_BEGIN_ALLOW_THREADS; + std::size_t current_idx = 0; + const void *text_ptr = text; + while (text_ptr != nullptr) { + int terminator = 0; + const char *phonemes = espeak_TextToPhonemesWithTerminator( + &text_ptr, espeakCHARS_UTF8, espeakPHONEMES_IPA, &terminator); + if (phonemes) sentence_phonemes[current_idx] += 
phonemes; + const char *terminator_str = categorize_terminator(terminator); + sentence_phonemes[current_idx] += terminator_str; + if ((terminator & CLAUSE_TYPE_SENTENCE) == CLAUSE_TYPE_SENTENCE) { + sentence_phonemes.push_back(""); + current_idx = sentence_phonemes.size() - 1; + } + } + Py_END_ALLOW_THREADS; + + // phonemes to ids + std::vector<PhonemeId> sentence_ids; + for (auto &phonemes_str : sentence_phonemes) { + if (phonemes_str.empty()) continue; + sentence_ids.push_back(ID_BOS); + sentence_ids.push_back(ID_PAD); + + PyObject *normalized_text = normalize(phonemes_str.c_str()); + if (!normalized_text) return NULL; + int kind = PyUnicode_KIND(normalized_text); void *data = PyUnicode_DATA(normalized_text); + + // Filter out (lang) switch (flags). + // These surround words from languages other than the current voice. + bool in_lang_flag = false; + for (Py_ssize_t i = 0; i < PyUnicode_GET_LENGTH(normalized_text); i++) { + char32_t ch = PyUnicode_READ(kind, data, i); + if (in_lang_flag) { + if (ch == U')') { + // End of (lang) switch + in_lang_flag = false; + } + } else if (ch == U'(') { + // Start of (lang) switch + in_lang_flag = true; + } else { + // Look up ids + auto ids_for_phoneme = current_phoneme_id_map.find(ch); + if (ids_for_phoneme != current_phoneme_id_map.end()) { + for (auto id : ids_for_phoneme->second) { + sentence_ids.push_back(id); + sentence_ids.push_back(ID_PAD); + } + } + } + } + Py_CLEAR(normalized_text); + sentence_ids.push_back(ID_EOS); + phoneme_id_queue.emplace(std::move(sentence_ids)); + sentence_ids.clear(); + } + Py_RETURN_NONE; +} + +static PyObject* +next(PyObject *self, PyObject *args) { + if (phoneme_id_queue.empty()) return Py_BuildValue("yiiO", "", 0, current_sample_rate, Py_True); + std::vector<Ort::Value> output_tensors; + std::vector<Ort::Value> input_tensors; + + Py_BEGIN_ALLOW_THREADS; + // Process next list of phoneme ids + auto next_ids = std::move(phoneme_id_queue.front()); + phoneme_id_queue.pop(); + + auto memoryInfo = Ort::MemoryInfo::CreateCpu( + 
OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault); + + // Allocate + std::vector<int64_t> phoneme_id_lengths{(int64_t)next_ids.size()}; + std::vector<float> scales{current_noise_scale, current_length_scale, current_noise_w}; + + std::vector<int64_t> phoneme_ids_shape{1, (int64_t)next_ids.size()}; + input_tensors.push_back(Ort::Value::CreateTensor<int64_t>( + memoryInfo, next_ids.data(), next_ids.size(), phoneme_ids_shape.data(), + phoneme_ids_shape.size())); + + std::vector<int64_t> phoneme_id_lengths_shape{ + (int64_t)phoneme_id_lengths.size()}; + input_tensors.push_back(Ort::Value::CreateTensor<int64_t>( + memoryInfo, phoneme_id_lengths.data(), phoneme_id_lengths.size(), + phoneme_id_lengths_shape.data(), phoneme_id_lengths_shape.size())); + + std::vector<int64_t> scales_shape{(int64_t)scales.size()}; + input_tensors.push_back(Ort::Value::CreateTensor<float>( + memoryInfo, scales.data(), scales.size(), scales_shape.data(), + scales_shape.size())); + + // Add speaker id. + // NOTE: These must be kept outside the "if" below to avoid being + // deallocated. 
+ std::vector<int64_t> speaker_id{(int64_t)0}; + std::vector<int64_t> speaker_id_shape{(int64_t)speaker_id.size()}; + + if (current_num_speakers > 1) { + input_tensors.push_back(Ort::Value::CreateTensor<int64_t>( + memoryInfo, speaker_id.data(), speaker_id.size(), + speaker_id_shape.data(), speaker_id_shape.size())); + } + + // From export_onnx.py + std::array<const char *, 4> input_names = {"input", "input_lengths", "scales", "sid"}; + std::array<const char *, 1> output_names = {"output"}; + + // Infer + output_tensors = session->Run( + Ort::RunOptions{nullptr}, input_names.data(), input_tensors.data(), + input_tensors.size(), output_names.data(), output_names.size()); + Py_END_ALLOW_THREADS; + + if ((output_tensors.size() != 1) || (!output_tensors.front().IsTensor())) { + PyErr_SetString(PyExc_ValueError, "failed to infer audio data from list of phoneme ids"); + return NULL; + } + + auto audio_shape = + output_tensors.front().GetTensorTypeAndShapeInfo().GetShape(); + int num_samples = audio_shape[audio_shape.size() - 1]; + + const float *audio_tensor_data = output_tensors.front().GetTensorData<float>(); + PyObject *ans = Py_BuildValue("y#iiO", audio_tensor_data, sizeof(float)*num_samples, num_samples, current_sample_rate, phoneme_id_queue.empty() ? Py_True : Py_False); + + // Clean up + for (std::size_t i = 0; i < output_tensors.size(); i++) { + Ort::detail::OrtRelease(output_tensors[i].release()); + } + for (std::size_t i = 0; i < input_tensors.size(); i++) { + Ort::detail::OrtRelease(input_tensors[i].release()); + } + return ans; +} + // Boilerplate {{{ static char doc[] = "Text to speech using the Piper TTS models"; static PyMethodDef methods[] = { {"initialize", (PyCFunction)initialize, METH_VARARGS, - "initialize(espeak_data_dir) -> Initialize this module. Must be called once before using any other functions from this module. If espeak_data_dir is not specified or is the mepty string the default data location is used." + "initialize(espeak_data_dir) -> Initialize this module. 
Must be called once before using any other functions from this module. If espeak_data_dir is not specified or is the empty string the default data location is used." }, {"set_voice", (PyCFunction)set_voice, METH_VARARGS, "set_voice(voice_config, model_path) -> Load the model in preparation for synthesis." }, + {"start", (PyCFunction)start, METH_VARARGS, + "start(text) -> Start synthesizing the specified text, call next() repeatedly to get the audiodata." + }, + {"next", (PyCFunction)next, METH_NOARGS, + "next() -> Return the next chunk of audio data (audio_data, num_samples, sample_rate, is_last). Here audio_data is a bytes object consisting of an array of floats in native endianness." + }, + {"set_espeak_voice_by_name", (PyCFunction)set_espeak_voice_by_name, METH_O, "set_espeak_voice_by_name(name) -> Set the voice to be used to phonemize text" }, @@ -193,6 +389,7 @@ cleanup_module(void*) { } current_phoneme_id_map.clear(); session.reset(); + Py_CLEAR(normalize_data.func); Py_CLEAR(normalize_data.args); } CALIBRE_MODINIT_FUNC PyInit_piper(void) {