diff --git a/src/calibre/constants.py b/src/calibre/constants.py index 7ac71da553..dc7c0ff11f 100644 --- a/src/calibre/constants.py +++ b/src/calibre/constants.py @@ -507,19 +507,3 @@ def bundled_binaries_dir() -> str: if (islinux or isbsd) and getattr(sys, 'frozen', False): return os.path.join(sys.executables_location, 'bin') return '' - - -@lru_cache(2) -def piper_cmdline() -> tuple[str, ...]: - ext = '.exe' if iswindows else '' - if bbd := bundled_binaries_dir(): - if ismacos: - return (os.path.join(sys.frameworks_dir, 'piper', 'piper'),) - return (os.path.join(bbd, 'piper', 'piper' + ext),) - if pd := os.environ.get('PIPER_TTS_DIR'): - return (os.path.join(pd, 'piper' + ext),) - import shutil - exe = shutil.which('piper-tts') - if exe: - return (exe,) - return () diff --git a/src/calibre/utils/tts/piper.cpp b/src/calibre/utils/tts/piper.cpp index aa492ae76c..de363d26a4 100644 --- a/src/calibre/utils/tts/piper.cpp +++ b/src/calibre/utils/tts/piper.cpp @@ -8,6 +8,10 @@ #include #include +#include +#include +#include +#include #define CLAUSE_INTONATION_FULL_STOP 0x00000000 #define CLAUSE_INTONATION_COMMA 0x00001000 @@ -25,27 +29,42 @@ #define CLAUSE_COLON (30 | CLAUSE_INTONATION_FULL_STOP | CLAUSE_TYPE_CLAUSE) #define CLAUSE_SEMICOLON (30 | CLAUSE_INTONATION_COMMA | CLAUSE_TYPE_CLAUSE) +typedef char32_t Phoneme; +typedef int64_t PhonemeId; +typedef int64_t SpeakerId; +typedef std::map> PhonemeIdMap; + static bool initialized = false, voice_set = false; +static char espeak_data_dir[512] = {0}; +static PhonemeIdMap current_phoneme_id_map; +static int current_sample_rate = 0; +static int current_num_speakers = 1; +static float current_length_scale = 1; +static float current_noise_scale = 1; +static float current_noise_w = 1; +std::unique_ptr session; static PyObject* initialize(PyObject *self, PyObject *args) { - const char *path = NULL; + const char *path = ""; if (!PyArg_ParseTuple(args, "|s", &path)) return NULL; - if (initialized) { PyErr_SetString(PyExc_Exception, "initialize() already called"); return NULL; } - if (path && !path[0]) path = NULL; // use default path - if (espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS, 0, path, 0) < 0) { - PyErr_Format(PyExc_ValueError, "Could not initialize espeak-ng with datadir: %s", path ? path : ""); - return NULL; + if (!initialized || strcmp(espeak_data_dir, path) != 0) { + if (espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS, 0, path && path[0] ? path : NULL, 0) < 0) { + PyErr_Format(PyExc_ValueError, "Could not initialize espeak-ng with datadir: %s", path ? path : ""); + return NULL; + } + initialized = true; + snprintf(espeak_data_dir, sizeof(espeak_data_dir), "%s", path); } Py_RETURN_NONE; } static PyObject* set_espeak_voice_by_name(PyObject *self, PyObject *pyname) { - if (!PyUnicode_Check(pyname)) { PyErr_SetString(PyExc_TypeError, "name must be a unicode string"); return NULL; } + if (!PyUnicode_Check(pyname)) { PyErr_SetString(PyExc_TypeError, "espeak voice name must be a unicode string"); return NULL; } if (!initialized) { PyErr_SetString(PyExc_Exception, "must call initialize() first"); return NULL; } if (espeak_SetVoiceByName(PyUnicode_AsUTF8(pyname)) < 0) { - PyErr_Format(PyExc_ValueError, "failed to set voice: %U", pyname); + PyErr_Format(PyExc_ValueError, "failed to set espeak voice: %U", pyname); return NULL; } Py_RETURN_NONE; @@ -86,12 +105,67 @@ phonemize(PyObject *self, PyObject *pytext) { return phonemes_and_terminators; } +static PyObject* +set_voice(PyObject *self, PyObject *args) { + PyObject *cfg; const char *model_path; + if (!PyArg_ParseTuple(args, "Os", &cfg, &model_path)) return NULL; + + PyObject *evn = PyObject_GetAttrString(cfg, "espeak_voice_name"); + if (!evn) return NULL; + PyObject *ret = set_espeak_voice_by_name(NULL, evn); + Py_CLEAR(evn); + if (ret == NULL) return NULL; + Py_DECREF(ret); + +#define G(name, dest, conv) { \ + PyObject *sr = PyObject_GetAttrString(cfg, #name); \ + if (!sr) return NULL; \ + dest = conv(sr); \ + Py_CLEAR(sr); \ +} + G(sample_rate, current_sample_rate, PyLong_AsLong); + G(num_speakers, current_num_speakers, PyLong_AsLong); + G(length_scale, current_length_scale, PyFloat_AsDouble); + G(noise_scale, current_noise_scale, PyFloat_AsDouble); + G(noise_w, current_noise_w, PyFloat_AsDouble); +#undef G + + PyObject *map = PyObject_GetAttrString(cfg, "phoneme_id_map"); + if (!map) return NULL; + current_phoneme_id_map.clear(); + PyObject *key, *value; Py_ssize_t pos = 0; + while (PyDict_Next(map, &pos, &key, &value)) { + unsigned long cp = PyLong_AsUnsignedLong(key); + std::vector ids; + for (Py_ssize_t i = 0; i < PyList_GET_SIZE(value); i++) { + unsigned long id = PyLong_AsUnsignedLong(PyList_GET_ITEM(value, i)); + ids.push_back(id); + } + current_phoneme_id_map[cp] = ids; + } + Py_CLEAR(map); + + // Load onnx model + Ort::SessionOptions opts; + opts.DisableCpuMemArena(); + opts.DisableMemPattern(); + opts.DisableProfiling(); + Ort::Env ort_env{ORT_LOGGING_LEVEL_WARNING, "piper"}; + session.reset(); + session = std::make_unique(Ort::Session(ort_env, model_path, opts)); + + Py_RETURN_NONE; +} + // Boilerplate {{{ static char doc[] = "Text to speech using the Piper TTS models"; static PyMethodDef methods[] = { {"initialize", (PyCFunction)initialize, METH_VARARGS, "initialize(espeak_data_dir) -> Initialize this module. Must be called once before using any other functions from this module. If espeak_data_dir is not specified or is the mepty string the default data location is used." }, + {"set_voice", (PyCFunction)set_voice, METH_VARARGS, + "set_voice(voice_config, model_path) -> Load the model in preparation for synthesis." + }, {"set_espeak_voice_by_name", (PyCFunction)set_espeak_voice_by_name, METH_O, "set_espeak_voice_by_name(name) -> Set the voice to be used to phonemize text" }, @@ -117,6 +191,8 @@ cleanup_module(void*) { voice_set = false; espeak_Terminate(); } + current_phoneme_id_map.clear(); + session.reset(); } CALIBRE_MODINIT_FUNC PyInit_piper(void) { diff --git a/src/calibre/utils/tts/piper.py b/src/calibre/utils/tts/piper.py index 88610e3f37..f51d584bc8 100644 --- a/src/calibre/utils/tts/piper.py +++ b/src/calibre/utils/tts/piper.py @@ -1,6 +1,63 @@ #!/usr/bin/env python # License: GPLv3 Copyright: 2025, Kovid Goyal +import json +from typing import Any, NamedTuple + import calibre_extensions.piper as piper -piper +DEFAULT_LENGTH_SCALE = 1.0 +DEFAULT_NOISE_SCALE = 0.667 +DEFAULT_NOISE_W_SCALE = 0.8 + + +class VoiceConfig(NamedTuple): + espeak_voice_name: str + sample_rate: int + phoneme_id_map: dict[int, list[int]] + length_scale: float + noise_scale: float + noise_w: float + num_speakers: int + + sentence_delay: float = 0 + + +def translate_voice_config(x: Any) -> VoiceConfig: + phoneme_id_map: dict[int, list[int]] = {} + for s, pid in x.get('phoneme_id_map', {}).items(): + if s: + phoneme_id_map.setdefault(ord(s[0]), []).append(pid) + inf = x.get('inference') + + def g(d, prop, defval): + ans = d.get(prop, VoiceConfig) + if ans is VoiceConfig: + ans = defval + return ans + + return VoiceConfig( + espeak_voice_name=x.get('espeak', {}).get('voice') or 'en-us', + sample_rate=int(g(x.get('audio', {}), 'sample_rate', 22050)), + phoneme_id_map=phoneme_id_map, + length_scale=float(g(inf, 'length_scale', DEFAULT_LENGTH_SCALE)), + noise_scale=float(g(inf, 'noise_scale', DEFAULT_NOISE_SCALE)), + noise_w=float(g(inf, 'noise_w', DEFAULT_NOISE_W_SCALE)), + num_speakers=int(g(x, 'num_speakers', 1)), + ) + + +def load_voice_config(path: str) -> VoiceConfig: + with open(path, 'rb') as f: + return translate_voice_config(json.load(f)) + + +def espeak_data_dir() -> str: + return '' # TODO: get the correct path when using frozen builds + + +def set_voice(config_path: str, model_path:str, length_scale_multiplier: float, sentence_delay: float) -> None: + piper.initialize(espeak_data_dir()) + cfg = load_voice_config(config_path) + cfg = cfg._replace(sentence_delay=sentence_delay, length_scale=cfg.length_scale * length_scale_multiplier) + piper.set_voice(cfg, model_path)