diff --git a/src/calibre/utils/tts/piper.cpp b/src/calibre/utils/tts/piper.cpp
index 2d2b49989c..6a03bdadde 100644
--- a/src/calibre/utils/tts/piper.cpp
+++ b/src/calibre/utils/tts/piper.cpp
@@ -50,6 +50,7 @@ static int current_num_speakers = 1;
 static float current_length_scale = 1;
 static float current_noise_scale = 1;
 static float current_noise_w = 1;
+static float current_sentence_delay = 0;
 std::unique_ptr<Ort::Session> session;
 std::queue<std::vector<int64_t>> phoneme_id_queue;
 std::vector<float> chunk_samples;
@@ -158,6 +159,7 @@ set_voice(PyObject *self, PyObject *args) {
     G(length_scale, current_length_scale, PyFloat_AsDouble);
     G(noise_scale, current_noise_scale, PyFloat_AsDouble);
     G(noise_w, current_noise_w, PyFloat_AsDouble);
+    G(sentence_delay, current_sentence_delay, PyFloat_AsDouble);
 #undef G
 
     PyObject *map = PyObject_GetAttrString(cfg, "phoneme_id_map");
@@ -341,22 +343,32 @@ next(PyObject *self, PyObject *args) {
     audio_tensor_data = output_tensors.front().GetTensorData<float>();
     Py_END_ALLOW_THREADS;
 
-    PyObject *ans = NULL;
+    PyObject *ans = NULL, *data = NULL;
+    // Inter-sentence pause: append this many zero samples of silence after the audio.
+    int num_of_silence_samples = 0;
+    if (current_sentence_delay > 0) num_of_silence_samples = current_sample_rate * current_sentence_delay;
     if (as_16bit_samples) {
-        PyObject *data = PyBytes_FromStringAndSize(NULL, sizeof(int16_t) * num_samples);
+        data = PyBytes_FromStringAndSize(NULL, sizeof(int16_t) * (num_samples + num_of_silence_samples));
         if (data) {
-            int16_t *x = (int16_t*)PyBytes_AS_STRING(data);
             Py_BEGIN_ALLOW_THREADS;
+            int16_t *x = (int16_t*)PyBytes_AS_STRING(data);
             for (int i = 0; i < num_samples; i++) {
                 x[i] = std::max(-1.f, std::min(audio_tensor_data[i], 1.f)) * std::numeric_limits<int16_t>::max();
             }
+            memset(x + num_samples, 0, num_of_silence_samples * sizeof(int16_t));
             Py_END_ALLOW_THREADS;
-            ans = Py_BuildValue(
-                "NiiO", data, sizeof(float)*num_samples, num_samples, current_sample_rate,
-                phoneme_id_queue.empty() ? Py_True : Py_False);
         }
     } else {
+        // sizeof(float) * (num_samples + num_of_silence_samples): audio plus the silence tail.
+        data = PyBytes_FromStringAndSize(NULL, sizeof(float) * (num_samples + num_of_silence_samples));
+        if (data) {
+            Py_BEGIN_ALLOW_THREADS;
+            float *x = (float*)PyBytes_AS_STRING(data);
+            memcpy(x, audio_tensor_data, sizeof(float) * num_samples);
+            memset(x + num_samples, 0, num_of_silence_samples * sizeof(float));
+            Py_END_ALLOW_THREADS;
+        }
+    }
+    if (data) {
+        // N steals the reference to data; report the total sample count including the silence tail.
         ans = Py_BuildValue(
-            "y#iiO", audio_tensor_data, sizeof(float)*num_samples, num_samples, current_sample_rate,
+            "NiiO", data, num_samples + num_of_silence_samples, current_sample_rate,
             phoneme_id_queue.empty() ? Py_True : Py_False);
     }
diff --git a/src/calibre/utils/tts/piper.py b/src/calibre/utils/tts/piper.py
index f51d584bc8..35a97eb4be 100644
--- a/src/calibre/utils/tts/piper.py
+++ b/src/calibre/utils/tts/piper.py
@@ -19,8 +19,7 @@ class VoiceConfig(NamedTuple):
     noise_scale: float
     noise_w: float
     num_speakers: int
-
-    sentence_delay: float = 0
+    sentence_delay: float = 0  # pause between sentences in seconds; default keeps translate_voice_config() working
 
 
 def translate_voice_config(x: Any) -> VoiceConfig:
@@ -56,8 +55,9 @@ def espeak_data_dir() -> str:
     return ''  # TODO: get the correct path when using frozen builds
 
 
-def set_voice(config_path: str, model_path:str, length_scale_multiplier: float, sentence_delay: float) -> None:
+def set_voice(config_path: str, model_path:str, length_scale_multiplier: float = 0, sentence_delay: float = 0.2) -> None:
     piper.initialize(espeak_data_dir())
     cfg = load_voice_config(config_path)
-    cfg = cfg._replace(sentence_delay=sentence_delay, length_scale=cfg.length_scale * length_scale_multiplier)
+    m = max(0.1, 1.0 - max(-1.0, min(length_scale_multiplier, 1.0)))  # maps multiplier in [-1, 1] to a scale in [2, 0.1]
+    cfg = cfg._replace(sentence_delay=sentence_delay, length_scale=cfg.length_scale * m)
     piper.set_voice(cfg, model_path)