Implement sentence silence

This commit is contained in:
Kovid Goyal 2025-07-29 08:06:26 +05:30
parent 46002017cd
commit 7b19e19e29
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 24 additions and 12 deletions

View File

@ -50,6 +50,7 @@ static int current_num_speakers = 1;
static float current_length_scale = 1; static float current_length_scale = 1;
static float current_noise_scale = 1; static float current_noise_scale = 1;
static float current_noise_w = 1; static float current_noise_w = 1;
static float current_sentence_delay = 0;
std::unique_ptr<Ort::Session> session; std::unique_ptr<Ort::Session> session;
std::queue<std::vector<PhonemeId>> phoneme_id_queue; std::queue<std::vector<PhonemeId>> phoneme_id_queue;
std::vector<float> chunk_samples; std::vector<float> chunk_samples;
@ -158,6 +159,7 @@ set_voice(PyObject *self, PyObject *args) {
G(length_scale, current_length_scale, PyFloat_AsDouble); G(length_scale, current_length_scale, PyFloat_AsDouble);
G(noise_scale, current_noise_scale, PyFloat_AsDouble); G(noise_scale, current_noise_scale, PyFloat_AsDouble);
G(noise_w, current_noise_w, PyFloat_AsDouble); G(noise_w, current_noise_w, PyFloat_AsDouble);
G(sentence_delay, current_sentence_delay, PyFloat_AsDouble);
#undef G #undef G
PyObject *map = PyObject_GetAttrString(cfg, "phoneme_id_map"); PyObject *map = PyObject_GetAttrString(cfg, "phoneme_id_map");
@ -341,23 +343,33 @@ next(PyObject *self, PyObject *args) {
audio_tensor_data = output_tensors.front().GetTensorData<float>(); audio_tensor_data = output_tensors.front().GetTensorData<float>();
Py_END_ALLOW_THREADS; Py_END_ALLOW_THREADS;
PyObject *ans = NULL; PyObject *ans = NULL, *data = NULL;
int num_of_silence_samples = 0;
if (current_sentence_delay > 0) num_of_silence_samples = current_sample_rate * current_sentence_delay;
if (as_16bit_samples) { if (as_16bit_samples) {
PyObject *data = PyBytes_FromStringAndSize(NULL, sizeof(int16_t) * num_samples); data = PyBytes_FromStringAndSize(NULL, sizeof(int16_t) * (num_samples + num_of_silence_samples));
if (data) { if (data) {
int16_t *x = (int16_t*)PyBytes_AS_STRING(data);
Py_BEGIN_ALLOW_THREADS; Py_BEGIN_ALLOW_THREADS;
int16_t *x = (int16_t*)PyBytes_AS_STRING(data);
for (int i = 0; i < num_samples; i++) { for (int i = 0; i < num_samples; i++) {
x[i] = std::max(-1.f, std::min(audio_tensor_data[i], 1.f)) * std::numeric_limits<int16_t>::max(); x[i] = std::max(-1.f, std::min(audio_tensor_data[i], 1.f)) * std::numeric_limits<int16_t>::max();
} }
memset(x + num_samples, 0, num_of_silence_samples * sizeof(int16_t));
Py_END_ALLOW_THREADS; Py_END_ALLOW_THREADS;
ans = Py_BuildValue( }
"NiiO", data, sizeof(float)*num_samples, num_samples, current_sample_rate,
phoneme_id_queue.empty() ? Py_True : Py_False);
}
} else { } else {
data = PyBytes_FromStringAndSize(NULL, sizeof(float) * (num_samples + num_of_silence_samples));
if (data) {
Py_BEGIN_ALLOW_THREADS;
float *x = (float*)PyBytes_AS_STRING(data);
memcpy(x, audio_tensor_data, sizeof(float) * num_samples);
memset(x + num_samples, 0, num_of_silence_samples * sizeof(float));
Py_END_ALLOW_THREADS;
}
}
if (data) {
ans = Py_BuildValue( ans = Py_BuildValue(
"y#iiO", audio_tensor_data, sizeof(float)*num_samples, num_samples, current_sample_rate, "NiiiO", data, (int)PyBytes_GET_SIZE(data), num_samples + num_of_silence_samples, current_sample_rate,
phoneme_id_queue.empty() ? Py_True : Py_False); phoneme_id_queue.empty() ? Py_True : Py_False);
} }

View File

@ -19,8 +19,7 @@ class VoiceConfig(NamedTuple):
noise_scale: float noise_scale: float
noise_w: float noise_w: float
num_speakers: int num_speakers: int
sentence_delay: float
sentence_delay: float = 0
def translate_voice_config(x: Any) -> VoiceConfig: def translate_voice_config(x: Any) -> VoiceConfig:
@ -56,8 +55,9 @@ def espeak_data_dir() -> str:
return '' # TODO: get the correct path when using frozen builds return '' # TODO: get the correct path when using frozen builds
def set_voice(config_path: str, model_path:str, length_scale_multiplier: float, sentence_delay: float) -> None: def set_voice(config_path: str, model_path:str, length_scale_multiplier: float = 0, sentence_delay: float = 0.2) -> None:
piper.initialize(espeak_data_dir()) piper.initialize(espeak_data_dir())
cfg = load_voice_config(config_path) cfg = load_voice_config(config_path)
cfg = cfg._replace(sentence_delay=sentence_delay, length_scale=cfg.length_scale * length_scale_multiplier) m = max(0.1, 1 + -1 * max(-1, min(length_scale_multiplier, 1))) # maps multiplier in [-1, 1] onto scale in [2, 0.1]
cfg = cfg._replace(sentence_delay=sentence_delay, length_scale=cfg.length_scale * m)
piper.set_voice(cfg, model_path) piper.set_voice(cfg, model_path)