mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-31 14:33:54 -04:00
Implement sentence silence
This commit is contained in:
parent
46002017cd
commit
7b19e19e29
@ -50,6 +50,7 @@ static int current_num_speakers = 1;
|
|||||||
static float current_length_scale = 1;
|
static float current_length_scale = 1;
|
||||||
static float current_noise_scale = 1;
|
static float current_noise_scale = 1;
|
||||||
static float current_noise_w = 1;
|
static float current_noise_w = 1;
|
||||||
|
static float current_sentence_delay = 0;
|
||||||
std::unique_ptr<Ort::Session> session;
|
std::unique_ptr<Ort::Session> session;
|
||||||
std::queue<std::vector<PhonemeId>> phoneme_id_queue;
|
std::queue<std::vector<PhonemeId>> phoneme_id_queue;
|
||||||
std::vector<float> chunk_samples;
|
std::vector<float> chunk_samples;
|
||||||
@ -158,6 +159,7 @@ set_voice(PyObject *self, PyObject *args) {
|
|||||||
G(length_scale, current_length_scale, PyFloat_AsDouble);
|
G(length_scale, current_length_scale, PyFloat_AsDouble);
|
||||||
G(noise_scale, current_noise_scale, PyFloat_AsDouble);
|
G(noise_scale, current_noise_scale, PyFloat_AsDouble);
|
||||||
G(noise_w, current_noise_w, PyFloat_AsDouble);
|
G(noise_w, current_noise_w, PyFloat_AsDouble);
|
||||||
|
G(sentence, current_sentence_delay, PyFloat_AsDouble);
|
||||||
#undef G
|
#undef G
|
||||||
|
|
||||||
PyObject *map = PyObject_GetAttrString(cfg, "phoneme_id_map");
|
PyObject *map = PyObject_GetAttrString(cfg, "phoneme_id_map");
|
||||||
@ -341,23 +343,33 @@ next(PyObject *self, PyObject *args) {
|
|||||||
audio_tensor_data = output_tensors.front().GetTensorData<float>();
|
audio_tensor_data = output_tensors.front().GetTensorData<float>();
|
||||||
Py_END_ALLOW_THREADS;
|
Py_END_ALLOW_THREADS;
|
||||||
|
|
||||||
PyObject *ans = NULL;
|
PyObject *ans = NULL, *data = NULL;
|
||||||
|
int num_of_silence_samples = 0;
|
||||||
|
if (current_sentence_delay > 0) num_of_silence_samples = current_sample_rate * current_sentence_delay;
|
||||||
if (as_16bit_samples) {
|
if (as_16bit_samples) {
|
||||||
PyObject *data = PyBytes_FromStringAndSize(NULL, sizeof(int16_t) * num_samples);
|
data = PyBytes_FromStringAndSize(NULL, sizeof(int16_t) * (num_samples + num_of_silence_samples));
|
||||||
if (data) {
|
if (data) {
|
||||||
int16_t *x = (int16_t*)PyBytes_AS_STRING(data);
|
|
||||||
Py_BEGIN_ALLOW_THREADS;
|
Py_BEGIN_ALLOW_THREADS;
|
||||||
|
int16_t *x = (int16_t*)PyBytes_AS_STRING(data);
|
||||||
for (int i = 0; i < num_samples; i++) {
|
for (int i = 0; i < num_samples; i++) {
|
||||||
x[i] = std::max(-1.f, std::min(audio_tensor_data[i], 1.f)) * std::numeric_limits<int16_t>::max();
|
x[i] = std::max(-1.f, std::min(audio_tensor_data[i], 1.f)) * std::numeric_limits<int16_t>::max();
|
||||||
}
|
}
|
||||||
|
memset(x + num_samples, 0, num_of_silence_samples * sizeof(int16_t));
|
||||||
Py_END_ALLOW_THREADS;
|
Py_END_ALLOW_THREADS;
|
||||||
ans = Py_BuildValue(
|
}
|
||||||
"NiiO", data, sizeof(float)*num_samples, num_samples, current_sample_rate,
|
|
||||||
phoneme_id_queue.empty() ? Py_True : Py_False);
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
|
data = PyBytes_FromStringAndSize(NULL, sizeof(float) * (num_samples * num_of_silence_samples));
|
||||||
|
if (data) {
|
||||||
|
Py_BEGIN_ALLOW_THREADS;
|
||||||
|
float *x = (float*)PyBytes_AS_STRING(data);
|
||||||
|
memcpy(x, audio_tensor_data, sizeof(float) * num_samples);
|
||||||
|
memset(x + num_samples, 0, num_of_silence_samples * sizeof(int16_t));
|
||||||
|
Py_END_ALLOW_THREADS;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (data) {
|
||||||
ans = Py_BuildValue(
|
ans = Py_BuildValue(
|
||||||
"y#iiO", audio_tensor_data, sizeof(float)*num_samples, num_samples, current_sample_rate,
|
"NiiO", data, sizeof(float)*num_samples, num_samples, current_sample_rate,
|
||||||
phoneme_id_queue.empty() ? Py_True : Py_False);
|
phoneme_id_queue.empty() ? Py_True : Py_False);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -19,8 +19,7 @@ class VoiceConfig(NamedTuple):
|
|||||||
noise_scale: float
|
noise_scale: float
|
||||||
noise_w: float
|
noise_w: float
|
||||||
num_speakers: int
|
num_speakers: int
|
||||||
|
sentence_delay: float
|
||||||
sentence_delay: float = 0
|
|
||||||
|
|
||||||
|
|
||||||
def translate_voice_config(x: Any) -> VoiceConfig:
|
def translate_voice_config(x: Any) -> VoiceConfig:
|
||||||
@ -56,8 +55,9 @@ def espeak_data_dir() -> str:
|
|||||||
return '' # TODO: get the correct path when using frozen builds
|
return '' # TODO: get the correct path when using frozen builds
|
||||||
|
|
||||||
|
|
||||||
def set_voice(config_path: str, model_path:str, length_scale_multiplier: float, sentence_delay: float) -> None:
|
def set_voice(config_path: str, model_path:str, length_scale_multiplier: float = 0, sentence_delay: float = 0.2) -> None:
|
||||||
piper.initialize(espeak_data_dir())
|
piper.initialize(espeak_data_dir())
|
||||||
cfg = load_voice_config(config_path)
|
cfg = load_voice_config(config_path)
|
||||||
cfg = cfg._replace(sentence_delay=sentence_delay, length_scale=cfg.length_scale * length_scale_multiplier)
|
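m = max(0.1, 1 + -1 * max(-1, min(length_scale_multiplier, 1))) # maps the input range [-1, 1] onto the multiplier range [2, 0.1]: -1 -> 2, 0 -> 1, and anything from 0.9 upwards is floored at 0.1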
|
||||||
|
cfg = cfg._replace(sentence_delay=sentence_delay, length_scale=cfg.length_scale * m)
|
||||||
piper.set_voice(cfg, model_path)
|
piper.set_voice(cfg, model_path)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user