From 46002017cdd6a739b7643ff7db4896681a67afe2 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Mon, 28 Jul 2025 11:43:41 +0530
Subject: [PATCH] Get data in PCM format by default

---
 src/calibre/utils/tts/piper.cpp | 39 +++++++++++++++++++++++++++------
 1 file changed, 32 insertions(+), 7 deletions(-)

diff --git a/src/calibre/utils/tts/piper.cpp b/src/calibre/utils/tts/piper.cpp
index f4ee41459b..2d2b49989c 100644
--- a/src/calibre/utils/tts/piper.cpp
+++ b/src/calibre/utils/tts/piper.cpp
@@ -13,6 +13,9 @@
 #include
 #include
 #include
+#include <algorithm>
+#include <limits>
+#include <cstdint>
 
 #define CLAUSE_INTONATION_FULL_STOP 0x00000000
 #define CLAUSE_INTONATION_COMMA 0x00001000
@@ -270,6 +273,8 @@ start(PyObject *self, PyObject *args) {
 
 static PyObject*
 next(PyObject *self, PyObject *args) {
+    int as_16bit_samples = 1;
+    if (!PyArg_ParseTuple(args, "|p", &as_16bit_samples)) return NULL;
     if (phoneme_id_queue.empty()) return Py_BuildValue("yiiO", "", 0, current_sample_rate, Py_True);
     std::vector<Ort::Value> output_tensors;
     std::vector<Ort::Value> input_tensors;
@@ -329,12 +334,32 @@ next(PyObject *self, PyObject *args) {
         return NULL;
     }
 
-    auto audio_shape =
-        output_tensors.front().GetTensorTypeAndShapeInfo().GetShape();
-    int num_samples = audio_shape[audio_shape.size() - 1];
+    int num_samples; const float *audio_tensor_data;
+    Py_BEGIN_ALLOW_THREADS;
+    auto audio_shape = output_tensors.front().GetTensorTypeAndShapeInfo().GetShape();
+    num_samples = audio_shape[audio_shape.size() - 1];
+    audio_tensor_data = output_tensors.front().GetTensorData<float>();
+    Py_END_ALLOW_THREADS;
 
-    const float *audio_tensor_data = output_tensors.front().GetTensorData<float>();
-    PyObject *ans = Py_BuildValue("y#iiO", audio_tensor_data, sizeof(float)*num_samples, num_samples, current_sample_rate, phoneme_id_queue.empty() ? Py_True : Py_False);
+    PyObject *ans = NULL;
+    if (as_16bit_samples) {
+        PyObject *data = PyBytes_FromStringAndSize(NULL, sizeof(int16_t) * num_samples);
+        if (data) {
+            int16_t *x = (int16_t*)PyBytes_AS_STRING(data);
+            Py_BEGIN_ALLOW_THREADS;
+            for (int i = 0; i < num_samples; i++) {
+                x[i] = std::max(-1.f, std::min(audio_tensor_data[i], 1.f)) * std::numeric_limits<int16_t>::max();
+            }
+            Py_END_ALLOW_THREADS;
+            ans = Py_BuildValue(
+                "NiiO", data, num_samples, current_sample_rate,
+                phoneme_id_queue.empty() ? Py_True : Py_False);
+        }
+    } else {
+        ans = Py_BuildValue(
+            "y#iiO", audio_tensor_data, sizeof(float)*num_samples, num_samples, current_sample_rate,
+            phoneme_id_queue.empty() ? Py_True : Py_False);
+    }
 
     // Clean up
     for (std::size_t i = 0; i < output_tensors.size(); i++) {
@@ -358,8 +383,8 @@ static PyMethodDef methods[] = {
     {"start", (PyCFunction)start, METH_VARARGS,
      "start(text) -> Start synthesizing the specified text, call next() repeatedly to get the audiodata."
     },
-    {"next", (PyCFunction)next, METH_NOARGS,
-     "next() -> Return the next chunk of audio data (audio_data, num_samples, sample_rate, is_last). Here audio_data is a bytes object consisting of an array of floats in native endianness."
+    {"next", (PyCFunction)next, METH_VARARGS,
+     "next(as_16bit_samples=True) -> Return the next chunk of audio data (audio_data, num_samples, sample_rate, is_last). Here audio_data is a bytes object consisting of either native 16bit integer audio samples or native floats in the range [-1, 1]."
    },
    {"set_espeak_voice_by_name", (PyCFunction)set_espeak_voice_by_name, METH_O,
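
The core of the change above is the clamp-and-scale conversion from the model's float samples to 16-bit PCM. Below is a minimal standalone C++ sketch of that conversion only; the helper name to_pcm16 and the sample data are illustrative and are not part of piper.cpp.

// Sketch of the float -> 16-bit PCM conversion performed in next() when
// as_16bit_samples is true. Names here are illustrative, not from piper.cpp.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <limits>
#include <vector>

// Clamp each float sample to [-1, 1], then scale it to the int16_t range,
// mirroring the loop added in the patch.
static std::vector<int16_t> to_pcm16(const float *samples, std::size_t num_samples) {
    std::vector<int16_t> out(num_samples);
    for (std::size_t i = 0; i < num_samples; i++) {
        float clamped = std::max(-1.f, std::min(samples[i], 1.f));
        out[i] = static_cast<int16_t>(clamped * std::numeric_limits<int16_t>::max());
    }
    return out;
}

int main() {
    // Synthetic samples, including out-of-range values that get clamped.
    const float samples[] = {0.f, 0.5f, -0.5f, 1.5f, -2.f};
    for (int16_t s : to_pcm16(samples, 5)) std::printf("%d\n", s);
    return 0;
}

In the patch itself the equivalent loop runs between Py_BEGIN_ALLOW_THREADS and Py_END_ALLOW_THREADS, so the GIL stays released while each audio chunk is converted.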