mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-31 14:33:54 -04:00
Get the new piper backend basically working
This commit is contained in:
parent
7b19e19e29
commit
f259c80710
@ -77,7 +77,6 @@ initialize(PyObject *self, PyObject *args) {
|
||||
if (!normalize_data.func) return NULL;
|
||||
normalize_data.args = Py_BuildValue("(ss)", "NFD", "");
|
||||
if (!normalize_data.args) return NULL;
|
||||
Py_DECREF(unicodedata);
|
||||
}
|
||||
Py_RETURN_NONE;
|
||||
}
|
||||
@ -151,15 +150,15 @@ set_voice(PyObject *self, PyObject *args) {
|
||||
#define G(name, dest, conv) { \
|
||||
PyObject *sr = PyObject_GetAttrString(cfg, #name); \
|
||||
if (!sr) return NULL; \
|
||||
dest = conv(sr); \
|
||||
Py_CLEAR(sr); \
|
||||
dest = conv(sr); Py_CLEAR(sr); \
|
||||
if (PyErr_Occurred()) return NULL; \
|
||||
}
|
||||
G(sample_rate, current_sample_rate, PyLong_AsLong);
|
||||
G(num_speakers, current_num_speakers, PyLong_AsLong);
|
||||
G(length_scale, current_length_scale, PyFloat_AsDouble);
|
||||
G(noise_scale, current_noise_scale, PyFloat_AsDouble);
|
||||
G(noise_w, current_noise_w, PyFloat_AsDouble);
|
||||
G(sentence, current_sentence_delay, PyFloat_AsDouble);
|
||||
G(sentence_delay, current_sentence_delay, PyFloat_AsDouble);
|
||||
#undef G
|
||||
|
||||
PyObject *map = PyObject_GetAttrString(cfg, "phoneme_id_map");
|
||||
@ -168,14 +167,17 @@ set_voice(PyObject *self, PyObject *args) {
|
||||
PyObject *key, *value; Py_ssize_t pos = 0;
|
||||
while (PyDict_Next(map, &pos, &key, &value)) {
|
||||
unsigned long cp = PyLong_AsUnsignedLong(key);
|
||||
if (PyErr_Occurred()) break;
|
||||
std::vector<PhonemeId> ids;
|
||||
for (Py_ssize_t i = 0; i < PyList_GET_SIZE(value); i++) {
|
||||
unsigned long id = PyLong_AsUnsignedLong(PyList_GET_ITEM(value, i));
|
||||
if (PyErr_Occurred()) break;
|
||||
ids.push_back(id);
|
||||
}
|
||||
current_phoneme_id_map[cp] = ids;
|
||||
}
|
||||
Py_CLEAR(map);
|
||||
if (PyErr_Occurred()) return NULL;
|
||||
|
||||
// Load onnx model
|
||||
Py_BEGIN_ALLOW_THREADS;
|
||||
@ -369,8 +371,8 @@ next(PyObject *self, PyObject *args) {
|
||||
}
|
||||
if (data) {
|
||||
ans = Py_BuildValue(
|
||||
"NiiO", data, sizeof(float)*num_samples, num_samples, current_sample_rate,
|
||||
phoneme_id_queue.empty() ? Py_True : Py_False);
|
||||
"OiiO", data, num_samples, current_sample_rate, phoneme_id_queue.empty() ? Py_True : Py_False);
|
||||
Py_DECREF(data);
|
||||
}
|
||||
|
||||
// Clean up
|
||||
|
@ -1,7 +1,14 @@
|
||||
#!/usr/bin/env python
|
||||
# License: GPLv3 Copyright: 2025, Kovid Goyal <kovid at kovidgoyal.net>
|
||||
|
||||
import atexit
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from collections.abc import Callable
|
||||
from functools import partial
|
||||
from queue import Queue
|
||||
from threading import Lock, Thread
|
||||
from typing import Any, NamedTuple
|
||||
|
||||
import calibre_extensions.piper as piper
|
||||
@ -19,14 +26,14 @@ class VoiceConfig(NamedTuple):
|
||||
noise_scale: float
|
||||
noise_w: float
|
||||
num_speakers: int
|
||||
sentence_delay: float
|
||||
sentence_delay: float = 0
|
||||
|
||||
|
||||
def translate_voice_config(x: Any) -> VoiceConfig:
|
||||
phoneme_id_map: dict[int, list[int]] = {}
|
||||
for s, pid in x.get('phoneme_id_map', {}).items():
|
||||
for s, pids in x.get('phoneme_id_map', {}).items():
|
||||
if s:
|
||||
phoneme_id_map.setdefault(ord(s[0]), []).append(pid)
|
||||
phoneme_id_map.setdefault(ord(s[0]), []).extend(map(int, pids))
|
||||
inf = x.get('inference')
|
||||
|
||||
def g(d, prop, defval):
|
||||
@ -55,9 +62,159 @@ def espeak_data_dir() -> str:
|
||||
return '' # TODO: get the correct path when using frozen builds
|
||||
|
||||
|
||||
def set_voice(config_path: str, model_path:str, length_scale_multiplier: float = 0, sentence_delay: float = 0.2) -> None:
|
||||
piper.initialize(espeak_data_dir())
|
||||
def create_voice_config(config_path: str, length_scale_multiplier: float = 0, sentence_delay: float = 0.2) -> VoiceConfig:
    '''Load a voice config from disk and apply the speed multiplier and sentence delay.'''
    base = load_voice_config(config_path)
    # Clamp the multiplier to [-1, 1] and map it onto a length scale factor in
    # [2, 0.1]: -1 -> 2 (slower), 0 -> 1 (unchanged), 1 -> 0.1 (faster).
    clamped = min(max(length_scale_multiplier, -1), 1)
    factor = max(0.1, 1 - clamped)
    return base._replace(sentence_delay=sentence_delay, length_scale=base.length_scale * factor)
|
||||
|
||||
|
||||
def set_voice(config_path: str, model_path:str, length_scale_multiplier: float = 0, sentence_delay: float = 0.2) -> None:
    '''Configure the piper extension to use the voice described by config_path and model_path.'''
    voice = create_voice_config(config_path, length_scale_multiplier, sentence_delay)
    piper.set_voice(voice, model_path)
|
||||
|
||||
|
||||
class SynthesisResult(NamedTuple):
    '''One chunk of synthesized audio, delivered to the result callback.'''

    utterance_id: Any  # opaque identifier passed through from synthesize()
    bytes_per_sample: int  # 2 for 16-bit samples, 4 otherwise
    audio_data: bytes  # raw PCM audio data for this chunk
    num_samples: int  # number of samples contained in audio_data
    sample_rate: int  # samples per second of audio_data
    is_last: bool  # True for the final chunk of the utterance
|
||||
|
||||
|
||||
class Piper(Thread):
    '''Worker thread that drives the native piper TTS extension.

    All calls into the piper extension are serialized through a command queue
    and executed on this thread.  A monotonically increasing voice id is used
    to discard commands queued before the most recent set_voice()/cancel().
    '''

    def __init__(self):
        # Must run before any other piper call; espeak data is needed for
        # phonemization.
        piper.initialize(espeak_data_dir())
        Thread.__init__(self, name='PiperSynth', daemon=True)
        self.commands = Queue()  # items are (voice_id, callable-or-None)
        self.as_16bit_samples = True
        self._voice_id = 0  # guarded by self.lock
        self.lock = Lock()
        self.result_callback = lambda *a: None  # no-op until set_voice()
        self.start()

    @property
    def voice_id(self) -> int:
        # Current voice generation, read under the lock.
        with self.lock:
            ans = self._voice_id
        return ans

    def increment_voice_id(self) -> int:
        # Bump the voice generation, invalidating all previously queued
        # commands, and return the new value.
        with self.lock:
            self._voice_id += 1
            ans = self._voice_id
        return ans

    def run(self):
        # Worker loop: execute queued commands whose voice id is still
        # current; a None command is the shutdown sentinel.
        while True:
            voice_id, cmd = self.commands.get(True)
            if cmd is None:
                break
            if voice_id != self.voice_id:
                continue  # stale command from before a set_voice()/cancel()
            try:
                cmd()
            except Exception as e:
                import traceback
                # Failures are reported through the same callback as results.
                self.result_callback(None, e, traceback.format_exc())

    def shutdown(self):
        # Invalidate pending work, wake the worker with the sentinel, then
        # wait for the thread to exit.
        vid = self.increment_voice_id()
        self.commands.put((vid, None))
        self.join()

    def set_voice(
        self, result_callback: Callable[[SynthesisResult, Exception|None, str|None], None],
        config_path: str, model_path:str, length_scale_multiplier: float = 0, sentence_delay: float = 0.2,
        as_16bit_samples: bool = True,
    ) -> int:
        '''Queue a voice change and return the new voice's sample rate.

        result_callback is invoked on the worker thread with
        (SynthesisResult-or-None, exception-or-None, traceback-text-or-None).
        '''
        vid = self.increment_voice_id()
        self.result_callback = result_callback
        self.as_16bit_samples = as_16bit_samples
        # The config is parsed here so the sample rate can be returned
        # synchronously; loading the model happens on the worker thread.
        cfg = create_voice_config(config_path, length_scale_multiplier, sentence_delay)
        self.commands.put((vid, partial(self._set_voice, cfg, model_path)))
        return cfg.sample_rate

    def _set_voice(self, cfg, model_path):
        # Runs on the worker thread.
        piper.set_voice(cfg, model_path)

    def cancel(self) -> None:
        # Invalidate all queued commands and stop delivering results.
        self.increment_voice_id()
        self.result_callback = lambda *a: None

    def synthesize(self, utterance_id: Any, text: str) -> None:
        '''Queue text for synthesis; results arrive via the result callback.'''
        vid = self.voice_id
        self.commands.put((vid, partial(self._synthesize, vid, utterance_id, text)))

    def _synthesize(self, voice_id: int, utterance_id: Any, text: str) -> None:
        # Runs on the worker thread: stream audio chunks from the extension
        # until the utterance completes or the voice id goes stale.
        piper.start(text)
        bytes_per_sample = 2 if self.as_16bit_samples else 4
        while True:
            audio_data, num_samples, sample_rate, is_last = piper.next(self.as_16bit_samples)
            if self.voice_id == voice_id:
                self.result_callback(SynthesisResult(utterance_id, bytes_per_sample, audio_data, num_samples, sample_rate, is_last), None, None)
            else:
                break  # voice changed or cancelled: drop remaining chunks
            if is_last:
                break
|
||||
|
||||
|
||||
_global_piper_instance = None


def global_piper_instance() -> Piper:
    '''Return the process-wide Piper worker, creating it on first use.'''
    global _global_piper_instance
    if _global_piper_instance is None:
        instance = Piper()
        # Ensure the worker thread is torn down cleanly at interpreter exit.
        atexit.register(instance.shutdown)
        _global_piper_instance = instance
    return _global_piper_instance
|
||||
|
||||
|
||||
def play_wav_data(wav_data: bytes):
    '''Play WAV audio held in memory via Qt, blocking until playback finishes.'''
    from qt.core import QAudioOutput, QBuffer, QByteArray, QCoreApplication, QIODevice, QMediaPlayer, QUrl
    app = QCoreApplication([])
    player = QMediaPlayer()
    audio_out = QAudioOutput(player)
    player.setAudioOutput(audio_out)
    buf = QBuffer()
    buf.setData(QByteArray(wav_data))
    buf.open(QIODevice.OpenModeFlag.ReadOnly)
    # The URL only supplies a .wav name hint so Qt picks the right decoder.
    player.setSourceDevice(buf, QUrl.fromLocalFile('piper.wav'))
    player.mediaStatusChanged.connect(
        lambda status: app.quit() if status == QMediaPlayer.MediaStatus.EndOfMedia else print(player.playbackState(), status)
    )
    player.errorOccurred.connect(lambda e, s: (print(e, s, file=sys.stderr), app.quit()))
    player.play()
    app.exec()
|
||||
|
||||
|
||||
def develop():
    '''Manual test: synthesize a short text with the global Piper instance and
    play back the collected audio.'''
    from calibre.gui2.tts.piper import piper_cache_dir
    p = global_piper_instance()
    model_path = os.path.join(piper_cache_dir(), 'en_US-libritts-high.onnx')
    q = Queue()

    def synthesized(*args):
        q.put(args)

    sample_rate = p.set_voice(synthesized, model_path+'.json', model_path, sentence_delay=0.3)
    p.synthesize(1, 'Testing speech synthesis with piper. A second sentence.')
    all_data = []
    # Fix: the previous `while (args := q.get()):` tested the truthiness of a
    # non-empty tuple, which is always True — the condition could never end the
    # loop.  Make the real termination (the is_last flag) explicit.
    while True:
        sr, exc, tb = q.get()
        if exc is not None:
            print(tb, file=sys.stderr, flush=True)
            print(exc, file=sys.stderr, flush=True)
            raise SystemExit(1)
        all_data.append(sr.audio_data)
        print(f'Got {len(sr.audio_data)} bytes of audio data', flush=True)
        if sr.is_last:
            break
    from calibre_extensions.ffmpeg import wav_header_for_pcm_data
    pcm_data = b''.join(all_data)
    wav_data = wav_header_for_pcm_data(len(pcm_data), sample_rate) + pcm_data
    play_wav_data(wav_data)
|
||||
|
||||
|
||||
# Allow running this module directly for a quick manual synthesis test.
if __name__ == '__main__':
    develop()
|
||||
|
Loading…
x
Reference in New Issue
Block a user