Get the new piper backend basically working

Kovid Goyal 2025-07-29 10:52:42 +05:30
parent 7b19e19e29
commit f259c80710
2 changed files with 170 additions and 11 deletions
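
For orientation, the new Python side added below runs Piper synthesis on a dedicated worker thread: callers obtain the singleton via global_piper_instance(), register a voice and a result callback with set_voice(), and feed text to synthesize(); audio arrives on the callback as SynthesisResult chunks. The following is a minimal usage sketch assembled from the code in this commit (it mirrors develop() below); the voice file paths and the Queue plumbing are illustrative only, not part of the commit.

from queue import Queue

results: Queue = Queue()

def on_result(result, exc, tb):
    # Invoked on the PiperSynth thread: either a SynthesisResult, or an exception plus traceback text.
    results.put((result, exc, tb))

p = global_piper_instance()  # lazily creates the worker thread; shutdown is registered via atexit
sample_rate = p.set_voice(on_result, '/path/to/voice.onnx.json', '/path/to/voice.onnx', sentence_delay=0.3)
p.synthesize('utterance-1', 'Hello from the new piper backend.')
pcm = []
while True:
    result, exc, tb = results.get()
    if exc is not None:
        raise exc
    pcm.append(result.audio_data)  # 16-bit PCM samples at sample_rate (the default as_16bit_samples=True)
    if result.is_last:
        break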


@@ -77,7 +77,6 @@ initialize(PyObject *self, PyObject *args) {
         if (!normalize_data.func) return NULL;
         normalize_data.args = Py_BuildValue("(ss)", "NFD", "");
         if (!normalize_data.args) return NULL;
-        Py_DECREF(unicodedata);
     }
     Py_RETURN_NONE;
 }
@@ -151,15 +150,15 @@ set_voice(PyObject *self, PyObject *args) {
 #define G(name, dest, conv) { \
     PyObject *sr = PyObject_GetAttrString(cfg, #name); \
     if (!sr) return NULL; \
-    dest = conv(sr); \
-    Py_CLEAR(sr); \
+    dest = conv(sr); Py_CLEAR(sr); \
+    if (PyErr_Occurred()) return NULL; \
 }
     G(sample_rate, current_sample_rate, PyLong_AsLong);
     G(num_speakers, current_num_speakers, PyLong_AsLong);
     G(length_scale, current_length_scale, PyFloat_AsDouble);
     G(noise_scale, current_noise_scale, PyFloat_AsDouble);
     G(noise_w, current_noise_w, PyFloat_AsDouble);
-    G(sentence, current_sentence_delay, PyFloat_AsDouble);
+    G(sentence_delay, current_sentence_delay, PyFloat_AsDouble);
 #undef G
 
     PyObject *map = PyObject_GetAttrString(cfg, "phoneme_id_map");
@@ -168,14 +167,17 @@ set_voice(PyObject *self, PyObject *args) {
     PyObject *key, *value; Py_ssize_t pos = 0;
     while (PyDict_Next(map, &pos, &key, &value)) {
         unsigned long cp = PyLong_AsUnsignedLong(key);
+        if (PyErr_Occurred()) break;
         std::vector<PhonemeId> ids;
         for (Py_ssize_t i = 0; i < PyList_GET_SIZE(value); i++) {
             unsigned long id = PyLong_AsUnsignedLong(PyList_GET_ITEM(value, i));
+            if (PyErr_Occurred()) break;
             ids.push_back(id);
         }
         current_phoneme_id_map[cp] = ids;
     }
     Py_CLEAR(map);
+    if (PyErr_Occurred()) return NULL;
 
     // Load onnx model
     Py_BEGIN_ALLOW_THREADS;
@@ -369,8 +371,8 @@ next(PyObject *self, PyObject *args) {
     }
     if (data) {
         ans = Py_BuildValue(
-            "NiiO", data, sizeof(float)*num_samples, num_samples, current_sample_rate,
-            phoneme_id_queue.empty() ? Py_True : Py_False);
+            "OiiO", data, num_samples, current_sample_rate, phoneme_id_queue.empty() ? Py_True : Py_False);
+        Py_DECREF(data);
     }
     // Clean up


@@ -1,7 +1,14 @@
 #!/usr/bin/env python
 # License: GPLv3 Copyright: 2025, Kovid Goyal <kovid at kovidgoyal.net>
 
+import atexit
 import json
+import os
+import sys
+from collections.abc import Callable
+from functools import partial
+from queue import Queue
+from threading import Lock, Thread
 from typing import Any, NamedTuple
 
 import calibre_extensions.piper as piper
@@ -19,14 +26,14 @@ class VoiceConfig(NamedTuple):
     noise_scale: float
     noise_w: float
     num_speakers: int
-    sentence_delay: float
+    sentence_delay: float = 0
 
 
 def translate_voice_config(x: Any) -> VoiceConfig:
     phoneme_id_map: dict[int, list[int]] = {}
-    for s, pid in x.get('phoneme_id_map', {}).items():
+    for s, pids in x.get('phoneme_id_map', {}).items():
         if s:
-            phoneme_id_map.setdefault(ord(s[0]), []).append(pid)
+            phoneme_id_map.setdefault(ord(s[0]), []).extend(map(int, pids))
     inf = x.get('inference')
 
     def g(d, prop, defval):
@@ -55,9 +62,159 @@ def espeak_data_dir() -> str:
     return ''  # TODO: get the correct path when using frozen builds
 
 
-def set_voice(config_path: str, model_path:str, length_scale_multiplier: float = 0, sentence_delay: float = 0.2) -> None:
-    piper.initialize(espeak_data_dir())
+def create_voice_config(config_path: str, length_scale_multiplier: float = 0, sentence_delay: float = 0.2) -> VoiceConfig:
     cfg = load_voice_config(config_path)
     m = max(0.1, 1 + -1 * max(-1, min(length_scale_multiplier, 1)))  # maps -1 to 1 to 2 to 0.1
     cfg = cfg._replace(sentence_delay=sentence_delay, length_scale=cfg.length_scale * m)
+    return cfg
+
+
+def set_voice(config_path: str, model_path:str, length_scale_multiplier: float = 0, sentence_delay: float = 0.2) -> None:
+    cfg = create_voice_config(config_path, length_scale_multiplier, sentence_delay)
     piper.set_voice(cfg, model_path)
class SynthesisResult(NamedTuple):
utterance_id: Any
bytes_per_sample: int
audio_data: bytes
num_samples: int
sample_rate: int
is_last: bool
class Piper(Thread):
def __init__(self):
piper.initialize(espeak_data_dir())
Thread.__init__(self, name='PiperSynth', daemon=True)
self.commands = Queue()
self.as_16bit_samples = True
self._voice_id = 0
self.lock = Lock()
self.result_callback = lambda *a: None
self.start()
@property
def voice_id(self) -> int:
with self.lock:
ans = self._voice_id
return ans
def increment_voice_id(self) -> int:
with self.lock:
self._voice_id += 1
ans = self._voice_id
return ans
def run(self):
while True:
voice_id, cmd = self.commands.get(True)
if cmd is None:
break
if voice_id != self.voice_id:
continue
try:
cmd()
except Exception as e:
import traceback
self.result_callback(None, e, traceback.format_exc())
def shutdown(self):
vid = self.increment_voice_id()
self.commands.put((vid, None))
self.join()
def set_voice(
self, result_callback: Callable[[SynthesisResult, Exception|None, str|None], None],
config_path: str, model_path:str, length_scale_multiplier: float = 0, sentence_delay: float = 0.2,
as_16bit_samples: bool = True,
) -> int:
vid = self.increment_voice_id()
self.result_callback = result_callback
self.as_16bit_samples = as_16bit_samples
cfg = create_voice_config(config_path, length_scale_multiplier, sentence_delay)
self.commands.put((vid, partial(self._set_voice, cfg, model_path)))
return cfg.sample_rate
def _set_voice(self, cfg, model_path):
piper.set_voice(cfg, model_path)
def cancel(self) -> None:
self.increment_voice_id()
self.result_callback = lambda *a: None
def synthesize(self, utterance_id: Any, text: str) -> None:
vid = self.voice_id
self.commands.put((vid, partial(self._synthesize, vid, utterance_id, text)))
def _synthesize(self, voice_id: int, utterance_id: Any, text: str) -> None:
piper.start(text)
bytes_per_sample = 2 if self.as_16bit_samples else 4
while True:
audio_data, num_samples, sample_rate, is_last = piper.next(self.as_16bit_samples)
if self.voice_id == voice_id:
self.result_callback(SynthesisResult(utterance_id, bytes_per_sample, audio_data, num_samples, sample_rate, is_last), None, None)
else:
break
if is_last:
break
_global_piper_instance = None
def global_piper_instance() -> Piper:
global _global_piper_instance
if _global_piper_instance is None:
_global_piper_instance = Piper()
atexit.register(_global_piper_instance.shutdown)
return _global_piper_instance
def play_wav_data(wav_data: bytes):
from qt.core import QAudioOutput, QBuffer, QByteArray, QCoreApplication, QIODevice, QMediaPlayer, QUrl
app = QCoreApplication([])
m = QMediaPlayer()
ao = QAudioOutput(m)
m.setAudioOutput(ao)
qbuffer = QBuffer()
qbuffer.setData(QByteArray(wav_data))
qbuffer.open(QIODevice.OpenModeFlag.ReadOnly)
m.setSourceDevice(qbuffer, QUrl.fromLocalFile('piper.wav'))
m.mediaStatusChanged.connect(
lambda status: app.quit() if status == QMediaPlayer.MediaStatus.EndOfMedia else print(m.playbackState(), status)
)
m.errorOccurred.connect(lambda e, s: (print(e, s, file=sys.stderr), app.quit()))
m.play()
app.exec()
def develop():
from calibre.gui2.tts.piper import piper_cache_dir
p = global_piper_instance()
model_path = os.path.join(piper_cache_dir(), 'en_US-libritts-high.onnx')
q = Queue()
def synthesized(*args):
q.put(args)
sample_rate = p.set_voice(synthesized, model_path+'.json', model_path, sentence_delay=0.3)
p.synthesize(1, 'Testing speech synthesis with piper. A second sentence.')
all_data = []
while (args := q.get()):
sr, exc, tb = args
if exc is not None:
print(tb, file=sys.stderr, flush=True)
print(exc, file=sys.stderr, flush=True)
raise SystemExit(1)
all_data.append(sr.audio_data)
print(f'Got {len(sr.audio_data)} bytes of audio data', flush=True)
if sr.is_last:
break
from calibre_extensions.ffmpeg import wav_header_for_pcm_data
pcm_data = b''.join(all_data)
wav_data = wav_header_for_pcm_data(len(pcm_data), sample_rate) + pcm_data
play_wav_data(wav_data)
if __name__ == '__main__':
develop()
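
A note on the design of the Python wrapper: every queued command is tagged with the value of the voice_id counter at enqueue time, and run() silently drops commands whose tag has gone stale, which is what makes cancel() and repeated set_voice() calls cheap without interrupting the worker thread. Below is a stripped-down sketch of that generation-counter pattern; the names (Worker, submit) are hypothetical and not part of this commit.

from functools import partial
from queue import Queue
from threading import Lock, Thread

class Worker(Thread):

    def __init__(self):
        super().__init__(daemon=True)
        self.commands = Queue()
        self.lock = Lock()
        self._generation = 0
        self.start()

    def current_generation(self) -> int:
        with self.lock:
            return self._generation

    def cancel(self) -> None:
        # Bumping the counter invalidates everything already queued.
        with self.lock:
            self._generation += 1

    def submit(self, func, *args) -> None:
        # Tag the command with the generation that was current when it was queued.
        self.commands.put((self.current_generation(), partial(func, *args)))

    def run(self) -> None:
        while True:
            gen, cmd = self.commands.get()
            if cmd is None:
                break
            if gen != self.current_generation():
                continue  # stale: cancelled after it was queued
            cmd()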