Patch summary (free text ahead of the first "diff --git" header; patch tools skip leading text).

src/calibre/utils/tts/piper.cpp
  * initialize(): drops the Py_DECREF(unicodedata) call.
  * set_voice(): the G() helper macro now returns NULL when the conversion left a
    Python error set, and the last use reads the "sentence_delay" attribute instead
    of "sentence". The phoneme_id_map loop breaks out on a failed integer
    conversion and the function returns NULL once the map has been released.
  * next(): the Py_BuildValue format changes from "NiiO" to "OiiO", the byte-count
    argument is removed so four arguments match four format units, and the
    reference to data is dropped explicitly with Py_DECREF.

src/calibre/utils/tts/piper.py
  * VoiceConfig.sentence_delay gains a default of 0.
  * translate_voice_config() extends each map entry with int() of every id in the
    value instead of appending the value itself.
  * create_voice_config() is split out of set_voice(); the module-level set_voice()
    no longer calls piper.initialize() (Piper.__init__ does).
  * Adds SynthesisResult, the Piper worker thread, global_piper_instance(),
    play_wav_data() and a develop() entry point run under __main__.

NOTE(review): line breaks and indentation below were reconstructed; the hunk line
counts match every @@ header, but the leading whitespace of context lines and the
spacing before inline comments should be confirmed against the target tree before
applying (or apply with whitespace tolerance).
NOTE(review): "std::vector ids;" in the third piper.cpp hunk has no template
argument list; it looks like one was lost in extraction - TODO confirm upstream.

diff --git a/src/calibre/utils/tts/piper.cpp b/src/calibre/utils/tts/piper.cpp
index 6a03bdadde..f56045297e 100644
--- a/src/calibre/utils/tts/piper.cpp
+++ b/src/calibre/utils/tts/piper.cpp
@@ -77,7 +77,6 @@ initialize(PyObject *self, PyObject *args) {
         if (!normalize_data.func) return NULL;
         normalize_data.args = Py_BuildValue("(ss)", "NFD", "");
         if (!normalize_data.args) return NULL;
-        Py_DECREF(unicodedata);
     }
     Py_RETURN_NONE;
 }
@@ -151,15 +150,15 @@ set_voice(PyObject *self, PyObject *args) {
 #define G(name, dest, conv) { \
     PyObject *sr = PyObject_GetAttrString(cfg, #name); \
     if (!sr) return NULL; \
-    dest = conv(sr); \
-    Py_CLEAR(sr); \
+    dest = conv(sr); Py_CLEAR(sr); \
+    if (PyErr_Occurred()) return NULL; \
 }
     G(sample_rate, current_sample_rate, PyLong_AsLong);
     G(num_speakers, current_num_speakers, PyLong_AsLong);
     G(length_scale, current_length_scale, PyFloat_AsDouble);
     G(noise_scale, current_noise_scale, PyFloat_AsDouble);
     G(noise_w, current_noise_w, PyFloat_AsDouble);
-    G(sentence, current_sentence_delay, PyFloat_AsDouble);
+    G(sentence_delay, current_sentence_delay, PyFloat_AsDouble);
 #undef G
 
     PyObject *map = PyObject_GetAttrString(cfg, "phoneme_id_map");
@@ -168,14 +167,17 @@ set_voice(PyObject *self, PyObject *args) {
     PyObject *key, *value; Py_ssize_t pos = 0;
     while (PyDict_Next(map, &pos, &key, &value)) {
         unsigned long cp = PyLong_AsUnsignedLong(key);
+        if (PyErr_Occurred()) break;
         std::vector ids;
         for (Py_ssize_t i = 0; i < PyList_GET_SIZE(value); i++) {
             unsigned long id = PyLong_AsUnsignedLong(PyList_GET_ITEM(value, i));
+            if (PyErr_Occurred()) break;
             ids.push_back(id);
         }
         current_phoneme_id_map[cp] = ids;
     }
     Py_CLEAR(map);
+    if (PyErr_Occurred()) return NULL;
 
     // Load onnx model
     Py_BEGIN_ALLOW_THREADS;
@@ -369,8 +371,8 @@ next(PyObject *self, PyObject *args) {
     }
     if (data) {
         ans = Py_BuildValue(
-            "NiiO", data, sizeof(float)*num_samples, num_samples, current_sample_rate,
-            phoneme_id_queue.empty() ? Py_True : Py_False);
+            "OiiO", data, num_samples, current_sample_rate, phoneme_id_queue.empty() ? Py_True : Py_False);
+        Py_DECREF(data);
     }
 
     // Clean up
diff --git a/src/calibre/utils/tts/piper.py b/src/calibre/utils/tts/piper.py
index 35a97eb4be..aff3cd9c7e 100644
--- a/src/calibre/utils/tts/piper.py
+++ b/src/calibre/utils/tts/piper.py
@@ -1,7 +1,14 @@
 #!/usr/bin/env python
 # License: GPLv3 Copyright: 2025, Kovid Goyal
 
+import atexit
 import json
+import os
+import sys
+from collections.abc import Callable
+from functools import partial
+from queue import Queue
+from threading import Lock, Thread
 from typing import Any, NamedTuple
 
 import calibre_extensions.piper as piper
@@ -19,14 +26,14 @@ class VoiceConfig(NamedTuple):
     noise_scale: float
     noise_w: float
     num_speakers: int
-    sentence_delay: float
+    sentence_delay: float = 0
 
 
 def translate_voice_config(x: Any) -> VoiceConfig:
     phoneme_id_map: dict[int, list[int]] = {}
-    for s, pid in x.get('phoneme_id_map', {}).items():
+    for s, pids in x.get('phoneme_id_map', {}).items():
         if s:
-            phoneme_id_map.setdefault(ord(s[0]), []).append(pid)
+            phoneme_id_map.setdefault(ord(s[0]), []).extend(map(int, pids))
     inf = x.get('inference')
 
     def g(d, prop, defval):
@@ -55,9 +62,159 @@ def espeak_data_dir() -> str:
     return '' # TODO: get the correct path when using frozen builds
 
 
-def set_voice(config_path: str, model_path:str, length_scale_multiplier: float = 0, sentence_delay: float = 0.2) -> None:
-    piper.initialize(espeak_data_dir())
+def create_voice_config(config_path: str, length_scale_multiplier: float = 0, sentence_delay: float = 0.2) -> VoiceConfig:
     cfg = load_voice_config(config_path)
     m = max(0.1, 1 + -1 * max(-1, min(length_scale_multiplier, 1))) # maps -1 to 1 to 2 to 0.1
     cfg = cfg._replace(sentence_delay=sentence_delay, length_scale=cfg.length_scale * m)
+    return cfg
+
+
+def set_voice(config_path: str, model_path:str, length_scale_multiplier: float = 0, sentence_delay: float = 0.2) -> None:
+    cfg = create_voice_config(config_path, length_scale_multiplier, sentence_delay)
     piper.set_voice(cfg, model_path)
+
+
+class SynthesisResult(NamedTuple):
+    utterance_id: Any
+    bytes_per_sample: int
+    audio_data: bytes
+    num_samples: int
+    sample_rate: int
+    is_last: bool
+
+
+class Piper(Thread):
+
+    def __init__(self):
+        piper.initialize(espeak_data_dir())
+        Thread.__init__(self, name='PiperSynth', daemon=True)
+        self.commands = Queue()
+        self.as_16bit_samples = True
+        self._voice_id = 0
+        self.lock = Lock()
+        self.result_callback = lambda *a: None
+        self.start()
+
+    @property
+    def voice_id(self) -> int:
+        with self.lock:
+            ans = self._voice_id
+        return ans
+
+    def increment_voice_id(self) -> int:
+        with self.lock:
+            self._voice_id += 1
+            ans = self._voice_id
+        return ans
+
+    def run(self):
+        while True:
+            voice_id, cmd = self.commands.get(True)
+            if cmd is None:
+                break
+            if voice_id != self.voice_id:
+                continue
+            try:
+                cmd()
+            except Exception as e:
+                import traceback
+                self.result_callback(None, e, traceback.format_exc())
+
+    def shutdown(self):
+        vid = self.increment_voice_id()
+        self.commands.put((vid, None))
+        self.join()
+
+    def set_voice(
+        self, result_callback: Callable[[SynthesisResult, Exception|None, str|None], None],
+        config_path: str, model_path:str, length_scale_multiplier: float = 0, sentence_delay: float = 0.2,
+        as_16bit_samples: bool = True,
+    ) -> int:
+        vid = self.increment_voice_id()
+        self.result_callback = result_callback
+        self.as_16bit_samples = as_16bit_samples
+        cfg = create_voice_config(config_path, length_scale_multiplier, sentence_delay)
+        self.commands.put((vid, partial(self._set_voice, cfg, model_path)))
+        return cfg.sample_rate
+
+    def _set_voice(self, cfg, model_path):
+        piper.set_voice(cfg, model_path)
+
+    def cancel(self) -> None:
+        self.increment_voice_id()
+        self.result_callback = lambda *a: None
+
+    def synthesize(self, utterance_id: Any, text: str) -> None:
+        vid = self.voice_id
+        self.commands.put((vid, partial(self._synthesize, vid, utterance_id, text)))
+
+    def _synthesize(self, voice_id: int, utterance_id: Any, text: str) -> None:
+        piper.start(text)
+        bytes_per_sample = 2 if self.as_16bit_samples else 4
+        while True:
+            audio_data, num_samples, sample_rate, is_last = piper.next(self.as_16bit_samples)
+            if self.voice_id == voice_id:
+                self.result_callback(SynthesisResult(utterance_id, bytes_per_sample, audio_data, num_samples, sample_rate, is_last), None, None)
+            else:
+                break
+            if is_last:
+                break
+
+
+_global_piper_instance = None
+
+
+def global_piper_instance() -> Piper:
+    global _global_piper_instance
+    if _global_piper_instance is None:
+        _global_piper_instance = Piper()
+        atexit.register(_global_piper_instance.shutdown)
+    return _global_piper_instance
+
+
+def play_wav_data(wav_data: bytes):
+    from qt.core import QAudioOutput, QBuffer, QByteArray, QCoreApplication, QIODevice, QMediaPlayer, QUrl
+    app = QCoreApplication([])
+    m = QMediaPlayer()
+    ao = QAudioOutput(m)
+    m.setAudioOutput(ao)
+    qbuffer = QBuffer()
+    qbuffer.setData(QByteArray(wav_data))
+    qbuffer.open(QIODevice.OpenModeFlag.ReadOnly)
+    m.setSourceDevice(qbuffer, QUrl.fromLocalFile('piper.wav'))
+    m.mediaStatusChanged.connect(
+        lambda status: app.quit() if status == QMediaPlayer.MediaStatus.EndOfMedia else print(m.playbackState(), status)
+    )
+    m.errorOccurred.connect(lambda e, s: (print(e, s, file=sys.stderr), app.quit()))
+    m.play()
+    app.exec()
+
+
+def develop():
+    from calibre.gui2.tts.piper import piper_cache_dir
+    p = global_piper_instance()
+    model_path = os.path.join(piper_cache_dir(), 'en_US-libritts-high.onnx')
+    q = Queue()
+    def synthesized(*args):
+        q.put(args)
+    sample_rate = p.set_voice(synthesized, model_path+'.json', model_path, sentence_delay=0.3)
+    p.synthesize(1, 'Testing speech synthesis with piper. A second sentence.')
+    all_data = []
+    while (args := q.get()):
+        sr, exc, tb = args
+        if exc is not None:
+            print(tb, file=sys.stderr, flush=True)
+            print(exc, file=sys.stderr, flush=True)
+            raise SystemExit(1)
+        all_data.append(sr.audio_data)
+        print(f'Got {len(sr.audio_data)} bytes of audio data', flush=True)
+        if sr.is_last:
+            break
+    from calibre_extensions.ffmpeg import wav_header_for_pcm_data
+    pcm_data = b''.join(all_data)
+    wav_data = wav_header_for_pcm_data(len(pcm_data), sample_rate) + pcm_data
+    play_wav_data(wav_data)
+
+
+if __name__ == '__main__':
+    develop()