Get the new piper backend basically working

Kovid Goyal 2025-07-29 10:52:42 +05:30
parent 7b19e19e29
commit f259c80710
2 changed files with 170 additions and 11 deletions
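
For orientation, the new Python side added below runs Piper synthesis on a dedicated worker thread: callers obtain the singleton via global_piper_instance(), register a voice and a result callback with set_voice(), and feed text to synthesize(); audio arrives on the callback as SynthesisResult chunks. The following is a minimal usage sketch assembled from the code in this commit (it mirrors develop() below); the voice file paths and the Queue plumbing are illustrative only, not part of the commit.

from queue import Queue

results: Queue = Queue()

def on_result(result, exc, tb):
    # Invoked on the PiperSynth thread: either a SynthesisResult, or an exception plus traceback text.
    results.put((result, exc, tb))

p = global_piper_instance()  # lazily creates the worker thread; shutdown is registered via atexit
sample_rate = p.set_voice(on_result, '/path/to/voice.onnx.json', '/path/to/voice.onnx', sentence_delay=0.3)
p.synthesize('utterance-1', 'Hello from the new piper backend.')
pcm = []
while True:
    result, exc, tb = results.get()
    if exc is not None:
        raise exc
    pcm.append(result.audio_data)  # 16-bit PCM samples at sample_rate (the default as_16bit_samples=True)
    if result.is_last:
        break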


@@ -77,7 +77,6 @@ initialize(PyObject *self, PyObject *args) {
         if (!normalize_data.func) return NULL;
         normalize_data.args = Py_BuildValue("(ss)", "NFD", "");
         if (!normalize_data.args) return NULL;
-        Py_DECREF(unicodedata);
     }
     Py_RETURN_NONE;
 }
@@ -151,15 +150,15 @@ set_voice(PyObject *self, PyObject *args) {
 #define G(name, dest, conv) { \
     PyObject *sr = PyObject_GetAttrString(cfg, #name); \
     if (!sr) return NULL; \
-    dest = conv(sr); \
-    Py_CLEAR(sr); \
+    dest = conv(sr); Py_CLEAR(sr); \
+    if (PyErr_Occurred()) return NULL; \
 }
     G(sample_rate, current_sample_rate, PyLong_AsLong);
     G(num_speakers, current_num_speakers, PyLong_AsLong);
     G(length_scale, current_length_scale, PyFloat_AsDouble);
     G(noise_scale, current_noise_scale, PyFloat_AsDouble);
     G(noise_w, current_noise_w, PyFloat_AsDouble);
-    G(sentence, current_sentence_delay, PyFloat_AsDouble);
+    G(sentence_delay, current_sentence_delay, PyFloat_AsDouble);
 #undef G
 
     PyObject *map = PyObject_GetAttrString(cfg, "phoneme_id_map");
@@ -168,14 +167,17 @@ set_voice(PyObject *self, PyObject *args) {
     PyObject *key, *value; Py_ssize_t pos = 0;
     while (PyDict_Next(map, &pos, &key, &value)) {
         unsigned long cp = PyLong_AsUnsignedLong(key);
+        if (PyErr_Occurred()) break;
         std::vector<PhonemeId> ids;
         for (Py_ssize_t i = 0; i < PyList_GET_SIZE(value); i++) {
             unsigned long id = PyLong_AsUnsignedLong(PyList_GET_ITEM(value, i));
+            if (PyErr_Occurred()) break;
             ids.push_back(id);
         }
         current_phoneme_id_map[cp] = ids;
     }
     Py_CLEAR(map);
+    if (PyErr_Occurred()) return NULL;
 
     // Load onnx model
     Py_BEGIN_ALLOW_THREADS;
@@ -369,8 +371,8 @@ next(PyObject *self, PyObject *args) {
     }
     if (data) {
         ans = Py_BuildValue(
-            "NiiO", data, sizeof(float)*num_samples, num_samples, current_sample_rate,
-            phoneme_id_queue.empty() ? Py_True : Py_False);
+            "OiiO", data, num_samples, current_sample_rate, phoneme_id_queue.empty() ? Py_True : Py_False);
+        Py_DECREF(data);
     }
     // Clean up


@@ -1,7 +1,14 @@
 #!/usr/bin/env python
 # License: GPLv3 Copyright: 2025, Kovid Goyal <kovid at kovidgoyal.net>
 
+import atexit
 import json
+import os
+import sys
+from collections.abc import Callable
+from functools import partial
+from queue import Queue
+from threading import Lock, Thread
 from typing import Any, NamedTuple
 
 import calibre_extensions.piper as piper
@@ -19,14 +26,14 @@ class VoiceConfig(NamedTuple):
     noise_scale: float
     noise_w: float
     num_speakers: int
-    sentence_delay: float
+    sentence_delay: float = 0
 
 
 def translate_voice_config(x: Any) -> VoiceConfig:
     phoneme_id_map: dict[int, list[int]] = {}
-    for s, pid in x.get('phoneme_id_map', {}).items():
+    for s, pids in x.get('phoneme_id_map', {}).items():
         if s:
-            phoneme_id_map.setdefault(ord(s[0]), []).append(pid)
+            phoneme_id_map.setdefault(ord(s[0]), []).extend(map(int, pids))
     inf = x.get('inference')
 
     def g(d, prop, defval):
@@ -55,9 +62,159 @@ def espeak_data_dir() -> str:
     return ''  # TODO: get the correct path when using frozen builds
 
 
-def set_voice(config_path: str, model_path:str, length_scale_multiplier: float = 0, sentence_delay: float = 0.2) -> None:
-    piper.initialize(espeak_data_dir())
+def create_voice_config(config_path: str, length_scale_multiplier: float = 0, sentence_delay: float = 0.2) -> VoiceConfig:
     cfg = load_voice_config(config_path)
     m = max(0.1, 1 + -1 * max(-1, min(length_scale_multiplier, 1)))  # maps -1 to 1 to 2 to 0.1
     cfg = cfg._replace(sentence_delay=sentence_delay, length_scale=cfg.length_scale * m)
+    return cfg
+
+
+def set_voice(config_path: str, model_path:str, length_scale_multiplier: float = 0, sentence_delay: float = 0.2) -> None:
+    cfg = create_voice_config(config_path, length_scale_multiplier, sentence_delay)
     piper.set_voice(cfg, model_path)
class SynthesisResult(NamedTuple):
utterance_id: Any
bytes_per_sample: int
audio_data: bytes
num_samples: int
sample_rate: int
is_last: bool
class Piper(Thread):
def __init__(self):
piper.initialize(espeak_data_dir())
Thread.__init__(self, name='PiperSynth', daemon=True)
self.commands = Queue()
self.as_16bit_samples = True
self._voice_id = 0
self.lock = Lock()
self.result_callback = lambda *a: None
self.start()
@property
def voice_id(self) -> int:
with self.lock:
ans = self._voice_id
return ans
def increment_voice_id(self) -> int:
with self.lock:
self._voice_id += 1
ans = self._voice_id
return ans
def run(self):
while True:
voice_id, cmd = self.commands.get(True)
if cmd is None:
break
if voice_id != self.voice_id:
continue
try:
cmd()
except Exception as e:
import traceback
self.result_callback(None, e, traceback.format_exc())
def shutdown(self):
vid = self.increment_voice_id()
self.commands.put((vid, None))
self.join()
def set_voice(
self, result_callback: Callable[[SynthesisResult, Exception|None, str|None], None],
config_path: str, model_path:str, length_scale_multiplier: float = 0, sentence_delay: float = 0.2,
as_16bit_samples: bool = True,
) -> int:
vid = self.increment_voice_id()
self.result_callback = result_callback
self.as_16bit_samples = as_16bit_samples
cfg = create_voice_config(config_path, length_scale_multiplier, sentence_delay)
self.commands.put((vid, partial(self._set_voice, cfg, model_path)))
return cfg.sample_rate
def _set_voice(self, cfg, model_path):
piper.set_voice(cfg, model_path)
def cancel(self) -> None:
self.increment_voice_id()
self.result_callback = lambda *a: None
def synthesize(self, utterance_id: Any, text: str) -> None:
vid = self.voice_id
self.commands.put((vid, partial(self._synthesize, vid, utterance_id, text)))
def _synthesize(self, voice_id: int, utterance_id: Any, text: str) -> None:
piper.start(text)
bytes_per_sample = 2 if self.as_16bit_samples else 4
while True:
audio_data, num_samples, sample_rate, is_last = piper.next(self.as_16bit_samples)
if self.voice_id == voice_id:
self.result_callback(SynthesisResult(utterance_id, bytes_per_sample, audio_data, num_samples, sample_rate, is_last), None, None)
else:
break
if is_last:
break
_global_piper_instance = None
def global_piper_instance() -> Piper:
global _global_piper_instance
if _global_piper_instance is None:
_global_piper_instance = Piper()
atexit.register(_global_piper_instance.shutdown)
return _global_piper_instance
def play_wav_data(wav_data: bytes):
from qt.core import QAudioOutput, QBuffer, QByteArray, QCoreApplication, QIODevice, QMediaPlayer, QUrl
app = QCoreApplication([])
m = QMediaPlayer()
ao = QAudioOutput(m)
m.setAudioOutput(ao)
qbuffer = QBuffer()
qbuffer.setData(QByteArray(wav_data))
qbuffer.open(QIODevice.OpenModeFlag.ReadOnly)
m.setSourceDevice(qbuffer, QUrl.fromLocalFile('piper.wav'))
m.mediaStatusChanged.connect(
lambda status: app.quit() if status == QMediaPlayer.MediaStatus.EndOfMedia else print(m.playbackState(), status)
)
m.errorOccurred.connect(lambda e, s: (print(e, s, file=sys.stderr), app.quit()))
m.play()
app.exec()
def develop():
from calibre.gui2.tts.piper import piper_cache_dir
p = global_piper_instance()
model_path = os.path.join(piper_cache_dir(), 'en_US-libritts-high.onnx')
q = Queue()
def synthesized(*args):
q.put(args)
sample_rate = p.set_voice(synthesized, model_path+'.json', model_path, sentence_delay=0.3)
p.synthesize(1, 'Testing speech synthesis with piper. A second sentence.')
all_data = []
while (args := q.get()):
sr, exc, tb = args
if exc is not None:
print(tb, file=sys.stderr, flush=True)
print(exc, file=sys.stderr, flush=True)
raise SystemExit(1)
all_data.append(sr.audio_data)
print(f'Got {len(sr.audio_data)} bytes of audio data', flush=True)
if sr.is_last:
break
from calibre_extensions.ffmpeg import wav_header_for_pcm_data
pcm_data = b''.join(all_data)
wav_data = wav_header_for_pcm_data(len(pcm_data), sample_rate) + pcm_data
play_wav_data(wav_data)
if __name__ == '__main__':
develop()
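
A note on the design of the Python wrapper: every queued command is tagged with the value of the voice_id counter at enqueue time, and run() silently drops commands whose tag has gone stale, which is what makes cancel() and repeated set_voice() calls cheap without interrupting the worker thread. Below is a stripped-down sketch of that generation-counter pattern; the names (Worker, submit) are hypothetical and not part of this commit.

from functools import partial
from queue import Queue
from threading import Lock, Thread

class Worker(Thread):

    def __init__(self):
        super().__init__(daemon=True)
        self.commands = Queue()
        self.lock = Lock()
        self._generation = 0
        self.start()

    def current_generation(self) -> int:
        with self.lock:
            return self._generation

    def cancel(self) -> None:
        # Bumping the counter invalidates everything already queued.
        with self.lock:
            self._generation += 1

    def submit(self, func, *args) -> None:
        # Tag the command with the generation that was current when it was queued.
        self.commands.put((self.current_generation(), partial(func, *args)))

    def run(self) -> None:
        while True:
            gen, cmd = self.commands.get()
            if cmd is None:
                break
            if gen != self.current_generation():
                continue  # stale: cancelled after it was queued
            cmd()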