mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-31 14:33:54 -04:00
Get the new piper backend basically working
This commit is contained in:
parent
7b19e19e29
commit
f259c80710
@ -77,7 +77,6 @@ initialize(PyObject *self, PyObject *args) {
|
||||
if (!normalize_data.func) return NULL;
|
||||
normalize_data.args = Py_BuildValue("(ss)", "NFD", "");
|
||||
if (!normalize_data.args) return NULL;
|
||||
Py_DECREF(unicodedata);
|
||||
}
|
||||
Py_RETURN_NONE;
|
||||
}
|
||||
@ -151,15 +150,15 @@ set_voice(PyObject *self, PyObject *args) {
|
||||
#define G(name, dest, conv) { \
|
||||
PyObject *sr = PyObject_GetAttrString(cfg, #name); \
|
||||
if (!sr) return NULL; \
|
||||
dest = conv(sr); \
|
||||
Py_CLEAR(sr); \
|
||||
dest = conv(sr); Py_CLEAR(sr); \
|
||||
if (PyErr_Occurred()) return NULL; \
|
||||
}
|
||||
G(sample_rate, current_sample_rate, PyLong_AsLong);
|
||||
G(num_speakers, current_num_speakers, PyLong_AsLong);
|
||||
G(length_scale, current_length_scale, PyFloat_AsDouble);
|
||||
G(noise_scale, current_noise_scale, PyFloat_AsDouble);
|
||||
G(noise_w, current_noise_w, PyFloat_AsDouble);
|
||||
G(sentence, current_sentence_delay, PyFloat_AsDouble);
|
||||
G(sentence_delay, current_sentence_delay, PyFloat_AsDouble);
|
||||
#undef G
|
||||
|
||||
PyObject *map = PyObject_GetAttrString(cfg, "phoneme_id_map");
|
||||
@ -168,14 +167,17 @@ set_voice(PyObject *self, PyObject *args) {
|
||||
PyObject *key, *value; Py_ssize_t pos = 0;
|
||||
while (PyDict_Next(map, &pos, &key, &value)) {
|
||||
unsigned long cp = PyLong_AsUnsignedLong(key);
|
||||
if (PyErr_Occurred()) break;
|
||||
std::vector<PhonemeId> ids;
|
||||
for (Py_ssize_t i = 0; i < PyList_GET_SIZE(value); i++) {
|
||||
unsigned long id = PyLong_AsUnsignedLong(PyList_GET_ITEM(value, i));
|
||||
if (PyErr_Occurred()) break;
|
||||
ids.push_back(id);
|
||||
}
|
||||
current_phoneme_id_map[cp] = ids;
|
||||
}
|
||||
Py_CLEAR(map);
|
||||
if (PyErr_Occurred()) return NULL;
|
||||
|
||||
// Load onnx model
|
||||
Py_BEGIN_ALLOW_THREADS;
|
||||
@ -369,8 +371,8 @@ next(PyObject *self, PyObject *args) {
|
||||
}
|
||||
if (data) {
|
||||
ans = Py_BuildValue(
|
||||
"NiiO", data, sizeof(float)*num_samples, num_samples, current_sample_rate,
|
||||
phoneme_id_queue.empty() ? Py_True : Py_False);
|
||||
"OiiO", data, num_samples, current_sample_rate, phoneme_id_queue.empty() ? Py_True : Py_False);
|
||||
Py_DECREF(data);
|
||||
}
|
||||
|
||||
// Clean up
|
||||
|
@ -1,7 +1,14 @@
|
||||
#!/usr/bin/env python
|
||||
# License: GPLv3 Copyright: 2025, Kovid Goyal <kovid at kovidgoyal.net>
|
||||
|
||||
import atexit
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from collections.abc import Callable
|
||||
from functools import partial
|
||||
from queue import Queue
|
||||
from threading import Lock, Thread
|
||||
from typing import Any, NamedTuple
|
||||
|
||||
import calibre_extensions.piper as piper
|
||||
@ -19,14 +26,14 @@ class VoiceConfig(NamedTuple):
|
||||
noise_scale: float
|
||||
noise_w: float
|
||||
num_speakers: int
|
||||
sentence_delay: float
|
||||
sentence_delay: float = 0
|
||||
|
||||
|
||||
def translate_voice_config(x: Any) -> VoiceConfig:
|
||||
phoneme_id_map: dict[int, list[int]] = {}
|
||||
for s, pid in x.get('phoneme_id_map', {}).items():
|
||||
for s, pids in x.get('phoneme_id_map', {}).items():
|
||||
if s:
|
||||
phoneme_id_map.setdefault(ord(s[0]), []).append(pid)
|
||||
phoneme_id_map.setdefault(ord(s[0]), []).extend(map(int, pids))
|
||||
inf = x.get('inference')
|
||||
|
||||
def g(d, prop, defval):
|
||||
@ -55,9 +62,159 @@ def espeak_data_dir() -> str:
|
||||
return '' # TODO: get the correct path when using frozen builds
|
||||
|
||||
|
||||
def set_voice(config_path: str, model_path:str, length_scale_multiplier: float = 0, sentence_delay: float = 0.2) -> None:
|
||||
piper.initialize(espeak_data_dir())
|
||||
def create_voice_config(config_path: str, length_scale_multiplier: float = 0, sentence_delay: float = 0.2) -> VoiceConfig:
    '''Load a voice config from disk and apply the speed multiplier and sentence delay.'''
    base = load_voice_config(config_path)
    # Clamp the multiplier to [-1, 1] and map it onto a length scale factor in
    # [2, 0.1]: -1 -> 2 (slower), 0 -> 1 (unchanged), 1 -> 0.1 (faster).
    clamped = min(max(length_scale_multiplier, -1), 1)
    factor = max(0.1, 1 - clamped)
    return base._replace(sentence_delay=sentence_delay, length_scale=base.length_scale * factor)
|
||||
|
||||
|
||||
def set_voice(config_path: str, model_path:str, length_scale_multiplier: float = 0, sentence_delay: float = 0.2) -> None:
    '''Configure the piper extension to use the voice described by config_path and model_path.'''
    voice = create_voice_config(config_path, length_scale_multiplier, sentence_delay)
    piper.set_voice(voice, model_path)
|
||||
|
||||
|
||||
class SynthesisResult(NamedTuple):
    '''One chunk of synthesized audio, delivered to the result callback.'''

    utterance_id: Any  # opaque identifier passed through from synthesize()
    bytes_per_sample: int  # 2 for 16-bit samples, 4 otherwise
    audio_data: bytes  # raw PCM audio data for this chunk
    num_samples: int  # number of samples contained in audio_data
    sample_rate: int  # samples per second of audio_data
    is_last: bool  # True for the final chunk of the utterance
|
||||
|
||||
|
||||
class Piper(Thread):
    '''Worker thread that drives the native piper TTS extension.

    All calls into the piper extension are serialized through a command queue
    and executed on this thread.  A monotonically increasing voice id is used
    to discard commands queued before the most recent set_voice()/cancel().
    '''

    def __init__(self):
        # Must run before any other piper call; espeak data is needed for
        # phonemization.
        piper.initialize(espeak_data_dir())
        Thread.__init__(self, name='PiperSynth', daemon=True)
        self.commands = Queue()  # items are (voice_id, callable-or-None)
        self.as_16bit_samples = True
        self._voice_id = 0  # guarded by self.lock
        self.lock = Lock()
        self.result_callback = lambda *a: None  # no-op until set_voice()
        self.start()

    @property
    def voice_id(self) -> int:
        # Current voice generation, read under the lock.
        with self.lock:
            ans = self._voice_id
        return ans

    def increment_voice_id(self) -> int:
        # Bump the voice generation, invalidating all previously queued
        # commands, and return the new value.
        with self.lock:
            self._voice_id += 1
            ans = self._voice_id
        return ans

    def run(self):
        # Worker loop: execute queued commands whose voice id is still
        # current; a None command is the shutdown sentinel.
        while True:
            voice_id, cmd = self.commands.get(True)
            if cmd is None:
                break
            if voice_id != self.voice_id:
                continue  # stale command from before a set_voice()/cancel()
            try:
                cmd()
            except Exception as e:
                import traceback
                # Failures are reported through the same callback as results.
                self.result_callback(None, e, traceback.format_exc())

    def shutdown(self):
        # Invalidate pending work, wake the worker with the sentinel, then
        # wait for the thread to exit.
        vid = self.increment_voice_id()
        self.commands.put((vid, None))
        self.join()

    def set_voice(
        self, result_callback: Callable[[SynthesisResult, Exception|None, str|None], None],
        config_path: str, model_path:str, length_scale_multiplier: float = 0, sentence_delay: float = 0.2,
        as_16bit_samples: bool = True,
    ) -> int:
        '''Queue a voice change and return the new voice's sample rate.

        result_callback is invoked on the worker thread with
        (SynthesisResult-or-None, exception-or-None, traceback-text-or-None).
        '''
        vid = self.increment_voice_id()
        self.result_callback = result_callback
        self.as_16bit_samples = as_16bit_samples
        # The config is parsed here so the sample rate can be returned
        # synchronously; loading the model happens on the worker thread.
        cfg = create_voice_config(config_path, length_scale_multiplier, sentence_delay)
        self.commands.put((vid, partial(self._set_voice, cfg, model_path)))
        return cfg.sample_rate

    def _set_voice(self, cfg, model_path):
        # Runs on the worker thread.
        piper.set_voice(cfg, model_path)

    def cancel(self) -> None:
        # Invalidate all queued commands and stop delivering results.
        self.increment_voice_id()
        self.result_callback = lambda *a: None

    def synthesize(self, utterance_id: Any, text: str) -> None:
        '''Queue text for synthesis; results arrive via the result callback.'''
        vid = self.voice_id
        self.commands.put((vid, partial(self._synthesize, vid, utterance_id, text)))

    def _synthesize(self, voice_id: int, utterance_id: Any, text: str) -> None:
        # Runs on the worker thread: stream audio chunks from the extension
        # until the utterance completes or the voice id goes stale.
        piper.start(text)
        bytes_per_sample = 2 if self.as_16bit_samples else 4
        while True:
            audio_data, num_samples, sample_rate, is_last = piper.next(self.as_16bit_samples)
            if self.voice_id == voice_id:
                self.result_callback(SynthesisResult(utterance_id, bytes_per_sample, audio_data, num_samples, sample_rate, is_last), None, None)
            else:
                break  # voice changed or cancelled: drop remaining chunks
            if is_last:
                break
|
||||
|
||||
|
||||
_global_piper_instance = None


def global_piper_instance() -> Piper:
    '''Return the process-wide Piper worker, creating it on first use.'''
    global _global_piper_instance
    if _global_piper_instance is None:
        instance = Piper()
        # Ensure the worker thread is torn down cleanly at interpreter exit.
        atexit.register(instance.shutdown)
        _global_piper_instance = instance
    return _global_piper_instance
|
||||
|
||||
|
||||
def play_wav_data(wav_data: bytes):
    '''Play WAV audio held in memory via Qt, blocking until playback finishes.'''
    from qt.core import QAudioOutput, QBuffer, QByteArray, QCoreApplication, QIODevice, QMediaPlayer, QUrl
    app = QCoreApplication([])
    player = QMediaPlayer()
    audio_out = QAudioOutput(player)
    player.setAudioOutput(audio_out)
    buf = QBuffer()
    buf.setData(QByteArray(wav_data))
    buf.open(QIODevice.OpenModeFlag.ReadOnly)
    # The URL only supplies a .wav name hint so Qt picks the right decoder.
    player.setSourceDevice(buf, QUrl.fromLocalFile('piper.wav'))
    player.mediaStatusChanged.connect(
        lambda status: app.quit() if status == QMediaPlayer.MediaStatus.EndOfMedia else print(player.playbackState(), status)
    )
    player.errorOccurred.connect(lambda e, s: (print(e, s, file=sys.stderr), app.quit()))
    player.play()
    app.exec()
|
||||
|
||||
|
||||
def develop():
    '''Manual test: synthesize a short text with the global Piper instance and
    play back the collected audio.'''
    from calibre.gui2.tts.piper import piper_cache_dir
    p = global_piper_instance()
    model_path = os.path.join(piper_cache_dir(), 'en_US-libritts-high.onnx')
    q = Queue()

    def synthesized(*args):
        q.put(args)

    sample_rate = p.set_voice(synthesized, model_path+'.json', model_path, sentence_delay=0.3)
    p.synthesize(1, 'Testing speech synthesis with piper. A second sentence.')
    all_data = []
    # Fix: the previous `while (args := q.get()):` tested the truthiness of a
    # non-empty tuple, which is always True — the condition could never end the
    # loop.  Make the real termination (the is_last flag) explicit.
    while True:
        sr, exc, tb = q.get()
        if exc is not None:
            print(tb, file=sys.stderr, flush=True)
            print(exc, file=sys.stderr, flush=True)
            raise SystemExit(1)
        all_data.append(sr.audio_data)
        print(f'Got {len(sr.audio_data)} bytes of audio data', flush=True)
        if sr.is_last:
            break
    from calibre_extensions.ffmpeg import wav_header_for_pcm_data
    pcm_data = b''.join(all_data)
    wav_data = wav_header_for_pcm_data(len(pcm_data), sample_rate) + pcm_data
    play_wav_data(wav_data)
|
||||
|
||||
|
||||
# Allow running this module directly for a quick manual synthesis test.
if __name__ == '__main__':
    develop()
|
||||
|
Loading…
x
Reference in New Issue
Block a user