Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-31 14:33:54 -04:00)

Get the new piper backend basically working

parent 7b19e19e29
commit f259c80710
@@ -77,7 +77,6 @@ initialize(PyObject *self, PyObject *args) {
         if (!normalize_data.func) return NULL;
         normalize_data.args = Py_BuildValue("(ss)", "NFD", "");
         if (!normalize_data.args) return NULL;
-        Py_DECREF(unicodedata);
     }
     Py_RETURN_NONE;
 }
@@ -151,15 +150,15 @@ set_voice(PyObject *self, PyObject *args) {
     #define G(name, dest, conv) { \
         PyObject *sr = PyObject_GetAttrString(cfg, #name); \
         if (!sr) return NULL; \
-        dest = conv(sr); \
-        Py_CLEAR(sr); \
+        dest = conv(sr); Py_CLEAR(sr); \
+        if (PyErr_Occurred()) return NULL; \
     }
     G(sample_rate, current_sample_rate, PyLong_AsLong);
     G(num_speakers, current_num_speakers, PyLong_AsLong);
     G(length_scale, current_length_scale, PyFloat_AsDouble);
     G(noise_scale, current_noise_scale, PyFloat_AsDouble);
     G(noise_w, current_noise_w, PyFloat_AsDouble);
-    G(sentence, current_sentence_delay, PyFloat_AsDouble);
+    G(sentence_delay, current_sentence_delay, PyFloat_AsDouble);
     #undef G

     PyObject *map = PyObject_GetAttrString(cfg, "phoneme_id_map");
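Background on the reworked G macro (not part of the diff itself): CPython's numeric converters such as PyLong_AsLong() and PyFloat_AsDouble() report failure by returning -1 and setting the error indicator, so the converted value alone cannot be trusted; the reference has to be released and PyErr_Occurred() consulted before continuing. A minimal standalone sketch of that pattern, with an illustrative helper name:

/* Sketch only: reads one numeric attribute the way the G macro above does. */
static int
read_long_attr(PyObject *cfg, const char *name, long *dest) {
    PyObject *val = PyObject_GetAttrString(cfg, name);
    if (!val) return -1;              /* attribute lookup failed, exception already set */
    *dest = PyLong_AsLong(val);       /* returns -1 and sets an exception on failure */
    Py_CLEAR(val);                    /* release the reference before the error check */
    if (PyErr_Occurred()) return -1;  /* distinguishes a genuine -1 from a conversion error */
    return 0;
}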
@@ -168,14 +167,17 @@ set_voice(PyObject *self, PyObject *args) {
     PyObject *key, *value; Py_ssize_t pos = 0;
     while (PyDict_Next(map, &pos, &key, &value)) {
         unsigned long cp = PyLong_AsUnsignedLong(key);
+        if (PyErr_Occurred()) break;
         std::vector<PhonemeId> ids;
         for (Py_ssize_t i = 0; i < PyList_GET_SIZE(value); i++) {
             unsigned long id = PyLong_AsUnsignedLong(PyList_GET_ITEM(value, i));
+            if (PyErr_Occurred()) break;
             ids.push_back(id);
         }
         current_phoneme_id_map[cp] = ids;
     }
     Py_CLEAR(map);
+    if (PyErr_Occurred()) return NULL;

     // Load onnx model
     Py_BEGIN_ALLOW_THREADS;
@@ -369,8 +371,8 @@ next(PyObject *self, PyObject *args) {
     }
     if (data) {
         ans = Py_BuildValue(
-            "NiiO", data, sizeof(float)*num_samples, num_samples, current_sample_rate,
-            phoneme_id_queue.empty() ? Py_True : Py_False);
+            "OiiO", data, num_samples, current_sample_rate, phoneme_id_queue.empty() ? Py_True : Py_False);
+        Py_DECREF(data);
     }

     // Clean up
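A note on the Py_BuildValue change above (not part of the diff itself): the "N" format code steals the reference to the object it is given, whereas "O" makes Py_BuildValue take its own reference and leaves the caller responsible for the one it already holds, which is why a Py_DECREF(data) now follows the call. A minimal sketch of the "O" convention, with illustrative names:

/* Sketch only: with "O", Py_BuildValue INCREFs data, so the owned reference
 * passed into this helper must still be released. With "N" it would be stolen
 * and the Py_DECREF below would be a bug. */
static PyObject*
build_result(PyObject *data /* owned reference */, int num_samples, int sample_rate, int is_last) {
    PyObject *ans = Py_BuildValue("OiiO", data, num_samples, sample_rate, is_last ? Py_True : Py_False);
    Py_DECREF(data);  /* release our reference whether or not Py_BuildValue succeeded */
    return ans;       /* NULL with an exception set on failure */
}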

@@ -1,7 +1,14 @@
 #!/usr/bin/env python
 # License: GPLv3 Copyright: 2025, Kovid Goyal <kovid at kovidgoyal.net>

+import atexit
 import json
+import os
+import sys
+from collections.abc import Callable
+from functools import partial
+from queue import Queue
+from threading import Lock, Thread
 from typing import Any, NamedTuple

 import calibre_extensions.piper as piper
@@ -19,14 +26,14 @@ class VoiceConfig(NamedTuple):
     noise_scale: float
     noise_w: float
     num_speakers: int
-    sentence_delay: float
+    sentence_delay: float = 0


 def translate_voice_config(x: Any) -> VoiceConfig:
     phoneme_id_map: dict[int, list[int]] = {}
-    for s, pid in x.get('phoneme_id_map', {}).items():
+    for s, pids in x.get('phoneme_id_map', {}).items():
         if s:
-            phoneme_id_map.setdefault(ord(s[0]), []).append(pid)
+            phoneme_id_map.setdefault(ord(s[0]), []).extend(map(int, pids))
     inf = x.get('inference')

     def g(d, prop, defval):
@@ -55,9 +62,159 @@ def espeak_data_dir() -> str:
     return '' # TODO: get the correct path when using frozen builds


-def set_voice(config_path: str, model_path:str, length_scale_multiplier: float = 0, sentence_delay: float = 0.2) -> None:
-    piper.initialize(espeak_data_dir())
+def create_voice_config(config_path: str, length_scale_multiplier: float = 0, sentence_delay: float = 0.2) -> VoiceConfig:
     cfg = load_voice_config(config_path)
     m = max(0.1, 1 + -1 * max(-1, min(length_scale_multiplier, 1)))  # maps -1 to 1 to 2 to 0.1
     cfg = cfg._replace(sentence_delay=sentence_delay, length_scale=cfg.length_scale * m)
+    return cfg
+
+
+def set_voice(config_path: str, model_path:str, length_scale_multiplier: float = 0, sentence_delay: float = 0.2) -> None:
+    cfg = create_voice_config(config_path, length_scale_multiplier, sentence_delay)
     piper.set_voice(cfg, model_path)
+
+
+class SynthesisResult(NamedTuple):
+    utterance_id: Any
+    bytes_per_sample: int
+    audio_data: bytes
+    num_samples: int
+    sample_rate: int
+    is_last: bool
+
+
+class Piper(Thread):
+
+    def __init__(self):
+        piper.initialize(espeak_data_dir())
+        Thread.__init__(self, name='PiperSynth', daemon=True)
+        self.commands = Queue()
+        self.as_16bit_samples = True
+        self._voice_id = 0
+        self.lock = Lock()
+        self.result_callback = lambda *a: None
+        self.start()
+
+    @property
+    def voice_id(self) -> int:
+        with self.lock:
+            ans = self._voice_id
+        return ans
+
+    def increment_voice_id(self) -> int:
+        with self.lock:
+            self._voice_id += 1
+            ans = self._voice_id
+        return ans
+
+    def run(self):
+        while True:
+            voice_id, cmd = self.commands.get(True)
+            if cmd is None:
+                break
+            if voice_id != self.voice_id:
+                continue
+            try:
+                cmd()
+            except Exception as e:
+                import traceback
+                self.result_callback(None, e, traceback.format_exc())
+
+    def shutdown(self):
+        vid = self.increment_voice_id()
+        self.commands.put((vid, None))
+        self.join()
+
+    def set_voice(
+        self, result_callback: Callable[[SynthesisResult, Exception|None, str|None], None],
+        config_path: str, model_path:str, length_scale_multiplier: float = 0, sentence_delay: float = 0.2,
+        as_16bit_samples: bool = True,
+    ) -> int:
+        vid = self.increment_voice_id()
+        self.result_callback = result_callback
+        self.as_16bit_samples = as_16bit_samples
+        cfg = create_voice_config(config_path, length_scale_multiplier, sentence_delay)
+        self.commands.put((vid, partial(self._set_voice, cfg, model_path)))
+        return cfg.sample_rate
+
+    def _set_voice(self, cfg, model_path):
+        piper.set_voice(cfg, model_path)
+
+    def cancel(self) -> None:
+        self.increment_voice_id()
+        self.result_callback = lambda *a: None
+
+    def synthesize(self, utterance_id: Any, text: str) -> None:
+        vid = self.voice_id
+        self.commands.put((vid, partial(self._synthesize, vid, utterance_id, text)))
+
+    def _synthesize(self, voice_id: int, utterance_id: Any, text: str) -> None:
+        piper.start(text)
+        bytes_per_sample = 2 if self.as_16bit_samples else 4
+        while True:
+            audio_data, num_samples, sample_rate, is_last = piper.next(self.as_16bit_samples)
+            if self.voice_id == voice_id:
+                self.result_callback(SynthesisResult(utterance_id, bytes_per_sample, audio_data, num_samples, sample_rate, is_last), None, None)
+            else:
+                break
+            if is_last:
+                break
+
+
+_global_piper_instance = None
+
+
+def global_piper_instance() -> Piper:
+    global _global_piper_instance
+    if _global_piper_instance is None:
+        _global_piper_instance = Piper()
+        atexit.register(_global_piper_instance.shutdown)
+    return _global_piper_instance
+
+
+def play_wav_data(wav_data: bytes):
+    from qt.core import QAudioOutput, QBuffer, QByteArray, QCoreApplication, QIODevice, QMediaPlayer, QUrl
+    app = QCoreApplication([])
+    m = QMediaPlayer()
+    ao = QAudioOutput(m)
+    m.setAudioOutput(ao)
+    qbuffer = QBuffer()
+    qbuffer.setData(QByteArray(wav_data))
+    qbuffer.open(QIODevice.OpenModeFlag.ReadOnly)
+    m.setSourceDevice(qbuffer, QUrl.fromLocalFile('piper.wav'))
+    m.mediaStatusChanged.connect(
+        lambda status: app.quit() if status == QMediaPlayer.MediaStatus.EndOfMedia else print(m.playbackState(), status)
+    )
+    m.errorOccurred.connect(lambda e, s: (print(e, s, file=sys.stderr), app.quit()))
+    m.play()
+    app.exec()
+
+
+def develop():
+    from calibre.gui2.tts.piper import piper_cache_dir
+    p = global_piper_instance()
+    model_path = os.path.join(piper_cache_dir(), 'en_US-libritts-high.onnx')
+    q = Queue()
+    def synthesized(*args):
+        q.put(args)
+    sample_rate = p.set_voice(synthesized, model_path+'.json', model_path, sentence_delay=0.3)
+    p.synthesize(1, 'Testing speech synthesis with piper. A second sentence.')
+    all_data = []
+    while (args := q.get()):
+        sr, exc, tb = args
+        if exc is not None:
+            print(tb, file=sys.stderr, flush=True)
+            print(exc, file=sys.stderr, flush=True)
+            raise SystemExit(1)
+        all_data.append(sr.audio_data)
+        print(f'Got {len(sr.audio_data)} bytes of audio data', flush=True)
+        if sr.is_last:
+            break
+    from calibre_extensions.ffmpeg import wav_header_for_pcm_data
+    pcm_data = b''.join(all_data)
+    wav_data = wav_header_for_pcm_data(len(pcm_data), sample_rate) + pcm_data
+    play_wav_data(wav_data)
+
+
+if __name__ == '__main__':
+    develop()