mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-31 14:33:54 -04:00
Function to load ONNX model from piper options
This commit is contained in:
parent
d2ead9eaaa
commit
b6bfc6c66f
@ -507,19 +507,3 @@ def bundled_binaries_dir() -> str:
|
|||||||
if (islinux or isbsd) and getattr(sys, 'frozen', False):
|
if (islinux or isbsd) and getattr(sys, 'frozen', False):
|
||||||
return os.path.join(sys.executables_location, 'bin')
|
return os.path.join(sys.executables_location, 'bin')
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
|
|
||||||
@lru_cache(2)
def piper_cmdline() -> tuple[str, ...]:
    """Return the command line used to launch the piper executable.

    The executable is looked up, in order, in:

    1. the bundled binaries directory (``sys.frameworks_dir`` on macOS),
    2. the directory named by the ``PIPER_TTS_DIR`` environment variable,
    3. ``PATH``, under the name ``piper-tts``.

    An empty tuple is returned when none of these yields a candidate.
    """
    suffix = '.exe' if iswindows else ''

    bundled = bundled_binaries_dir()
    if bundled:
        if ismacos:
            exe_path = os.path.join(sys.frameworks_dir, 'piper', 'piper')
        else:
            exe_path = os.path.join(bundled, 'piper', 'piper' + suffix)
        return (exe_path,)

    override_dir = os.environ.get('PIPER_TTS_DIR')
    if override_dir:
        return (os.path.join(override_dir, 'piper' + suffix),)

    # Imported lazily: only needed when falling back to a PATH search.
    import shutil
    found = shutil.which('piper-tts')
    return (found,) if found else ()
|
|
||||||
|
@ -8,6 +8,10 @@
|
|||||||
|
|
||||||
#include <Python.h>
|
#include <Python.h>
|
||||||
#include <espeak-ng/speak_lib.h>
|
#include <espeak-ng/speak_lib.h>
|
||||||
|
#include <vector>
|
||||||
|
#include <map>
|
||||||
|
#include <memory>
|
||||||
|
#include <onnxruntime_cxx_api.h>
|
||||||
|
|
||||||
#define CLAUSE_INTONATION_FULL_STOP 0x00000000
|
#define CLAUSE_INTONATION_FULL_STOP 0x00000000
|
||||||
#define CLAUSE_INTONATION_COMMA 0x00001000
|
#define CLAUSE_INTONATION_COMMA 0x00001000
|
||||||
@ -25,27 +29,42 @@
|
|||||||
#define CLAUSE_COLON (30 | CLAUSE_INTONATION_FULL_STOP | CLAUSE_TYPE_CLAUSE)
|
#define CLAUSE_COLON (30 | CLAUSE_INTONATION_FULL_STOP | CLAUSE_TYPE_CLAUSE)
|
||||||
#define CLAUSE_SEMICOLON (30 | CLAUSE_INTONATION_COMMA | CLAUSE_TYPE_CLAUSE)
|
#define CLAUSE_SEMICOLON (30 | CLAUSE_INTONATION_COMMA | CLAUSE_TYPE_CLAUSE)
|
||||||
|
|
||||||
|
typedef char32_t Phoneme;
|
||||||
|
typedef int64_t PhonemeId;
|
||||||
|
typedef int64_t SpeakerId;
|
||||||
|
typedef std::map<Phoneme, std::vector<PhonemeId>> PhonemeIdMap;
|
||||||
|
|
||||||
static bool initialized = false, voice_set = false;
|
static bool initialized = false, voice_set = false;
|
||||||
|
static char espeak_data_dir[512] = {0};
|
||||||
|
static PhonemeIdMap current_phoneme_id_map;
|
||||||
|
static int current_sample_rate = 0;
|
||||||
|
static int current_num_speakers = 1;
|
||||||
|
static float current_length_scale = 1;
|
||||||
|
static float current_noise_scale = 1;
|
||||||
|
static float current_noise_w = 1;
|
||||||
|
std::unique_ptr<Ort::Session> session;
|
||||||
|
|
||||||
// initialize(espeak_data_dir="") -> None
//
// Initialize espeak-ng in synchronous mode. The call is a no-op when espeak-ng
// has already been initialized with the same data directory; a different
// directory triggers a fresh espeak_Initialize(). An empty or omitted path
// selects the espeak-ng default data location.
//
// Raises ValueError when espeak-ng fails to initialize.
static PyObject*
initialize(PyObject *self, PyObject *args) {
    const char *path = "";
    if (!PyArg_ParseTuple(args, "|s", &path)) return NULL;
    if (!initialized || strcmp(espeak_data_dir, path) != 0) {
        // espeak-ng expects NULL, not "", to mean "use the default data directory".
        if (espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS, 0, path[0] ? path : NULL, 0) < 0) {
            // path is never NULL here (it defaults to ""), so test for the
            // empty string in order to report the default location properly.
            PyErr_Format(PyExc_ValueError, "Could not initialize espeak-ng with datadir: %s", path[0] ? path : "<default>");
            return NULL;
        }
        initialized = true;
        // Remember the directory so that repeated calls with the same path are cheap.
        // NOTE(review): paths of 512 bytes or more are truncated here, so they
        // never compare equal above and cause re-initialization on every call.
        snprintf(espeak_data_dir, sizeof(espeak_data_dir), "%s", path);
    }
    Py_RETURN_NONE;
}
|
||||||
|
|
||||||
static PyObject*
|
static PyObject*
|
||||||
set_espeak_voice_by_name(PyObject *self, PyObject *pyname) {
|
set_espeak_voice_by_name(PyObject *self, PyObject *pyname) {
|
||||||
if (!PyUnicode_Check(pyname)) { PyErr_SetString(PyExc_TypeError, "name must be a unicode string"); return NULL; }
|
if (!PyUnicode_Check(pyname)) { PyErr_SetString(PyExc_TypeError, "espeak voice name must be a unicode string"); return NULL; }
|
||||||
if (!initialized) { PyErr_SetString(PyExc_Exception, "must call initialize() first"); return NULL; }
|
if (!initialized) { PyErr_SetString(PyExc_Exception, "must call initialize() first"); return NULL; }
|
||||||
if (espeak_SetVoiceByName(PyUnicode_AsUTF8(pyname)) < 0) {
|
if (espeak_SetVoiceByName(PyUnicode_AsUTF8(pyname)) < 0) {
|
||||||
PyErr_Format(PyExc_ValueError, "failed to set voice: %U", pyname);
|
PyErr_Format(PyExc_ValueError, "failed to set espeak voice: %U", pyname);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
Py_RETURN_NONE;
|
Py_RETURN_NONE;
|
||||||
@ -86,12 +105,67 @@ phonemize(PyObject *self, PyObject *pytext) {
|
|||||||
return phonemes_and_terminators;
|
return phonemes_and_terminators;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// set_voice(voice_config, model_path) -> None
//
// Select the espeak voice named by voice_config.espeak_voice_name, copy the
// synthesis parameters (sample_rate, num_speakers, length_scale, noise_scale,
// noise_w, phoneme_id_map) out of voice_config into module state and load the
// ONNX model at model_path into the module level session.
//
// The configuration is fully parsed and validated before any module state is
// modified, so an invalid voice_config leaves the previous parameters intact.
// Any error is reported as a Python exception; if loading the model fails the
// previously loaded session has already been released.
static PyObject*
set_voice(PyObject *self, PyObject *args) {
    PyObject *cfg; const char *model_path;
    if (!PyArg_ParseTuple(args, "Os", &cfg, &model_path)) return NULL;

    PyObject *evn = PyObject_GetAttrString(cfg, "espeak_voice_name");
    if (!evn) return NULL;
    PyObject *ret = set_espeak_voice_by_name(NULL, evn);
    Py_CLEAR(evn);
    if (ret == NULL) return NULL;
    Py_DECREF(ret);

    // Read the scalar settings into locals first. The conversion functions
    // signal failure only through the error indicator, so it must be checked
    // after every conversion.
    long new_sample_rate = 0, new_num_speakers = 0;
    double new_length_scale = 0, new_noise_scale = 0, new_noise_w = 0;
#define G(name, dest, conv) { \
    PyObject *attr = PyObject_GetAttrString(cfg, #name); \
    if (!attr) return NULL; \
    dest = conv(attr); \
    Py_CLEAR(attr); \
    if (PyErr_Occurred()) return NULL; \
}
    G(sample_rate, new_sample_rate, PyLong_AsLong);
    G(num_speakers, new_num_speakers, PyLong_AsLong);
    G(length_scale, new_length_scale, PyFloat_AsDouble);
    G(noise_scale, new_noise_scale, PyFloat_AsDouble);
    G(noise_w, new_noise_w, PyFloat_AsDouble);
#undef G

    // Convert the phoneme id map: dict[int code point, list[int id]].
    PyObject *map = PyObject_GetAttrString(cfg, "phoneme_id_map");
    if (!map) return NULL;
    PhonemeIdMap new_phoneme_id_map;
    bool ok = PyDict_Check(map) != 0;
    if (!ok) PyErr_SetString(PyExc_TypeError, "phoneme_id_map must be a dict");
    try {
        PyObject *key, *value; Py_ssize_t pos = 0;
        // key and value are borrowed references owned by the dict.
        while (ok && PyDict_Next(map, &pos, &key, &value)) {
            unsigned long cp = PyLong_AsUnsignedLong(key);
            if (cp == (unsigned long)-1 && PyErr_Occurred()) { ok = false; break; }
            if (!PyList_Check(value)) {
                PyErr_SetString(PyExc_TypeError, "phoneme_id_map values must be lists of integers");
                ok = false; break;
            }
            std::vector<PhonemeId> ids;
            const Py_ssize_t count = PyList_GET_SIZE(value);
            ids.reserve(count);
            for (Py_ssize_t i = 0; i < count; i++) {
                unsigned long id = PyLong_AsUnsignedLong(PyList_GET_ITEM(value, i));
                if (id == (unsigned long)-1 && PyErr_Occurred()) { ok = false; break; }
                ids.push_back(id);
            }
            if (!ok) break;
            new_phoneme_id_map[cp].swap(ids);
        }
    } catch (...) {
        // Container allocation failed; never let a C++ exception reach the interpreter.
        if (!PyErr_Occurred()) PyErr_NoMemory();
        ok = false;
    }
    Py_CLEAR(map);
    if (!ok) return NULL;

    // Everything parsed successfully, commit the new voice parameters.
    current_sample_rate = (int)new_sample_rate;
    current_num_speakers = (int)new_num_speakers;
    current_length_scale = (float)new_length_scale;
    current_noise_scale = (float)new_noise_scale;
    current_noise_w = (float)new_noise_w;
    current_phoneme_id_map.swap(new_phoneme_id_map);

    // Load onnx model. The previous session is dropped before the new one is created.
    session.reset();
    try {
        Ort::SessionOptions opts;
        opts.DisableCpuMemArena();
        opts.DisableMemPattern();
        opts.DisableProfiling();
        // NOTE(review): this Env is destroyed when the block exits while the
        // session lives on; confirm ONNX Runtime permits that.
        // NOTE(review): on Windows ORTCHAR_T is wchar_t, so model_path will
        // need converting there.
        Ort::Env ort_env{ORT_LOGGING_LEVEL_WARNING, "piper"};
        session = std::make_unique<Ort::Session>(ort_env, model_path, opts);
    } catch (const Ort::Exception &e) {
        PyErr_Format(PyExc_ValueError, "Failed to load ONNX model from %s with error: %s", model_path, e.what());
        return NULL;
    } catch (...) {
        PyErr_Format(PyExc_RuntimeError, "Failed to load ONNX model from %s with an unknown C++ exception", model_path);
        return NULL;
    }

    Py_RETURN_NONE;
}
|
||||||
|
|
||||||
// Boilerplate {{{
|
// Boilerplate {{{
|
||||||
static char doc[] = "Text to speech using the Piper TTS models";
|
static char doc[] = "Text to speech using the Piper TTS models";
|
||||||
static PyMethodDef methods[] = {
|
static PyMethodDef methods[] = {
|
||||||
{"initialize", (PyCFunction)initialize, METH_VARARGS,
|
{"initialize", (PyCFunction)initialize, METH_VARARGS,
|
||||||
"initialize(espeak_data_dir) -> Initialize this module. Must be called once before using any other functions from this module. If espeak_data_dir is not specified or is the mepty string the default data location is used."
|
"initialize(espeak_data_dir) -> Initialize this module. Must be called once before using any other functions from this module. If espeak_data_dir is not specified or is the mepty string the default data location is used."
|
||||||
},
|
},
|
||||||
|
{"set_voice", (PyCFunction)set_voice, METH_VARARGS,
|
||||||
|
"set_voice(voice_config, model_path) -> Load the model in preparation for synthesis."
|
||||||
|
},
|
||||||
{"set_espeak_voice_by_name", (PyCFunction)set_espeak_voice_by_name, METH_O,
|
{"set_espeak_voice_by_name", (PyCFunction)set_espeak_voice_by_name, METH_O,
|
||||||
"set_espeak_voice_by_name(name) -> Set the voice to be used to phonemize text"
|
"set_espeak_voice_by_name(name) -> Set the voice to be used to phonemize text"
|
||||||
},
|
},
|
||||||
@ -117,6 +191,8 @@ cleanup_module(void*) {
|
|||||||
voice_set = false;
|
voice_set = false;
|
||||||
espeak_Terminate();
|
espeak_Terminate();
|
||||||
}
|
}
|
||||||
|
current_phoneme_id_map.clear();
|
||||||
|
session.reset();
|
||||||
}
|
}
|
||||||
|
|
||||||
CALIBRE_MODINIT_FUNC PyInit_piper(void) {
|
CALIBRE_MODINIT_FUNC PyInit_piper(void) {
|
||||||
|
@ -1,6 +1,63 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
# License: GPLv3 Copyright: 2025, Kovid Goyal <kovid at kovidgoyal.net>
|
# License: GPLv3 Copyright: 2025, Kovid Goyal <kovid at kovidgoyal.net>
|
||||||
|
|
||||||
|
import json
|
||||||
|
from typing import Any, NamedTuple
|
||||||
|
|
||||||
import calibre_extensions.piper as piper
|
import calibre_extensions.piper as piper
|
||||||
|
|
||||||
piper
|
DEFAULT_LENGTH_SCALE = 1.0
|
||||||
|
DEFAULT_NOISE_SCALE = 0.667
|
||||||
|
DEFAULT_NOISE_W_SCALE = 0.8
|
||||||
|
|
||||||
|
|
||||||
|
class VoiceConfig(NamedTuple):
    """Settings describing a voice, as handed to the piper extension module."""

    # Name of the espeak voice used to phonemize text
    espeak_voice_name: str
    # Sample rate of the voice's audio
    sample_rate: int
    # Unicode code point of a phoneme -> list of phoneme ids
    phoneme_id_map: dict[int, list[int]]
    # Inference parameters
    length_scale: float
    noise_scale: float
    noise_w: float
    # Number of speakers the voice has
    num_speakers: int

    # Optional, defaults to no delay
    sentence_delay: float = 0
|
||||||
|
|
||||||
|
|
||||||
|
def translate_voice_config(x: Any) -> VoiceConfig:
    """Convert a decoded voice configuration mapping into a VoiceConfig.

    ``x`` is the dict obtained from the voice's JSON config file. Missing (or
    null) sections fall back to defaults: espeak voice ``en-us``, sample rate
    22050, a single speaker and the module level ``DEFAULT_*`` inference scales.

    Each ``phoneme_id_map`` key is reduced to the code point of its first
    character and the ids of all keys sharing that code point are collected
    into one flat list of ints.
    """
    phoneme_id_map: dict[int, list[int]] = {}
    for s, pids in (x.get('phoneme_id_map') or {}).items():
        if not s:
            continue
        ids = phoneme_id_map.setdefault(ord(s[0]), [])
        # The ids for a phoneme are themselves a list, so they must be spliced
        # in rather than appended as one nested element. A bare integer is
        # accepted as well.
        if isinstance(pids, (list, tuple)):
            ids.extend(int(p) for p in pids)
        else:
            ids.append(int(pids))

    def section(name: str) -> dict:
        # An absent or null section behaves like an empty one.
        return x.get(name) or {}

    inf = section('inference')

    return VoiceConfig(
        espeak_voice_name=section('espeak').get('voice') or 'en-us',
        sample_rate=int(section('audio').get('sample_rate', 22050)),
        phoneme_id_map=phoneme_id_map,
        length_scale=float(inf.get('length_scale', DEFAULT_LENGTH_SCALE)),
        noise_scale=float(inf.get('noise_scale', DEFAULT_NOISE_SCALE)),
        noise_w=float(inf.get('noise_w', DEFAULT_NOISE_W_SCALE)),
        num_speakers=int(x.get('num_speakers', 1)),
    )
|
||||||
|
|
||||||
|
|
||||||
|
def load_voice_config(path: str) -> VoiceConfig:
    """Read the JSON voice configuration file at ``path`` and translate it into a VoiceConfig."""
    with open(path, 'rb') as src:
        raw = json.load(src)
    return translate_voice_config(raw)
|
||||||
|
|
||||||
|
|
||||||
|
def espeak_data_dir() -> str:
    """Return the espeak data directory to initialize piper with.

    Currently this is always the empty string.
    """
    # TODO: get the correct path when using frozen builds
    return ''
|
||||||
|
|
||||||
|
|
||||||
|
def set_voice(config_path: str, model_path:str, length_scale_multiplier: float, sentence_delay: float) -> None:
    """Initialize piper and make the voice described by ``config_path`` current.

    The voice's configured length scale is multiplied by
    ``length_scale_multiplier`` and ``sentence_delay`` is stored in the
    configuration before it is handed, together with ``model_path``, to the
    piper extension module.
    """
    piper.initialize(espeak_data_dir())
    base = load_voice_config(config_path)
    scaled_length = base.length_scale * length_scale_multiplier
    effective = base._replace(sentence_delay=sentence_delay, length_scale=scaled_length)
    piper.set_voice(effective, model_path)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user