diff --git a/src/calibre/constants.py b/src/calibre/constants.py
index 7ac71da553..dc7c0ff11f 100644
--- a/src/calibre/constants.py
+++ b/src/calibre/constants.py
@@ -507,19 +507,3 @@ def bundled_binaries_dir() -> str:
     if (islinux or isbsd) and getattr(sys, 'frozen', False):
         return os.path.join(sys.executables_location, 'bin')
     return ''
-
-
-@lru_cache(2)
-def piper_cmdline() -> tuple[str, ...]:
-    ext = '.exe' if iswindows else ''
-    if bbd := bundled_binaries_dir():
-        if ismacos:
-            return (os.path.join(sys.frameworks_dir, 'piper', 'piper'),)
-        return (os.path.join(bbd, 'piper', 'piper' + ext),)
-    if pd := os.environ.get('PIPER_TTS_DIR'):
-        return (os.path.join(pd, 'piper' + ext),)
-    import shutil
-    exe = shutil.which('piper-tts')
-    if exe:
-        return (exe,)
-    return ()
diff --git a/src/calibre/utils/tts/piper.cpp b/src/calibre/utils/tts/piper.cpp
index aa492ae76c..de363d26a4 100644
--- a/src/calibre/utils/tts/piper.cpp
+++ b/src/calibre/utils/tts/piper.cpp
@@ -8,6 +8,10 @@
 
 #include <Python.h>
 #include <espeak-ng/speak_lib.h>
+#include <vector>
+#include <map>
+#include <memory>
+#include <onnxruntime_cxx_api.h>
 
 #define CLAUSE_INTONATION_FULL_STOP 0x00000000
 #define CLAUSE_INTONATION_COMMA 0x00001000
@@ -25,27 +29,42 @@
 #define CLAUSE_COLON (30 | CLAUSE_INTONATION_FULL_STOP | CLAUSE_TYPE_CLAUSE)
 #define CLAUSE_SEMICOLON (30 | CLAUSE_INTONATION_COMMA | CLAUSE_TYPE_CLAUSE)
 
+typedef char32_t Phoneme;  // phonemes are stored as Unicode code points
+typedef int64_t PhonemeId;
+typedef int64_t SpeakerId;
+typedef std::map<Phoneme, std::vector<PhonemeId>> PhonemeIdMap;  // phoneme -> ids listed for it in the voice config
+
 static bool initialized = false, voice_set = false;
+static char espeak_data_dir[512] = {0};  // data dir passed to the last successful initialize()
+static PhonemeIdMap current_phoneme_id_map;  // this and the current_* values below are filled in by set_voice()
+static int current_sample_rate = 0;
+static int current_num_speakers = 1;
+static float current_length_scale = 1;
+static float current_noise_scale = 1;
+static float current_noise_w  = 1;
+std::unique_ptr<Ort::Session> session;  // ONNX session of the current voice model, replaced by set_voice()
 
 static PyObject*
 initialize(PyObject *self, PyObject *args) {
-    const char *path = NULL;
+    const char *path = "";  // "" (never NULL) selects the default espeak data location
     if (!PyArg_ParseTuple(args, "|s", &path)) return NULL;
-    if (initialized) { PyErr_SetString(PyExc_Exception, "initialize() already called"); return NULL; }
-    if (path && !path[0]) path = NULL;  // use default path
-    if (espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS, 0, path, 0) < 0) {
-        PyErr_Format(PyExc_ValueError, "Could not initialize espeak-ng with datadir: %s", path ? path : "<default>");
-        return NULL;
+    if (!initialized || strcmp(espeak_data_dir, path) != 0) {  // first call, or the data dir changed
+        if (espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS, 0, path[0] ? path : NULL, 0) < 0) {
+            PyErr_Format(PyExc_ValueError, "Could not initialize espeak-ng with datadir: %s", path[0] ? path : "<default>");
+            return NULL;
+        }
+        initialized = true;
+        snprintf(espeak_data_dir, sizeof(espeak_data_dir), "%s", path);  // remembered to detect a later change
     }
     Py_RETURN_NONE;
 }
 
 static PyObject*
 set_espeak_voice_by_name(PyObject *self, PyObject *pyname) {
-    if (!PyUnicode_Check(pyname)) { PyErr_SetString(PyExc_TypeError, "name must be a unicode string"); return NULL; }
+    if (!PyUnicode_Check(pyname)) { PyErr_SetString(PyExc_TypeError, "espeak voice name must be a unicode string"); return NULL; }
     if (!initialized) { PyErr_SetString(PyExc_Exception, "must call initialize() first"); return NULL; }
     if (espeak_SetVoiceByName(PyUnicode_AsUTF8(pyname)) < 0) {
-        PyErr_Format(PyExc_ValueError, "failed to set voice: %U", pyname);
+        PyErr_Format(PyExc_ValueError, "failed to set espeak voice: %U", pyname);
         return NULL;
     }
     Py_RETURN_NONE;
@@ -86,12 +105,67 @@ phonemize(PyObject *self, PyObject *pytext) {
     return phonemes_and_terminators;
 }
 
+static PyObject*
+set_voice(PyObject *self, PyObject *args) {  // args: (voice_config, model_path), see the entry in methods[]
+    PyObject *cfg; const char *model_path;
+    if (!PyArg_ParseTuple(args, "Os", &cfg, &model_path)) return NULL;
+    PyObject *evn = PyObject_GetAttrString(cfg, "espeak_voice_name");  // espeak voice used to phonemize text
+    if (!evn) return NULL;
+    PyObject *ret = set_espeak_voice_by_name(NULL, evn);
+    Py_CLEAR(evn);
+    if (ret == NULL) return NULL;
+    Py_DECREF(ret);
+#define G(name, dest, conv) { \
+        PyObject *sr = PyObject_GetAttrString(cfg, #name); \
+        if (!sr) return NULL; \
+        dest = conv(sr); \
+        Py_CLEAR(sr); \
+        if (PyErr_Occurred()) return NULL; /* conv() reports failure only through the error indicator */ \
+}
+    G(sample_rate, current_sample_rate, PyLong_AsLong);
+    G(num_speakers, current_num_speakers, PyLong_AsLong);
+    G(length_scale, current_length_scale, PyFloat_AsDouble);
+    G(noise_scale, current_noise_scale, PyFloat_AsDouble);
+    G(noise_w, current_noise_w, PyFloat_AsDouble);
+#undef G
+
+    PyObject *map = PyObject_GetAttrString(cfg, "phoneme_id_map");
+    if (!map) return NULL;
+    if (!PyDict_Check(map)) { Py_DECREF(map); PyErr_SetString(PyExc_TypeError, "phoneme_id_map must be a dict"); return NULL; }
+    current_phoneme_id_map.clear();
+    PyObject *key, *value; Py_ssize_t pos = 0;  // key and value are borrowed references
+    while (!PyErr_Occurred() && PyDict_Next(map, &pos, &key, &value)) {
+        if (!PyList_Check(value)) { PyErr_SetString(PyExc_TypeError, "phoneme_id_map values must be lists of integers"); break; }
+        unsigned long cp = PyLong_AsUnsignedLong(key);
+        std::vector<PhonemeId> ids;
+        for (Py_ssize_t i = 0; !PyErr_Occurred() && i < PyList_GET_SIZE(value); i++) ids.push_back(PyLong_AsUnsignedLong(PyList_GET_ITEM(value, i)));
+        current_phoneme_id_map[cp] = ids;
+    }
+    Py_CLEAR(map);
+    if (PyErr_Occurred()) { current_phoneme_id_map.clear(); return NULL; }  // a key, id or value had the wrong type
+
+    // Load onnx model
+    try {  // the ONNX Runtime C++ API reports errors by throwing, which must not escape into the interpreter
+        Ort::SessionOptions opts;
+        opts.DisableCpuMemArena(); opts.DisableMemPattern(); opts.DisableProfiling();
+        Ort::Env ort_env{ORT_LOGGING_LEVEL_WARNING, "piper"};
+        session.reset();  // release the previous model before loading the new one
+        session = std::make_unique<Ort::Session>(Ort::Session(ort_env, model_path, opts));
+    } catch (const std::exception &e) {  // Ort::Exception derives from std::exception
+        return PyErr_Format(PyExc_RuntimeError, "Failed to load the ONNX model %s: %s", model_path, e.what());
+    }
+    Py_RETURN_NONE;
+}
+
 // Boilerplate {{{
 static char doc[] = "Text to speech using the Piper TTS models";
 static PyMethodDef methods[] = {
     {"initialize", (PyCFunction)initialize, METH_VARARGS,
      "initialize(espeak_data_dir) -> Initialize this module. Must be called once before using any other functions from this module. If espeak_data_dir is not specified or is the mepty string the default data location is used."
     },
+    {"set_voice", (PyCFunction)set_voice, METH_VARARGS,
+     "set_voice(voice_config, model_path) -> Load the model in preparation for synthesis."
+    },
     {"set_espeak_voice_by_name", (PyCFunction)set_espeak_voice_by_name, METH_O,
      "set_espeak_voice_by_name(name) -> Set the voice to be used to phonemize text"
     },
@@ -117,6 +191,8 @@ cleanup_module(void*) {
         voice_set = false;
         espeak_Terminate();
     }
+    current_phoneme_id_map.clear();
+    session.reset();
 }
 
 CALIBRE_MODINIT_FUNC PyInit_piper(void) {
diff --git a/src/calibre/utils/tts/piper.py b/src/calibre/utils/tts/piper.py
index 88610e3f37..f51d584bc8 100644
--- a/src/calibre/utils/tts/piper.py
+++ b/src/calibre/utils/tts/piper.py
@@ -1,6 +1,63 @@
 #!/usr/bin/env python
 # License: GPLv3 Copyright: 2025, Kovid Goyal <kovid at kovidgoyal.net>
 
+import json
+from typing import Any, NamedTuple
+
 import calibre_extensions.piper as piper
 
-piper
+DEFAULT_LENGTH_SCALE = 1.0  # used when the voice config has no inference.length_scale
+DEFAULT_NOISE_SCALE = 0.667  # used when the voice config has no inference.noise_scale
+DEFAULT_NOISE_W_SCALE = 0.8  # used when the voice config has no inference.noise_w
+
+
+class VoiceConfig(NamedTuple):  # voice settings, read attribute by attribute in piper.set_voice()
+    espeak_voice_name: str  # espeak voice used to phonemize text, defaults to en-us
+    sample_rate: int  # audio.sample_rate from the voice config, defaults to 22050
+    phoneme_id_map: dict[int, list[int]]  # code point of a phoneme -> ids listed for it in the voice config
+    length_scale: float  # inference.length_scale, multiplied by the caller's factor in set_voice()
+    noise_scale: float  # inference.noise_scale
+    noise_w: float  # inference.noise_w
+    num_speakers: int  # num_speakers from the voice config, defaults to 1
+
+    sentence_delay: float = 0  # not present in the voice config, filled in by set_voice()
+
+
+def translate_voice_config(x: Any) -> VoiceConfig:
+    '''
+    Build a VoiceConfig from x, the parsed JSON config of a voice. Settings
+    missing from x get default values.
+    '''
+    phoneme_id_map: dict[int, list[int]] = {}
+    for s, pids in x.get('phoneme_id_map', {}).items():
+        if s:
+            # The config maps each phoneme to a list of ids, keep the ids flat
+            phoneme_id_map.setdefault(ord(s[0]), []).extend(pids if isinstance(pids, list) else (pids,))
+    inf = x.get('inference', {})  # the inference section is optional
+    audio = x.get('audio', {})
+
+    return VoiceConfig(
+        espeak_voice_name=x.get('espeak', {}).get('voice') or 'en-us',
+        sample_rate=int(audio.get('sample_rate', 22050)),
+        phoneme_id_map=phoneme_id_map,
+        length_scale=float(inf.get('length_scale', DEFAULT_LENGTH_SCALE)),
+        noise_scale=float(inf.get('noise_scale', DEFAULT_NOISE_SCALE)),
+        noise_w=float(inf.get('noise_w', DEFAULT_NOISE_W_SCALE)),
+        num_speakers=int(x.get('num_speakers', 1)),
+    )
+
+
+def load_voice_config(path: str) -> VoiceConfig:  # path: the JSON config file of a voice
+    with open(path, 'rb') as f:
+        return translate_voice_config(json.load(f))
+
+
+def espeak_data_dir() -> str:  # '' makes piper.initialize() use the default espeak data location
+    return ''   # TODO: get the correct path when using frozen builds
+
+
+def set_voice(config_path: str, model_path:str, length_scale_multiplier: float, sentence_delay: float) -> None:
+    piper.initialize(espeak_data_dir())  # must be called before any other function of the extension
+    cfg = load_voice_config(config_path)
+    cfg = cfg._replace(sentence_delay=sentence_delay, length_scale=cfg.length_scale * length_scale_multiplier)  # apply the caller's settings
+    piper.set_voice(cfg, model_path)  # sets the espeak voice and loads the ONNX model