From 661499e391cc212964fe5ab1bb027e467b0ee0bd Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 29 Jul 2025 13:14:40 +0530
Subject: [PATCH] Move piper backend to use new code

---
 src/calibre/gui2/tts/piper.py  | 384 +++++++--------------------------
 src/calibre/gui2/tts/types.py  |  13 +-
 src/calibre/test_build.py      |   9 +-
 src/calibre/utils/tts/piper.py |  21 +-
 4 files changed, 105 insertions(+), 322 deletions(-)
diff --git a/src/calibre/gui2/tts/piper.py b/src/calibre/gui2/tts/piper.py
index 3033ba6e92..ed25a33c38 100644
--- a/src/calibre/gui2/tts/piper.py
+++ b/src/calibre/gui2/tts/piper.py
@@ -2,44 +2,26 @@
 # License: GPLv3 Copyright: 2024, Kovid Goyal <kovid at kovidgoyal.net>
 
 import atexit
-import io
 import json
 import os
-import re
 import sys
 from collections import deque
 from collections.abc import Iterable, Iterator
 from contextlib import suppress
 from dataclasses import dataclass
-from functools import lru_cache
 from itertools import count
 from time import monotonic
-from typing import BinaryIO
 
-from qt.core import (
-    QAudio,
-    QAudioFormat,
-    QAudioSink,
-    QByteArray,
-    QIODevice,
-    QIODeviceBase,
-    QMediaDevices,
-    QObject,
-    QProcess,
-    Qt,
-    QTextToSpeech,
-    QWidget,
-    pyqtSignal,
-    sip,
-)
+from qt.core import QAudio, QAudioFormat, QAudioSink, QByteArray, QIODevice, QIODeviceBase, QMediaDevices, QObject, Qt, QTextToSpeech, QWidget, pyqtSignal, sip
 
-from calibre.constants import cache_dir, is_debugging, iswindows, piper_cmdline
+from calibre.constants import cache_dir, is_debugging, iswindows
 from calibre.gui2 import error_dialog
 from calibre.gui2.tts.types import TTS_EMBEDED_CONFIG, EngineSpecificSettings, Quality, TTSBackend, Voice, widget_parent
 from calibre.spell.break_iterator import PARAGRAPH_SEPARATOR, split_into_sentences_for_tts
 from calibre.utils.filenames import ascii_text
 from calibre.utils.localization import canonicalize_lang, get_lang
 from calibre.utils.resources import get_path as P
+from calibre.utils.tts.piper import SynthesisResult, global_piper_instance, global_piper_instance_if_exists, play_pcm_data
 
 HIGH_QUALITY_SAMPLE_RATE = 22050
 
@@ -60,20 +42,23 @@ def audio_format(audio_rate: int = HIGH_QUALITY_SAMPLE_RATE) -> QAudioFormat:
     return fmt
 
 
-def piper_process_metadata(model_path, config_path, s: EngineSpecificSettings, voice: Voice) -> tuple[int, list[str]]:
+def piper_process_metadata(callback, model_path, config_path, s: EngineSpecificSettings, voice: Voice) -> int:
+    '''Select the given voice on the global piper instance.
+
+    Caches the parsed JSON from config_path in voice.engine_data['metadata']
+    on first use, then calls set_voice() with callback, the config and model
+    paths, s.rate as the length scale multiplier and s.sentence_delay.
+
+    Returns the value returned by set_voice(); callers in this file use it as
+    the audio sample rate. Raises Exception if model_path is empty.
+    '''
     if not model_path:
         raise Exception('Could not download voice data')
     if 'metadata' not in voice.engine_data:
         with open(config_path) as f:
             voice.engine_data['metadata'] = json.load(f)
-    audio_rate = voice.engine_data['metadata']['audio']['sample_rate']
-    length_scale = max(0.1, 1 + -1 * s.rate)  # maps -1 to 1 to 2 to 0.1
-    cmdline = list(piper_cmdline()) + [
-        '--model', model_path, '--config', config_path, '--output-raw', '--json-input',
-        '--sentence-silence', str(s.sentence_delay), '--length_scale', str(length_scale)]
-    if is_debugging():
-        cmdline.append('--debug')
-    return audio_rate, cmdline
+    return global_piper_instance().set_voice(
+        callback, config_path, model_path, length_scale_multiplier=s.rate, sentence_delay=s.sentence_delay)
 
 
 def piper_cache_dir() -> str:
@@ -149,8 +125,7 @@ class Utterance:
     id: int
     start: int
     length: int
-    payload_size: int
-    left_to_write: QByteArray
+    sentence: str
     audio_data: QByteArray
 
     started: bool = False
@@ -241,47 +216,19 @@ class UtteranceAudioQueue(QIODevice):
 
 def split_into_utterances(text: str, counter: count, lang: str = 'en'):
     for start, sentence in split_into_sentences_for_tts(text, lang):
-        payload = json.dumps({'text': sentence}).encode('utf-8')
-        ba = QByteArray()
-        ba.reserve(len(payload) + 1)
-        ba.append(payload)
-        ba.append(UTTERANCE_SEPARATOR)
-        u = Utterance(id=next(counter), payload_size=len(ba), audio_data=QByteArray(),
-                        left_to_write=ba, start=start, length=len(sentence))
+        u = Utterance(id=next(counter), audio_data=QByteArray(), sentence=sentence, start=start, length=len(sentence))
         debug(f'Utterance created {u.id} {start=}: {sentence!r}')
         yield u
 
 
-@lru_cache(2)
-def stderr_pat():
-    return re.compile(rb'\[piper\] \[([a-zA-Z0-9_]+?)\] (.+)')
-
-
-def detect_end_of_data(data: bytes, callback):
-    lines = data.split(b'\n')
-    for line in lines[:-1]:
-        if m := stderr_pat().search(line):
-            which, payload = m.group(1), m.group(2)
-            if which == b'info':
-                debug(f'[piper-info] {payload.decode("utf-8", "replace")}')
-                if payload.startswith(b'Real-time factor:'):
-                    callback(True, None)
-            elif which == b'error':
-                callback(False, payload.decode('utf-8', 'replace'))
-            elif which == b'debug':
-                debug(f'[piper-debug] {payload.decode("utf-8", "replace")}')
-    return lines[-1]
-
-
 class Piper(TTSBackend):
 
     engine_name: str = 'piper'
     filler_char: str = PARAGRAPH_SEPARATOR
-    _synthesis_done = pyqtSignal()
+    _synthesis_done = pyqtSignal(object, object, object)
 
     def __init__(self, engine_name: str = '', parent: QObject | None = None):
         super().__init__(parent)
-        self._process: QProcess | None = None
         self._audio_sink: QAudioSink | None = None
 
         self._current_voice: Voice | None = None
@@ -296,7 +243,7 @@ class Piper(TTSBackend):
         self._errors_from_piper: list[str] = []
         self._pending_stderr_data = b''
 
-        self._synthesis_done.connect(self._utterance_synthesized, type=Qt.ConnectionType.QueuedConnection)
+        self._synthesis_done.connect(self._on_synthesis_done, type=Qt.ConnectionType.QueuedConnection)
         atexit.register(self.shutdown)
 
     @property
@@ -304,27 +251,16 @@ class Piper(TTSBackend):
         self._load_voice_metadata()
         return {'': self._voices}
 
-    def _wait_for_process_to_start(self) -> bool:
-        if not self.process.waitForStarted():
-            cmdline = [self.process.program()] + self.process.arguments()
-            if self.process.error() is QProcess.ProcessError.TimedOut:
-                self._set_error(f'Timed out waiting for piper process {cmdline} to start')
-            else:
-                self._set_error(f'Failed to start piper process: {cmdline}')
-            return False
-        return True
-
     def say(self, text: str) -> None:
         if self._last_error:
             return
         self.stop()
-        if not self._wait_for_process_to_start():
-            return
+        self.ensure_started()
         lang = 'en'
         if self._current_voice and self._current_voice.language_code:
             lang = self._current_voice.language_code
         self._utterances_being_synthesized.extend(split_into_utterances(text, self._utterance_counter, lang))
-        self._write_current_utterance()
+        self._queue_current_utterance()
 
     def pause(self) -> None:
         if self._audio_sink is not None:
@@ -335,29 +271,27 @@ class Piper(TTSBackend):
             self._audio_sink.resume()
 
     def stop(self) -> None:
-        if self._process is not None:
+        if self._audio_sink is not None:
             if self._state is not QTextToSpeech.State.Ready or self._utterances_being_synthesized or self._utterances_being_spoken:
                 self.shutdown()
-                # We cannot re-create self.process here as that will cause the
+                # We cannot call ensure_started() here as that will cause the
                 # audio device to go to active state which will cause a
                 # speaking event to be generated
 
     def shutdown(self) -> None:
-        if self._process is not None:
+        if self._audio_sink is not None:
+            gp = global_piper_instance_if_exists()
+            if gp is not None:
+                gp.cancel()
             self._audio_sink.stateChanged.disconnect()
-            self._process.readyReadStandardError.disconnect()
-            self._process.bytesWritten.disconnect()
-            self._process.readyReadStandardOutput.disconnect()
-            self._process.stateChanged.disconnect()
-            self._process.kill()
-            self._process.waitForFinished(-1)
             # this dance is needed otherwise stop() is very slow on Linux
             self._audio_sink.suspend()
             self._audio_sink.reset()
             self._audio_sink.stop()
             sip.delete(self._audio_sink)
-            sip.delete(self._process)
-            self._process = self._audio_sink = None
+            self._audio_sink = None
+            self._utterances_being_synthesized.clear()
+            self._utterances_being_spoken.clear()
             self._set_state(QTextToSpeech.State.Ready)
 
     def reload_after_configure(self) -> None:
@@ -379,9 +313,8 @@ class Piper(TTSBackend):
         self._last_error = msg
         self._set_state(QTextToSpeech.State.Error)
 
-    @property
-    def process(self) -> QProcess:
-        if self._process is None:
+    def ensure_started(self) -> None:
+        if self._audio_sink is None:
             model_path = config_path = ''
             try:
                 self._load_voice_metadata()
@@ -394,18 +327,9 @@ class Piper(TTSBackend):
             self._utterances_being_spoken.clear()
             self._utterances_being_synthesized.clear()
             self._errors_from_piper.clear()
-            self._process = QProcess(self)
-            self._pending_stderr_data = b''
             self._set_state(QTextToSpeech.State.Ready)
 
-            audio_rate, cmdline = piper_process_metadata(model_path, config_path, s, voice)
-            self._process.setProgram(cmdline[0])
-            self._process.setArguments(cmdline[1:])
-            debug('Running piper:', cmdline)
-            self._process.readyReadStandardError.connect(self.piper_stderr_available)
-            self._process.readyReadStandardOutput.connect(self.piper_stdout_available)
-            self._process.bytesWritten.connect(self.bytes_written)
-            self._process.stateChanged.connect(self._update_status)
+            audio_rate = piper_process_metadata(self.on_synthesis_done, model_path, config_path, s, voice)
             fmt = audio_format(audio_rate)
             dev = None
             if s.audio_device_id:
@@ -420,47 +344,53 @@ class Piper(TTSBackend):
             if s.volume is not None:
                 self._audio_sink.setVolume(s.volume)
             self._audio_sink.stateChanged.connect(self._utterances_being_spoken.audio_state_changed)
-            self._process.start()
             self._audio_sink.start(self._utterances_being_spoken)
-        return self._process
 
-    def piper_stdout_available(self) -> None:
-        if self._utterances_being_synthesized:
+    def on_synthesis_done(self, sr, err, tb):
+        '''Callback passed to piper_process_metadata() by ensure_started().
+
+        Only re-emits its arguments through the _synthesis_done signal, which
+        is connected to _on_synthesis_done() with a queued connection.
+        '''
+        self._synthesis_done.emit(sr, err, tb)
+
+    def _on_synthesis_done(self, sr: SynthesisResult, err: Exception | None, tb: str):
+        '''Handle one synthesis result delivered via the _synthesis_done signal.
+
+        :param sr: The synthesis result, only read when err is None.
+        :param err: The failure to report, or None on success.
+        :param tb: Traceback text recorded together with err.
+        '''
+        if self._audio_sink is None:
+            # No audio sink, e.g. shutdown() ran after this result was emitted
+            return
+        if err is not None:
+            self._errors_from_piper.append(str(err))
+            self._errors_from_piper.append(tb)
+        elif self._utterances_being_synthesized:
+            # The queue can legitimately be empty: the signal is queued, so a
+            # result emitted before stop() may only be delivered after say()
+            # has re-created the audio sink without queueing any utterance.
+            # Indexing [0] unconditionally raised IndexError in that case.
             u = self._utterances_being_synthesized[0]
-            while True:
-                ba = self.process.readAll()
-                if not len(ba):
-                    break
-                debug('Synthesized data read:', len(ba), 'bytes')
-                u.audio_data.append(ba)
-
-    def piper_stderr_available(self) -> None:
-        if self._process is not None:
-            def callback(ok, payload):
-                if ok:
-                    if self._utterances_being_synthesized:
-                        self._synthesis_done.emit()
+            if u.id == sr.utterance_id:
+                u.audio_data.append(sr.audio_data)
+                if sr.is_last:
+                    debug(f'Utterance {u.id} got {len(sr.audio_data)} bytes of audio data from piper')
+                    self._utterances_being_synthesized.popleft()
+                    u.synthesized = True
+                    if len(u.audio_data):
+                        self._utterances_being_spoken.add_utterance(u)
                 else:
-                    self._errors_from_piper.append(payload.decode('utf-8', 'replace'))
-            data = self._pending_stderr_data + bytes(self._process.readAllStandardError())
-            self._pending_stderr_data = detect_end_of_data(data, callback)
-
-    def _utterance_synthesized(self):
-        self.piper_stdout_available()  # just in case
-        u = self._utterances_being_synthesized.popleft()
-        u.synthesized = True
-        debug(f'Utterance {u.id} got {len(u.audio_data)} bytes of audio data from piper')
-        if len(u.audio_data):
-            self._utterances_being_spoken.add_utterance(u)
-        self._write_current_utterance()
+                    debug(f'Synthesized data read for utterance {u.id}: {len(sr.audio_data)} bytes')
+        self._queue_current_utterance()
         self._update_status()
 
     def _update_status(self):
-        if self._process is not None and self._process.state() is QProcess.ProcessState.NotRunning:
-            if self._process.exitStatus() is not QProcess.ExitStatus.NormalExit or self._process.exitCode():
-                m = '\n'.join(self._errors_from_piper)
-                self._set_error(f'piper process failed with exit code: {self._process.exitCode()} and error messages: {m}')
-                return
+        if self._errors_from_piper:
+            m = '\n'.join(self._errors_from_piper)
+            self._set_error(f'piper failed with error: {m}')
+            return
         if self._state is QTextToSpeech.State.Error:
             return
         state = self._utterances_being_spoken.audio_state
@@ -478,21 +392,18 @@ class Piper(TTSBackend):
             if not self._utterances_being_synthesized and not self._utterances_being_spoken:
                 self._set_state(QTextToSpeech.State.Ready)
 
-    def bytes_written(self, count: int) -> None:
-        self._write_current_utterance()
-
-    def _write_current_utterance(self) -> None:
+    def _queue_current_utterance(self) -> None:
+        '''Submit the utterance at the head of the synthesis queue to piper, at most once.'''
         if self._utterances_being_synthesized:
             u = self._utterances_being_synthesized[0]
-            while len(u.left_to_write):
-                written = self.process.write(u.left_to_write)
-                if written < 0:
-                    self._set_error(f'Failed to write to piper process with error: {self.process.errorString()}')
-                    break
-                if not u.started and written:
-                    u.started = True
-                    debug(f'Utterance {u.id} synthesis started')
-                u.left_to_write = u.left_to_write.last(len(u.left_to_write) - written)
+            # _on_synthesis_done() calls this after every result, including
+            # partial chunks and results whose id does not match the head
+            # utterance. Without the started guard the same sentence would be
+            # submitted for synthesis again each time.
+            if not u.started:
+                global_piper_instance().synthesize(u.id, u.sentence)
+                u.started = True
+                debug(f'Utterance {u.id} synthesis queued')
 
     def audio_sink_state_changed(self, state: QAudio.State) -> None:
         self._update_status()
@@ -564,7 +469,7 @@ class PiperEmbedded:
         lang = get_lang()
         lang = canonicalize_lang(lang) or lang
         self._default_voice = self._voice_for_lang.get(lang) or self._voice_for_lang['eng']
-        self._current_voice = self._process = self._pipe_reader = None
+        self._current_voice = None
         self._current_audio_rate = 0
 
     def resolve_voice(self, lang: str, voice_name: str) -> Voice:
@@ -586,42 +491,27 @@ class PiperEmbedded:
         if voice is not self._current_voice:
             self._current_voice = voice
             self.shutdown()
-        self.ensure_process_started()
-        piper_done, errors_from_piper = [], []
+        self.ensure_started()
         needs_conversion = sample_rate != self._current_audio_rate
         if needs_conversion:
             from calibre_extensions.ffmpeg import resample_raw_audio_16bit
 
-        def callback(ok, payload):
-            if ok:
-                piper_done.append(True)
-            else:
-                errors_from_piper.append(payload.decode('utf-8', 'replace'))
-
         for text in texts:
             text = text.strip()
             if not text:
                 yield b'', 0.
                 continue
-            payload = json.dumps({'text': text}).encode('utf-8')
-            self._process.stdin.write(payload)
-            self._process.stdin.write(UTTERANCE_SEPARATOR)
-            self._process.stdin.flush()
-            stderr_data = b''
-            buf = io.BytesIO()
-            piper_done, errors_from_piper = [], []
+            all_data = []
+            global_piper_instance().synthesize(1, text)
+            while True:
+                sr, exc, tb = self._queue.get()
+                if exc is not None:
+                    raise Exception(f'failed to synthesize text to audio with error: {exc} and traceback: {tb}')
+                all_data.append(sr.audio_data)
+                if sr.is_last:
+                    break
 
-            def stderr_callback(data: bytes) -> bool:
-                nonlocal stderr_data
-                stderr_data = detect_end_of_data(stderr_data + data, callback)
-                return not piper_done
-
-            try:
-                self._pipe_reader(buf.write, stderr_callback)
-            except Exception as e:
-                raise Exception(f'Reading output from piper process failed with error: {e} and STDERR: ' + '\n'.join(errors_from_piper))
-
-            raw_data = buf.getvalue()
+            raw_data = b''.join(all_data)
             if needs_conversion:
                 raw_data = resample_raw_audio_16bit(raw_data, self._current_audio_rate, sample_rate)
             yield raw_data, duration_of_raw_audio_data(raw_data, sample_rate)
@@ -635,126 +525,23 @@ class PiperEmbedded:
         return True
 
     def shutdown(self):
-        if self._process is not None:
-            self._pipe_reader.close()
-            self._pipe_reader = None
-            self._process.stdin.close()
-            self._process.stdout.close()
-            self._process.stderr.close()
-            self._process.kill()
-            self._process.wait()
-            self._process = None
+        if self._current_audio_rate != 0:
+            gp = global_piper_instance_if_exists()
+            if gp is not None:
+                gp.cancel()
+            self._current_audio_rate = 0
     __del__ = shutdown
 
-    def ensure_process_started(self):
-        if self._process is not None:
-            return
-        model_path, config_path = download_voice(self._current_voice, headless=True)
-        self._current_audio_rate, cmdline = piper_process_metadata(model_path, config_path, self._embedded_settings, self._current_voice)
-        import subprocess
-        self._process = subprocess.Popen(cmdline, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
-        self._pipe_reader = (ThreadedPipeReader if iswindows else PipeReader)(self._process.stdout, self._process.stderr)
+    def on_synthesis_done(self, sr: SynthesisResult, exc: Exception, tb: str) -> None:
+        self._queue.put((sr, exc, tb))
 
-
-class PipeReader:
-
-    TIMEOUT = 30.  # seconds
-
-    def __init__(self, stdout: BinaryIO, stderr: BinaryIO):
-        self.stdout_fd = stdout.fileno()
-        self.stderr_fd = stderr.fileno()
-        os.set_blocking(self.stdout_fd, False)
-        os.set_blocking(self.stderr_fd, False)
-
-    def close(self):
-        self.stderr_fd = self.stdout_fd = -1
-
-    def __call__(self, stdout_callback, stderr_callback):
-        from select import select
-        out, err = self.stdout_fd, self.stderr_fd
-        readers = out, err
-        buf = memoryview(bytearray(io.DEFAULT_BUFFER_SIZE))
-
-        def readall(fd: int) -> bytes:
-            output = io.BytesIO()
-            while True:
-                try:
-                    num = os.readv(fd, (buf,))
-                except BlockingIOError:
-                    break
-                else:
-                    output.write(buf[:num])
-            return output.getvalue()
-
-        def readinto(fd: int, callback) -> None:
-            while True:
-                try:
-                    num = os.readv(fd, (buf,))
-                except BlockingIOError:
-                    break
-                else:
-                    callback(buf[:num])
-
-        while True:
-            r, _, _ = select(readers, (), (), self.TIMEOUT)
-            if not r:
-                raise TimeoutError('Timed out waiting for output from piper process')
-            if out in r:
-                readinto(out, stdout_callback)
-            if err in r:
-                data = readall(err)
-                if not stderr_callback(data):
-                    # In case there is new data written to stdout
-                    readinto(out, stdout_callback)
-                    break
-
-
-class ThreadedPipeReader(PipeReader):
-
-    def __init__(self, stdout: BinaryIO, stderr: BinaryIO):
-        from queue import Queue
-        from threading import Event, Thread
-        self.shutting_down = Event()
-        self.queue = Queue()
-        Thread(target=self._reader, args=(stdout.fileno(), True), daemon=True).start()
-        Thread(target=self._reader, args=(stderr.fileno(), False), daemon=True).start()
-
-    def close(self):
-        self.shutting_down.set()
-
-    def __call__(self, stdout_callback, stderr_callback):
-        from queue import Empty
-        while True:
-            data, is_stdout, err = self.queue.get(True, self.TIMEOUT)
-            if err is not None:
-                raise err
-            if data:
-                if is_stdout:
-                    stdout_callback(data)
-                else:
-                    if not stderr_callback(data):
-                        # in case more data was written to stdout
-                        while True:
-                            try:
-                                data, is_stdout, err = self.queue.get_nowait()
-                            except Empty:
-                                break
-                            if err is not None:
-                                raise err
-                            if is_stdout:
-                                stdout_callback(data)
-                        break
-
-    def _reader(self, pipe_fd: int, is_stdout: bool):
-        while not self.shutting_down.is_set():
-            try:
-                data = os.read(pipe_fd, io.DEFAULT_BUFFER_SIZE)
-            except OSError as e:
-                if not self.shutting_down.is_set():
-                    self.queue.put((b'', is_stdout, e))
-                break
-            else:
-                self.queue.put((data, is_stdout, None))
+    def ensure_started(self):
+        if self._current_audio_rate == 0:
+            from queue import Queue
+            model_path, config_path = download_voice(self._current_voice, headless=True)
+            self._queue = Queue()
+            self._current_audio_rate = piper_process_metadata(
+                    self.on_synthesis_done, model_path, config_path, self._embedded_settings, self._current_voice)
 
 
 def duration_of_raw_audio_data(data: bytes, sample_rate: int = HIGH_QUALITY_SAMPLE_RATE, bytes_per_sample: int = 2, num_channels: int = 1) -> float:
@@ -765,25 +552,14 @@ def duration_of_raw_audio_data(data: bytes, sample_rate: int = HIGH_QUALITY_SAMP
 
 # develop {{{
 def develop_embedded():
-    import subprocess
-
-    from calibre.utils.speedups import ReadOnlyFileBuffer
-    from calibre_extensions.ffmpeg import transcode_single_audio_stream, wav_header_for_pcm_data
     p = PiperEmbedded()
-    all_data = [b'']
-    sz = 0
+    all_data = []
     for data, duration in p.text_to_raw_audio_data((
         'Hello, good day to you.', 'This is the second sentence.', 'This is the final sentence.'
     )):
         print(f'{duration=} {len(data)=}')
         all_data.append(data)
-        sz += len(data)
-    all_data[0] = wav_header_for_pcm_data(sz, HIGH_QUALITY_SAMPLE_RATE)
-    wav = ReadOnlyFileBuffer(b''.join(all_data), name='tts.wav')
-    m4a = io.BytesIO()
-    m4a.name = 'tts.m4a'
-    transcode_single_audio_stream(wav, m4a)
-    subprocess.run(['mpv', '-'], input=m4a.getvalue())
+    play_pcm_data(b''.join(all_data), HIGH_QUALITY_SAMPLE_RATE)
 
 
 def develop():
diff --git a/src/calibre/gui2/tts/types.py b/src/calibre/gui2/tts/types.py
index fd1f8548b2..40a2aad27e 100644
--- a/src/calibre/gui2/tts/types.py
+++ b/src/calibre/gui2/tts/types.py
@@ -9,7 +9,7 @@ from typing import Literal, NamedTuple
 
 from qt.core import QApplication, QLocale, QObject, QTextToSpeech, QVoice, QWidget, pyqtSignal
 
-from calibre.constants import islinux, ismacos, iswindows, piper_cmdline
+from calibre.constants import islinux, ismacos, iswindows
 from calibre.utils.config import JSONConfig
 from calibre.utils.config_base import tweaks
 from calibre.utils.localization import canonicalize_lang
@@ -234,12 +234,11 @@ def available_engines() -> dict[str, EngineMetadata]:
             ), True)
         elif x == 'speechd':
             continue
-    if piper_cmdline():
-        ans['piper'] = EngineMetadata('piper', _('The Piper Neural Engine'), _(
-            'The "piper" engine can track the currently spoken sentence on screen. It uses a neural network '
-            'for natural sounding voices. The neural network is run locally on your computer, it is fairly resource intensive to run.'
-        ), TrackingCapability.Sentence, can_change_pitch=False, voices_have_quality_metadata=True, has_managed_voices=True,
-        has_sentence_delay=True)
+    ans['piper'] = EngineMetadata('piper', _('The Piper Neural Engine'), _(
+        'The "piper" engine can track the currently spoken sentence on screen. It uses a neural network '
+        'for natural sounding voices. The neural network is run locally on your computer, it is fairly resource intensive to run.'
+    ), TrackingCapability.Sentence, can_change_pitch=False, voices_have_quality_metadata=True, has_managed_voices=True,
+    has_sentence_delay=True)
     if islinux:
         try:
             from speechd.paths import SPD_SPAWN_CMD
diff --git a/src/calibre/test_build.py b/src/calibre/test_build.py
index 8a23df108e..c825cb30ad 100644
--- a/src/calibre/test_build.py
+++ b/src/calibre/test_build.py
@@ -123,14 +123,9 @@ class BuildTest(unittest.TestCase):
         from speechd.client import SSIPClient
         del SSIPClient
 
-    @unittest.skipIf('SKIP_SPEECH_TESTS' in os.environ, 'Speech support is opted out')
     def test_piper(self):
-        import subprocess
-
-        from calibre.constants import piper_cmdline
-        self.assertTrue(piper_cmdline())
-        raw = subprocess.check_output(piper_cmdline() + ('-h',), stderr=subprocess.STDOUT).decode()
-        self.assertIn('--sentence_silence', raw)
+        from calibre.utils.tts.piper import simple_test
+        simple_test()
 
     def test_zeroconf(self):
         import ifaddr
diff --git a/src/calibre/utils/tts/piper.py b/src/calibre/utils/tts/piper.py
index aff3cd9c7e..95e7af944a 100644
--- a/src/calibre/utils/tts/piper.py
+++ b/src/calibre/utils/tts/piper.py
@@ -83,6 +83,19 @@ class SynthesisResult(NamedTuple):
     is_last: bool
 
 
+def simple_test():
+    '''Smoke test of the piper module, called by test_piper() in calibre.test_build.
+
+    Initializes piper with the espeak data directory, selects the en-us
+    espeak voice and phonemizes a short phrase. Raises ValueError if
+    phonemize() returns a falsy result.
+    '''
+    piper.initialize(espeak_data_dir())
+    piper.set_espeak_voice_by_name('en-us')
+    if not piper.phonemize('simple test'):
+        raise ValueError('No phonemes returned by phonemize()')
+
+
 class Piper(Thread):
 
     def __init__(self):
@@ -172,6 +179,15 @@ def global_piper_instance() -> Piper:
     return _global_piper_instance
 
 
+def global_piper_instance_if_exists() -> Piper | None:
+    '''Return the current value of the module-level _global_piper_instance.
+
+    Performs no other work. The callers in calibre.gui2.tts.piper treat a
+    None result as nothing to cancel.
+    '''
+    return _global_piper_instance
+
+
 def play_wav_data(wav_data: bytes):
     from qt.core import QAudioOutput, QBuffer, QByteArray, QCoreApplication, QIODevice, QMediaPlayer, QUrl
     app = QCoreApplication([])
@@ -190,6 +201,12 @@ def play_wav_data(wav_data: bytes):
     app.exec()
 
 
+def play_pcm_data(pcm_data, sample_rate):
+    '''Play raw PCM audio by prepending a WAV header and passing the result to play_wav_data().'''
+    from calibre_extensions.ffmpeg import wav_header_for_pcm_data
+    play_wav_data(wav_header_for_pcm_data(len(pcm_data), sample_rate) + pcm_data)
+
+
 def develop():
     from calibre.gui2.tts.piper import piper_cache_dir
     p = global_piper_instance()
@@ -210,10 +226,7 @@ def develop():
         print(f'Got {len(sr.audio_data)} bytes of audio data', flush=True)
         if sr.is_last:
             break
-    from calibre_extensions.ffmpeg import wav_header_for_pcm_data
-    pcm_data = b''.join(all_data)
-    wav_data = wav_header_for_pcm_data(len(pcm_data), sample_rate) + pcm_data
-    play_wav_data(wav_data)
+    play_pcm_data(b''.join(all_data), sample_rate)
 
 
 if __name__ == '__main__':