mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
More work on piper TTS
We now synthesize text and buffer the audio data continuously for higher performance.
This commit is contained in:
parent
376cbd9ed5
commit
316755aa1c
@ -2,20 +2,42 @@
|
||||
# License: GPLv3 Copyright: 2024, Kovid Goyal <kovid at kovidgoyal.net>
|
||||
|
||||
import atexit
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from collections import deque
|
||||
from dataclasses import dataclass
|
||||
from functools import lru_cache
|
||||
from itertools import count
|
||||
from time import monotonic
|
||||
|
||||
from qt.core import QApplication, QAudio, QAudioFormat, QAudioSink, QByteArray, QObject, QProcess, QTextToSpeech, sip
|
||||
from qt.core import QApplication, QAudio, QAudioFormat, QAudioSink, QByteArray, QIODevice, QIODeviceBase, QObject, QProcess, Qt, QTextToSpeech, pyqtSignal, sip
|
||||
|
||||
from calibre.constants import bundled_binaries_dir, iswindows
|
||||
from calibre.constants import bundled_binaries_dir, get_windows_username, is_debugging, iswindows
|
||||
from calibre.gui2.tts2.types import TTSBackend
|
||||
from calibre.ptempfile import base_dir
|
||||
from calibre.spell.break_iterator import sentence_positions
|
||||
|
||||
|
||||
@lru_cache(2)
def sentinel_path() -> str:
    '''Return the path of a sentinel file unique to this process and user,
    located in calibre's temporary base directory.'''
    # Include the user identity so that concurrent calibre instances run by
    # different users on the same machine do not collide.
    user_part = get_windows_username() if iswindows else os.geteuid()
    return os.path.join(base_dir(), f'piper-sentinel-{os.getpid()}-{user_part}')
|
||||
def debug(*a, **kw):
    '''Print a timestamped debug message, but only when calibre is running in
    debug mode. Timestamps are seconds since the first call to debug().'''
    if is_debugging():
        if not hasattr(debug, 'first'):
            # Lazily record the epoch for relative timestamps
            debug.first = monotonic()
        # Idiom fix: use setdefault() instead of kw['end'] = kw.get('end', ...).
        # \r\n because the terminal may be in raw mode during TTS development.
        kw.setdefault('end', '\r\n')
        print(f'[{monotonic() - debug.first:.2f}]', *a, **kw)
|
||||
@lru_cache(2)
|
||||
def piper_cmdline() -> tuple[str, ...]:
|
||||
ext = '.exe' if iswindows else ''
|
||||
@ -31,30 +53,109 @@ def piper_cmdline() -> tuple[str, ...]:
|
||||
|
||||
@dataclass
class Utterance:
    # Unique, monotonically increasing id from Piper's utterance counter
    id: int
    # Offset of this utterance's sentence in the original text
    start: int
    # Length (in characters) of the trimmed sentence
    length: int
    # Total number of bytes that will be written to the piper process
    payload_size: int
    # JSON payload (plus trailing separator) not yet written to piper's stdin
    left_to_write: QByteArray
    # Raw audio produced by piper for this utterance, appended to incrementally
    audio_data: QByteArray

    # True once at least one byte of the payload has been written to piper
    started: bool = False
    # True once piper reports synthesis of this utterance as finished
    synthesized: bool = False
||||
# Unicode paragraph separator, substituted for blank-line paragraph breaks
PARAGRAPH_SEPARATOR = '\u2029'
# piper reads one payload per line from stdin, so newline ends an utterance
UTTERANCE_SEPARATOR = b'\n'
||||
def split_into_utterances(text: str, lang: str = 'en'):
|
||||
class UtteranceAudioQueue(QIODevice):

    '''A sequential, read-only QIODevice from which a QAudioSink pulls PCM
    data. Audio for synthesized utterances is queued here and fed to the sink
    one utterance at a time.'''

    saying = pyqtSignal(int, int)  # (start, length) of the utterance whose playback just began
    update_status = pyqtSignal()  # emitted when the sink state changes; consumer re-derives TTS state

    def __init__(self, parent: QObject | None = None):
        super().__init__(parent)
        self.utterances: deque[Utterance] = deque()
        self.current_audio_data = QByteArray()
        # Mirrors the QAudioSink state, updated via audio_state_changed()
        self.audio_state = QAudio.State.IdleState
        self.utterance_being_played: Utterance | None = None
        self.open(QIODeviceBase.OpenModeFlag.ReadOnly)

    def audio_state_changed(self, s: QAudio.State) -> None:
        # Slot for QAudioSink.stateChanged: track the sink's state and advance
        # to the next queued utterance when the current one finishes playing.
        debug('Audio state:', s)
        prev_state, self.audio_state = self.audio_state, s
        if s == prev_state:
            return
        if s == QAudio.State.IdleState and prev_state == QAudio.State.ActiveState:
            # Active -> Idle means the sink ran out of data, i.e. the current
            # utterance's audio has been played to completion
            if self.utterance_being_played:
                debug(f'Utterance {self.utterance_being_played.id} audio output finished')
            self.utterance_being_played = None
            self.start_utterance()
        self.update_status.emit()

    def add_utterance(self, u: Utterance) -> None:
        # Queue a synthesized utterance; start it at once if nothing is playing
        self.utterances.append(u)
        if not self.utterance_being_played:
            self.start_utterance()

    def start_utterance(self):
        # Begin playback of the next queued utterance, if any
        if self.utterances:
            u = self.utterances.popleft()
            self.current_audio_data = u.audio_data
            self.utterance_being_played = u
            # Wake the sink so it pulls the newly available data
            self.readyRead.emit()
            self.saying.emit(u.start, u.length)

    def close(self):
        self.utterances.clear()
        self.current_audio_data = QByteArray()
        return super().close()

    def clear(self):
        # Drop all queued audio and reset state without closing the device.
        # NOTE(review): utterance_being_played is not reset here — confirm
        # callers always tear down/replace the sink alongside clear().
        self.utterances.clear()
        self.current_audio_data = QByteArray()
        self.audio_state = QAudio.State.IdleState

    def atEnd(self) -> bool:
        return not len(self.current_audio_data)

    def bytesAvailable(self) -> int:
        return len(self.current_audio_data)

    def __bool__(self) -> bool:
        # True while anything is queued or currently being played
        return bool(self.utterances) or self.utterance_being_played is not None

    def isSequential(self) -> bool:
        # Sequential device: no random access, seek() is meaningless
        return True

    def seek(self, pos):
        return False

    def readData(self, maxlen: int) -> QByteArray:
        # Called by the QAudioSink to pull up to maxlen bytes of PCM data
        # from the current utterance only.
        if maxlen < 1:
            return QByteArray()
        if maxlen >= len(self.current_audio_data):
            ans = self.current_audio_data
            self.current_audio_data = QByteArray()
        else:
            ans = self.current_audio_data.first(maxlen)
            self.current_audio_data = self.current_audio_data.last(len(self.current_audio_data) - maxlen)
        if len(self.current_audio_data):
            # More data remains: prompt the sink to keep reading
            self.readyRead.emit()
        return ans
|
||||
|
||||
def split_into_utterances(text: str, counter: count, lang: str = 'en'):
    '''Yield an Utterance per sentence in text, with the JSON payload that
    piper expects on stdin already encoded into left_to_write.

    :param text: The full text to speak
    :param counter: Source of unique utterance ids (itertools.count)
    :param lang: Language used for sentence segmentation
    '''
    # Normalize line structure: blank-line paragraph breaks become a single
    # PARAGRAPH_SEPARATOR, remaining newlines become spaces
    text = re.sub(r'\n{2,}', PARAGRAPH_SEPARATOR, text.replace('\r', '')).replace('\n', ' ')
    for start, length in sentence_positions(text, lang):
        sentence = text[start:start+length].rstrip().replace('\n', ' ')
        # Report the length of the trimmed sentence actually sent to piper
        length = len(sentence)
        payload = json.dumps({'text': sentence}).encode('utf-8')
        ba = QByteArray()
        # Pre-size for payload plus the trailing separator byte
        ba.reserve(len(payload) + 1)
        ba.append(payload)
        # Newline marks end-of-utterance for piper's --json-input mode
        ba.append(UTTERANCE_SEPARATOR)
        yield Utterance(id=next(counter), payload_size=len(ba), audio_data=QByteArray(), left_to_write=ba, start=start, length=length)
|
||||
|
||||
class Piper(TTSBackend):
|
||||
@ -65,12 +166,17 @@ class Piper(TTSBackend):
|
||||
super().__init__(parent)
|
||||
self._process: QProcess | None = None
|
||||
self._audio_sink: QAudioSink | None = None
|
||||
self._utterances_in_flight: deque[Utterance] = deque()
|
||||
|
||||
self._utterances_being_synthesized: deque[Utterance] = deque()
|
||||
self._utterance_counter = count(start=1)
|
||||
self._utterances_being_spoken = UtteranceAudioQueue()
|
||||
self._utterances_being_spoken.saying.connect(self.saying)
|
||||
self._utterances_being_spoken.update_status.connect(self._update_status, type=Qt.ConnectionType.QueuedConnection)
|
||||
self._state = QTextToSpeech.State.Ready
|
||||
self._last_error = ''
|
||||
self._errors_from_piper: list[str] = []
|
||||
self._pending_stderr_data = b''
|
||||
self._waiting_for_utterance_to_start = False
|
||||
|
||||
self._stderr_pat = re.compile(rb'\[piper\] \[([a-zA-Z0-9_]+?)\] (.+)')
|
||||
atexit.register(self.shutdown)
|
||||
|
||||
@ -85,8 +191,7 @@ class Piper(TTSBackend):
|
||||
else:
|
||||
self._set_error(f'Failed to start piper process: {cmdline}')
|
||||
return
|
||||
self._utterances_in_flight.extend(split_into_utterances(text)) # TODO: Use voice language
|
||||
self._waiting_for_utterance_to_start = False
|
||||
self._utterances_being_synthesized.extend(split_into_utterances(text, self._utterance_counter)) # TODO: Use voice language
|
||||
self._write_current_utterance()
|
||||
|
||||
def pause(self) -> None:
|
||||
@ -99,7 +204,7 @@ class Piper(TTSBackend):
|
||||
|
||||
def stop(self) -> None:
|
||||
if self._process is not None:
|
||||
if self._state is not QTextToSpeech.State.Ready or self._utterances_in_flight:
|
||||
if self._state is not QTextToSpeech.State.Ready or self._utterances_being_synthesized or self._utterances_being_spoken:
|
||||
self.shutdown()
|
||||
self.process
|
||||
|
||||
@ -111,6 +216,7 @@ class Piper(TTSBackend):
|
||||
# self._audio_sink.stop()
|
||||
self._process.readyReadStandardError.disconnect()
|
||||
self._process.bytesWritten.disconnect()
|
||||
self._process.readyReadStandardOutput.disconnect()
|
||||
# self._process.stateChanged.disconnect()
|
||||
self._process.kill()
|
||||
self._process.waitForFinished(-1)
|
||||
@ -140,19 +246,21 @@ class Piper(TTSBackend):
|
||||
@property
|
||||
def process(self) -> QProcess:
|
||||
if self._process is None:
|
||||
self._utterances_in_flight.clear()
|
||||
self._utterances_being_spoken.clear()
|
||||
self._utterances_being_synthesized.clear()
|
||||
self._errors_from_piper.clear()
|
||||
self._process = QProcess(self)
|
||||
self._pending_stderr_data = b''
|
||||
self._waiting_for_utterance_to_start = False
|
||||
self._set_state(QTextToSpeech.State.Ready)
|
||||
|
||||
model_path = '/t/en_US-libritts-high.onnx' # TODO: Dont hardcode voice
|
||||
rate = 1.0 # TODO: Make rate configurable
|
||||
cmdline = list(piper_cmdline()) + ['--model', model_path, '--output-raw', '--length_scale', str(rate)]
|
||||
cmdline = list(piper_cmdline()) + [
|
||||
'--model', model_path, '--output-raw', '--json-input', '--sentence-silence', '0', '--length_scale', str(rate)]
|
||||
self._process.setProgram(cmdline[0])
|
||||
self._process.setArguments(cmdline[1:])
|
||||
self._process.readyReadStandardError.connect(self.piper_stderr_available)
|
||||
self._process.readyReadStandardError.connect(self.piper_stderr_available, type=Qt.ConnectionType.QueuedConnection)
|
||||
self._process.readyReadStandardOutput.connect(self.piper_stdout_available)
|
||||
self._process.bytesWritten.connect(self.bytes_written)
|
||||
# See https://www.riverbankcomputing.com/pipermail/pyqt/2024-September/046002.html
|
||||
# self._process.stateChanged.connect(self._update_status)
|
||||
@ -161,11 +269,20 @@ class Piper(TTSBackend):
|
||||
fmt.setSampleRate(22050) # TODO: Read this from voice JSON
|
||||
fmt.setChannelConfig(QAudioFormat.ChannelConfig.ChannelConfigMono)
|
||||
self._audio_sink = QAudioSink(fmt, self) # TODO: Make audio device configurable
|
||||
self._audio_sink.stateChanged.connect(self.audio_sink_state_changed)
|
||||
self._audio_sink.stateChanged.connect(self._utterances_being_spoken.audio_state_changed)
|
||||
self._process.start()
|
||||
self._audio_sink.start(self._process)
|
||||
self._audio_sink.start(self._utterances_being_spoken)
|
||||
return self._process
|
||||
|
||||
def piper_stdout_available(self) -> None:
|
||||
if self._utterances_being_synthesized:
|
||||
u = self._utterances_being_synthesized[0]
|
||||
while True:
|
||||
ba = self.process.readAll()
|
||||
if not len(ba):
|
||||
break
|
||||
u.audio_data.append(ba)
|
||||
|
||||
def piper_stderr_available(self) -> None:
|
||||
needs_status_update = False
|
||||
if self._process is not None:
|
||||
@ -175,12 +292,13 @@ class Piper(TTSBackend):
|
||||
if m := self._stderr_pat.search(line):
|
||||
which, payload = m.group(1), m.group(2)
|
||||
if which == b'info':
|
||||
if payload.startswith(b'Real-time factor:'):
|
||||
for u in self._utterances_in_flight:
|
||||
if not u.synthesized:
|
||||
u.synthesized = True
|
||||
needs_status_update = True
|
||||
break
|
||||
if payload.startswith(b'Real-time factor:') and self._utterances_being_synthesized:
|
||||
u = self._utterances_being_synthesized.popleft()
|
||||
u.synthesized = True
|
||||
debug(f'Utterance {u.id} synthesized')
|
||||
needs_status_update = True
|
||||
self._utterances_being_spoken.add_utterance(u)
|
||||
self._write_current_utterance()
|
||||
elif which == b'error':
|
||||
self._errors_from_piper.append(payload.decode('utf-8', 'replace'))
|
||||
self._pending_stderr_data = lines[-1]
|
||||
@ -193,9 +311,10 @@ class Piper(TTSBackend):
|
||||
m = '\n'.join(self._errors_from_piper)
|
||||
self._set_error(f'piper process failed with exit code: {self._process.exitCode()} and error messages: {m}')
|
||||
return
|
||||
state = self._audio_sink.state()
|
||||
if self._state is QTextToSpeech.State.Error:
|
||||
return
|
||||
state = self._utterances_being_spoken.audio_state
|
||||
if state is QAudio.State.ActiveState:
|
||||
self._waiting_for_utterance_to_start = False
|
||||
self._set_state(QTextToSpeech.State.Speaking)
|
||||
elif state is QAudio.State.SuspendedState:
|
||||
self._set_state(QTextToSpeech.State.Paused)
|
||||
@ -206,29 +325,23 @@ class Piper(TTSBackend):
|
||||
if self._state is not QTextToSpeech.State.Error:
|
||||
self._set_state(QTextToSpeech.State.Ready)
|
||||
elif state is QAudio.State.IdleState:
|
||||
if not self._waiting_for_utterance_to_start:
|
||||
if self._utterances_in_flight and (u := self._utterances_in_flight[0]) and u.synthesized:
|
||||
self._utterances_in_flight.popleft()
|
||||
if self._utterances_in_flight:
|
||||
self._write_current_utterance()
|
||||
else:
|
||||
self._set_state(QTextToSpeech.State.Ready)
|
||||
if not self._utterances_being_synthesized and not self._utterances_being_spoken:
|
||||
self._set_state(QTextToSpeech.State.Ready)
|
||||
|
||||
    def bytes_written(self, count: int) -> None:
        # Slot for QProcess.bytesWritten: keep streaming the pending utterance
        # payload into piper's stdin. NOTE(review): the parameter shadows the
        # module-level itertools.count import; it is unused here.
        self._write_current_utterance()
||||
def _write_current_utterance(self) -> None:
|
||||
if self._utterances_in_flight:
|
||||
u = self._utterances_in_flight[0]
|
||||
if self._utterances_being_synthesized:
|
||||
u = self._utterances_being_synthesized[0]
|
||||
while len(u.left_to_write):
|
||||
written = self.process.write(u.left_to_write)
|
||||
if written < 0:
|
||||
self._set_error('Failed to write to piper process with error: {self.process.errorString()}')
|
||||
break
|
||||
if not u.started and written:
|
||||
self._waiting_for_utterance_to_start = True
|
||||
u.started = True
|
||||
self.saying.emit(u.start, u.length)
|
||||
debug(f'Utterance {u.id} synthesis started')
|
||||
u.left_to_write = u.left_to_write.last(len(u.left_to_write) - written)
|
||||
|
||||
def audio_sink_state_changed(self, state: QAudio.State) -> None:
|
||||
@ -246,10 +359,10 @@ def develop(): # {{{
|
||||
p = Piper()
|
||||
play_started = False
|
||||
def state_changed(s):
|
||||
print(s, end='\r\n')
|
||||
debug('TTS State:', s)
|
||||
nonlocal play_started
|
||||
if s is QTextToSpeech.State.Error:
|
||||
print(p.error_message(), file=sys.stderr, end='\r\n')
|
||||
debug(p.error_message(), file=sys.stderr)
|
||||
app.exit(1)
|
||||
elif s is QTextToSpeech.State.Speaking:
|
||||
play_started = True
|
||||
@ -267,10 +380,14 @@ def develop(): # {{{
|
||||
elif p.state is QTextToSpeech.State.Paused:
|
||||
p.resume()
|
||||
|
||||
text = "Hello, it is a beautiful day today, isn't it? Yes indeed, it is a very beautiful day!"
|
||||
text = (
|
||||
'First, relatively short sentence. '
|
||||
'Second, much longer sentence which hopefully finishes synthesizing before the first finishes speaking. '
|
||||
'Third, and final short sentence.'
|
||||
)
|
||||
|
||||
def saying(offset, length):
|
||||
print('Saying:', repr(text[offset:offset+length]), end='\r\n')
|
||||
debug('Saying:', repr(text[offset:offset+length]))
|
||||
|
||||
p.state_changed.connect(state_changed)
|
||||
p.saying.connect(saying)
|
||||
|
Loading…
x
Reference in New Issue
Block a user