From 316755aa1cf00cf63d8fc11796d0f15d0b1846ce Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sun, 1 Sep 2024 20:23:17 +0530
Subject: [PATCH] More work on piper TTS

We now synthesize text and buffer the audio data continuously for
higher performance.
---
 src/calibre/gui2/tts2/piper.py | 197 ++++++++++++++++++++++++++-------
 1 file changed, 157 insertions(+), 40 deletions(-)

diff --git a/src/calibre/gui2/tts2/piper.py b/src/calibre/gui2/tts2/piper.py
index dcc98727c7..dafc8564a0 100644
--- a/src/calibre/gui2/tts2/piper.py
+++ b/src/calibre/gui2/tts2/piper.py
@@ -2,20 +2,42 @@
 # License: GPLv3 Copyright: 2024, Kovid Goyal
 
 import atexit
+import json
 import os
 import re
 import sys
 from collections import deque
 from dataclasses import dataclass
 from functools import lru_cache
+from itertools import count
+from time import monotonic
 
-from qt.core import QApplication, QAudio, QAudioFormat, QAudioSink, QByteArray, QObject, QProcess, QTextToSpeech, sip
+from qt.core import QApplication, QAudio, QAudioFormat, QAudioSink, QByteArray, QIODevice, QIODeviceBase, QObject, QProcess, Qt, QTextToSpeech, pyqtSignal, sip
 
-from calibre.constants import bundled_binaries_dir, iswindows
+from calibre.constants import bundled_binaries_dir, get_windows_username, is_debugging, iswindows
 from calibre.gui2.tts2.types import TTSBackend
+from calibre.ptempfile import base_dir
 from calibre.spell.break_iterator import sentence_positions
 
 
+@lru_cache(2)
+def sentinel_path() -> str:
+    fname = f'piper-sentinel-{os.getpid()}'
+    if iswindows:
+        fname += f'-{get_windows_username()}'
+    else:
+        fname += f'-{os.geteuid()}'
+    return os.path.join(base_dir(), fname)
+
+
+def debug(*a, **kw):
+    if is_debugging():
+        if not hasattr(debug, 'first'):
+            debug.first = monotonic()
+        kw['end'] = kw.get('end', '\r\n')
+        print(f'[{monotonic() - debug.first:.2f}]', *a, **kw)
+
+
 @lru_cache(2)
 def piper_cmdline() -> tuple[str, ...]:
     ext = '.exe' if iswindows else ''
@@ -31,30 +53,109 @@ def piper_cmdline() -> tuple[str, ...]:
 
 @dataclass
 class Utterance:
+    id: int
     start: int
     length: int
     payload_size: int
     left_to_write: QByteArray
+    audio_data: QByteArray
 
-    synthesized: bool = False
     started: bool = False
+    synthesized: bool = False
 
 
 PARAGRAPH_SEPARATOR = '\u2029'
 UTTERANCE_SEPARATOR = b'\n'
 
 
-def split_into_utterances(text: str, lang: str = 'en'):
+class UtteranceAudioQueue(QIODevice):
+
+    saying = pyqtSignal(int, int)
+    update_status = pyqtSignal()
+
+    def __init__(self, parent: QObject | None = None):
+        super().__init__(parent)
+        self.utterances: deque[Utterance] = deque()
+        self.current_audio_data = QByteArray()
+        self.audio_state = QAudio.State.IdleState
+        self.utterance_being_played: Utterance | None = None
+        self.open(QIODeviceBase.OpenModeFlag.ReadOnly)
+
+    def audio_state_changed(self, s: QAudio.State) -> None:
+        debug('Audio state:', s)
+        prev_state, self.audio_state = self.audio_state, s
+        if s == prev_state:
+            return
+        if s == QAudio.State.IdleState and prev_state == QAudio.State.ActiveState:
+            if self.utterance_being_played:
+                debug(f'Utterance {self.utterance_being_played.id} audio output finished')
+            self.utterance_being_played = None
+            self.start_utterance()
+        self.update_status.emit()
+
+    def add_utterance(self, u: Utterance) -> None:
+        self.utterances.append(u)
+        if not self.utterance_being_played:
+            self.start_utterance()
+
+    def start_utterance(self):
+        if self.utterances:
+            u = self.utterances.popleft()
+            self.current_audio_data = u.audio_data
+            self.utterance_being_played = u
+            self.readyRead.emit()
+            self.saying.emit(u.start, u.length)
+
+    def close(self):
+        self.utterances.clear()
+        self.current_audio_data = QByteArray()
+        return super().close()
+
+    def clear(self):
+        self.utterances.clear()
+        self.current_audio_data = QByteArray()
+        self.audio_state = QAudio.State.IdleState
+
+    def atEnd(self) -> bool:
+        return not len(self.current_audio_data)
+
+    def bytesAvailable(self) -> int:
+        return len(self.current_audio_data)
+
+    def __bool__(self) -> bool:
+        return bool(self.utterances) or self.utterance_being_played is not None
+
+    def isSequential(self) -> bool:
+        return True
+
+    def seek(self, pos):
+        return False
+
+    def readData(self, maxlen: int) -> QByteArray:
+        if maxlen < 1:
+            return QByteArray()
+        if maxlen >= len(self.current_audio_data):
+            ans = self.current_audio_data
+            self.current_audio_data = QByteArray()
+        else:
+            ans = self.current_audio_data.first(maxlen)
+            self.current_audio_data = self.current_audio_data.last(len(self.current_audio_data) - maxlen)
+        if len(self.current_audio_data):
+            self.readyRead.emit()
+        return ans
+
+
+def split_into_utterances(text: str, counter: count, lang: str = 'en'):
     text = re.sub(r'\n{2,}', PARAGRAPH_SEPARATOR, text.replace('\r', '')).replace('\n', ' ')
     for start, length in sentence_positions(text, lang):
         sentence = text[start:start+length].rstrip().replace('\n', ' ')
         length = len(sentence)
-        payload = sentence.encode('utf-8')
+        payload = json.dumps({'text': sentence}).encode('utf-8')
         ba = QByteArray()
         ba.reserve(len(payload) + 1)
         ba.append(payload)
         ba.append(UTTERANCE_SEPARATOR)
-        yield Utterance(payload_size=len(ba), left_to_write=ba, start=start, length=length)
+        yield Utterance(id=next(counter), payload_size=len(ba), audio_data=QByteArray(), left_to_write=ba, start=start, length=length)
 
 
 class Piper(TTSBackend):
@@ -65,12 +166,17 @@ class Piper(TTSBackend):
         super().__init__(parent)
         self._process: QProcess | None = None
         self._audio_sink: QAudioSink | None = None
-        self._utterances_in_flight: deque[Utterance] = deque()
+
+        self._utterances_being_synthesized: deque[Utterance] = deque()
+        self._utterance_counter = count(start=1)
+        self._utterances_being_spoken = UtteranceAudioQueue()
+        self._utterances_being_spoken.saying.connect(self.saying)
+        self._utterances_being_spoken.update_status.connect(self._update_status, type=Qt.ConnectionType.QueuedConnection)
         self._state = QTextToSpeech.State.Ready
         self._last_error = ''
         self._errors_from_piper: list[str] = []
         self._pending_stderr_data = b''
-        self._waiting_for_utterance_to_start = False
+
         self._stderr_pat = re.compile(rb'\[piper\] \[([a-zA-Z0-9_]+?)\] (.+)')
         atexit.register(self.shutdown)
 
@@ -85,8 +191,7 @@
         else:
            self._set_error(f'Failed to start piper process: {cmdline}')
            return
-        self._utterances_in_flight.extend(split_into_utterances(text))  # TODO: Use voice language
-        self._waiting_for_utterance_to_start = False
+        self._utterances_being_synthesized.extend(split_into_utterances(text, self._utterance_counter))  # TODO: Use voice language
         self._write_current_utterance()
 
     def pause(self) -> None:
@@ -99,7 +204,7 @@
 
     def stop(self) -> None:
         if self._process is not None:
-            if self._state is not QTextToSpeech.State.Ready or self._utterances_in_flight:
+            if self._state is not QTextToSpeech.State.Ready or self._utterances_being_synthesized or self._utterances_being_spoken:
                 self.shutdown()
                 self.process
 
@@ -111,6 +216,7 @@
             # self._audio_sink.stop()
             self._process.readyReadStandardError.disconnect()
             self._process.bytesWritten.disconnect()
+            self._process.readyReadStandardOutput.disconnect()
             # self._process.stateChanged.disconnect()
             self._process.kill()
             self._process.waitForFinished(-1)
@@ -140,19 +246,21 @@
     @property
     def process(self) -> QProcess:
         if self._process is None:
-            self._utterances_in_flight.clear()
+            self._utterances_being_spoken.clear()
+            self._utterances_being_synthesized.clear()
             self._errors_from_piper.clear()
             self._process = QProcess(self)
             self._pending_stderr_data = b''
-            self._waiting_for_utterance_to_start = False
             self._set_state(QTextToSpeech.State.Ready)
 
             model_path = '/t/en_US-libritts-high.onnx'  # TODO: Dont hardcode voice
             rate = 1.0  # TODO: Make rate configurable
-            cmdline = list(piper_cmdline()) + ['--model', model_path, '--output-raw', '--length_scale', str(rate)]
+            cmdline = list(piper_cmdline()) + [
+                '--model', model_path, '--output-raw', '--json-input', '--sentence-silence', '0', '--length_scale', str(rate)]
             self._process.setProgram(cmdline[0])
             self._process.setArguments(cmdline[1:])
-            self._process.readyReadStandardError.connect(self.piper_stderr_available)
+            self._process.readyReadStandardError.connect(self.piper_stderr_available, type=Qt.ConnectionType.QueuedConnection)
+            self._process.readyReadStandardOutput.connect(self.piper_stdout_available)
             self._process.bytesWritten.connect(self.bytes_written)
             # See https://www.riverbankcomputing.com/pipermail/pyqt/2024-September/046002.html
             # self._process.stateChanged.connect(self._update_status)
@@ -161,11 +269,20 @@
             fmt.setSampleRate(22050)  # TODO: Read this from voice JSON
             fmt.setChannelConfig(QAudioFormat.ChannelConfig.ChannelConfigMono)
             self._audio_sink = QAudioSink(fmt, self)  # TODO: Make audio device configurable
-            self._audio_sink.stateChanged.connect(self.audio_sink_state_changed)
+            self._audio_sink.stateChanged.connect(self._utterances_being_spoken.audio_state_changed)
             self._process.start()
-            self._audio_sink.start(self._process)
+            self._audio_sink.start(self._utterances_being_spoken)
         return self._process
 
+    def piper_stdout_available(self) -> None:
+        if self._utterances_being_synthesized:
+            u = self._utterances_being_synthesized[0]
+            while True:
+                ba = self.process.readAll()
+                if not len(ba):
+                    break
+                u.audio_data.append(ba)
+
     def piper_stderr_available(self) -> None:
         needs_status_update = False
         if self._process is not None:
@@ -175,12 +292,13 @@
                 if m := self._stderr_pat.search(line):
                     which, payload = m.group(1), m.group(2)
                     if which == b'info':
-                        if payload.startswith(b'Real-time factor:'):
-                            for u in self._utterances_in_flight:
-                                if not u.synthesized:
-                                    u.synthesized = True
-                                    needs_status_update = True
-                                    break
+                        if payload.startswith(b'Real-time factor:') and self._utterances_being_synthesized:
+                            u = self._utterances_being_synthesized.popleft()
+                            u.synthesized = True
+                            debug(f'Utterance {u.id} synthesized')
+                            needs_status_update = True
+                            self._utterances_being_spoken.add_utterance(u)
+                            self._write_current_utterance()
                     elif which == b'error':
                         self._errors_from_piper.append(payload.decode('utf-8', 'replace'))
             self._pending_stderr_data = lines[-1]
@@ -193,9 +311,10 @@
                 m = '\n'.join(self._errors_from_piper)
                 self._set_error(f'piper process failed with exit code: {self._process.exitCode()} and error messages: {m}')
                 return
-        state = self._audio_sink.state()
+        if self._state is QTextToSpeech.State.Error:
+            return
+        state = self._utterances_being_spoken.audio_state
         if state is QAudio.State.ActiveState:
-            self._waiting_for_utterance_to_start = False
             self._set_state(QTextToSpeech.State.Speaking)
         elif state is QAudio.State.SuspendedState:
             self._set_state(QTextToSpeech.State.Paused)
@@ -206,29 +325,23 @@
             if self._state is not QTextToSpeech.State.Error:
                 self._set_state(QTextToSpeech.State.Ready)
         elif state is QAudio.State.IdleState:
-            if not self._waiting_for_utterance_to_start:
-                if self._utterances_in_flight and (u := self._utterances_in_flight[0]) and u.synthesized:
-                    self._utterances_in_flight.popleft()
-                if self._utterances_in_flight:
-                    self._write_current_utterance()
-                else:
-                    self._set_state(QTextToSpeech.State.Ready)
+            if not self._utterances_being_synthesized and not self._utterances_being_spoken:
+                self._set_state(QTextToSpeech.State.Ready)
 
     def bytes_written(self, count: int) -> None:
         self._write_current_utterance()
 
     def _write_current_utterance(self) -> None:
-        if self._utterances_in_flight:
-            u = self._utterances_in_flight[0]
+        if self._utterances_being_synthesized:
+            u = self._utterances_being_synthesized[0]
             while len(u.left_to_write):
                 written = self.process.write(u.left_to_write)
                 if written < 0:
                     self._set_error('Failed to write to piper process with error: {self.process.errorString()}')
                     break
                 if not u.started and written:
-                    self._waiting_for_utterance_to_start = True
                     u.started = True
-                    self.saying.emit(u.start, u.length)
+                    debug(f'Utterance {u.id} synthesis started')
                 u.left_to_write = u.left_to_write.last(len(u.left_to_write) - written)
 
     def audio_sink_state_changed(self, state: QAudio.State) -> None:
@@ -246,10 +359,10 @@ def develop(): # {{{
     p = Piper()
     play_started = False
     def state_changed(s):
-        print(s, end='\r\n')
+        debug('TTS State:', s)
         nonlocal play_started
         if s is QTextToSpeech.State.Error:
-            print(p.error_message(), file=sys.stderr, end='\r\n')
+            debug(p.error_message(), file=sys.stderr)
             app.exit(1)
         elif s is QTextToSpeech.State.Speaking:
             play_started = True
@@ -267,10 +380,14 @@
            elif p.state is QTextToSpeech.State.Paused:
                p.resume()
 
-    text = "Hello, it is a beautiful day today, isn't it? Yes indeed, it is a very beautiful day!"
+    text = (
+        'First, relatively short sentence. '
+        'Second, much longer sentence which hopefully finishes synthesizing before the first finishes speaking. '
+        'Third, and final short sentence.'
+    )
 
     def saying(offset, length):
-        print('Saying:', repr(text[offset:offset+length]), end='\r\n')
+        debug('Saying:', repr(text[offset:offset+length]))
 
     p.state_changed.connect(state_changed)
     p.saying.connect(saying)