From 316755aa1cf00cf63d8fc11796d0f15d0b1846ce Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sun, 1 Sep 2024 20:23:17 +0530
Subject: [PATCH] More work on piper TTS

We now synthesize text and buffer the audio data continuously for
higher performance.
---
 src/calibre/gui2/tts2/piper.py | 197 ++++++++++++++++++++++++++-------
 1 file changed, 157 insertions(+), 40 deletions(-)

diff --git a/src/calibre/gui2/tts2/piper.py b/src/calibre/gui2/tts2/piper.py
index dcc98727c7..dafc8564a0 100644
--- a/src/calibre/gui2/tts2/piper.py
+++ b/src/calibre/gui2/tts2/piper.py
@@ -2,20 +2,42 @@
 # License: GPLv3 Copyright: 2024, Kovid Goyal
 
 import atexit
+import json
 import os
 import re
 import sys
 from collections import deque
 from dataclasses import dataclass
 from functools import lru_cache
+from itertools import count
+from time import monotonic
 
-from qt.core import QApplication, QAudio, QAudioFormat, QAudioSink, QByteArray, QObject, QProcess, QTextToSpeech, sip
+from qt.core import QApplication, QAudio, QAudioFormat, QAudioSink, QByteArray, QIODevice, QIODeviceBase, QObject, QProcess, Qt, QTextToSpeech, pyqtSignal, sip
 
-from calibre.constants import bundled_binaries_dir, iswindows
+from calibre.constants import bundled_binaries_dir, get_windows_username, is_debugging, iswindows
 from calibre.gui2.tts2.types import TTSBackend
+from calibre.ptempfile import base_dir
 from calibre.spell.break_iterator import sentence_positions
 
 
+@lru_cache(2)
+def sentinel_path() -> str:
+    fname = f'piper-sentinel-{os.getpid()}'
+    if iswindows:
+        fname += f'-{get_windows_username()}'
+    else:
+        fname += f'-{os.geteuid()}'
+    return os.path.join(base_dir(), fname)
+
+
+def debug(*a, **kw):
+    if is_debugging():
+        if not hasattr(debug, 'first'):
+            debug.first = monotonic()
+        kw['end'] = kw.get('end', '\r\n')
+        print(f'[{monotonic() - debug.first:.2f}]', *a, **kw)
+
+
 @lru_cache(2)
 def piper_cmdline() -> tuple[str, ...]:
     ext = '.exe' if iswindows else ''
@@ -31,30 +53,109 @@ def piper_cmdline() -> tuple[str, ...]:
 
 @dataclass
 class Utterance:
+    id: int
     start: int
     length: int
     payload_size: int
     left_to_write: QByteArray
+    audio_data: QByteArray
 
-    synthesized: bool = False
     started: bool = False
+    synthesized: bool = False
 
 
 PARAGRAPH_SEPARATOR = '\u2029'
 UTTERANCE_SEPARATOR = b'\n'
 
 
-def split_into_utterances(text: str, lang: str = 'en'):
+class UtteranceAudioQueue(QIODevice):
+
+    saying = pyqtSignal(int, int)
+    update_status = pyqtSignal()
+
+    def __init__(self, parent: QObject | None = None):
+        super().__init__(parent)
+        self.utterances: deque[Utterance] = deque()
+        self.current_audio_data = QByteArray()
+        self.audio_state = QAudio.State.IdleState
+        self.utterance_being_played: Utterance | None = None
+        self.open(QIODeviceBase.OpenModeFlag.ReadOnly)
+
+    def audio_state_changed(self, s: QAudio.State) -> None:
+        debug('Audio state:', s)
+        prev_state, self.audio_state = self.audio_state, s
+        if s == prev_state:
+            return
+        if s == QAudio.State.IdleState and prev_state == QAudio.State.ActiveState:
+            if self.utterance_being_played:
+                debug(f'Utterance {self.utterance_being_played.id} audio output finished')
+            self.utterance_being_played = None
+            self.start_utterance()
+        self.update_status.emit()
+
+    def add_utterance(self, u: Utterance) -> None:
+        self.utterances.append(u)
+        if not self.utterance_being_played:
+            self.start_utterance()
+
+    def start_utterance(self):
+        if self.utterances:
+            u = self.utterances.popleft()
+            self.current_audio_data = u.audio_data
+            self.utterance_being_played = u
+            self.readyRead.emit()
+            self.saying.emit(u.start, u.length)
+
+    def close(self):
+        self.utterances.clear()
+        self.current_audio_data = QByteArray()
+        return super().close()
+
+    def clear(self):
+        self.utterances.clear()
+        self.current_audio_data = QByteArray()
+        self.audio_state = QAudio.State.IdleState
+
+    def atEnd(self) -> bool:
+        return not len(self.current_audio_data)
+
+    def bytesAvailable(self) -> int:
+        return len(self.current_audio_data)
+
+    def __bool__(self) -> bool:
+        return bool(self.utterances) or self.utterance_being_played is not None
+
+    def isSequential(self) -> bool:
+        return True
+
+    def seek(self, pos):
+        return False
+
+    def readData(self, maxlen: int) -> QByteArray:
+        if maxlen < 1:
+            return QByteArray()
+        if maxlen >= len(self.current_audio_data):
+            ans = self.current_audio_data
+            self.current_audio_data = QByteArray()
+        else:
+            ans = self.current_audio_data.first(maxlen)
+            self.current_audio_data = self.current_audio_data.last(len(self.current_audio_data) - maxlen)
+        if len(self.current_audio_data):
+            self.readyRead.emit()
+        return ans
+
+
+def split_into_utterances(text: str, counter: count, lang: str = 'en'):
     text = re.sub(r'\n{2,}', PARAGRAPH_SEPARATOR, text.replace('\r', '')).replace('\n', ' ')
     for start, length in sentence_positions(text, lang):
         sentence = text[start:start+length].rstrip().replace('\n', ' ')
         length = len(sentence)
-        payload = sentence.encode('utf-8')
+        payload = json.dumps({'text': sentence}).encode('utf-8')
         ba = QByteArray()
         ba.reserve(len(payload) + 1)
         ba.append(payload)
         ba.append(UTTERANCE_SEPARATOR)
-        yield Utterance(payload_size=len(ba), left_to_write=ba, start=start, length=length)
+        yield Utterance(id=next(counter), payload_size=len(ba), audio_data=QByteArray(), left_to_write=ba, start=start, length=length)
 
 
 class Piper(TTSBackend):
@@ -65,12 +166,17 @@ class Piper(TTSBackend):
         super().__init__(parent)
         self._process: QProcess | None = None
         self._audio_sink: QAudioSink | None = None
-        self._utterances_in_flight: deque[Utterance] = deque()
+
+        self._utterances_being_synthesized: deque[Utterance] = deque()
+        self._utterance_counter = count(start=1)
+        self._utterances_being_spoken = UtteranceAudioQueue()
+        self._utterances_being_spoken.saying.connect(self.saying)
+        self._utterances_being_spoken.update_status.connect(self._update_status, type=Qt.ConnectionType.QueuedConnection)
         self._state = QTextToSpeech.State.Ready
         self._last_error = ''
         self._errors_from_piper: list[str] = []
         self._pending_stderr_data = b''
-        self._waiting_for_utterance_to_start = False
+
         self._stderr_pat = re.compile(rb'\[piper\] \[([a-zA-Z0-9_]+?)\] (.+)')
         atexit.register(self.shutdown)
 
@@ -85,8 +191,7 @@
         else:
            self._set_error(f'Failed to start piper process: {cmdline}')
            return
-        self._utterances_in_flight.extend(split_into_utterances(text))  # TODO: Use voice language
-        self._waiting_for_utterance_to_start = False
+        self._utterances_being_synthesized.extend(split_into_utterances(text, self._utterance_counter))  # TODO: Use voice language
         self._write_current_utterance()
 
     def pause(self) -> None:
@@ -99,7 +204,7 @@
 
     def stop(self) -> None:
         if self._process is not None:
-            if self._state is not QTextToSpeech.State.Ready or self._utterances_in_flight:
+            if self._state is not QTextToSpeech.State.Ready or self._utterances_being_synthesized or self._utterances_being_spoken:
                 self.shutdown()
                 self.process
 
@@ -111,6 +216,7 @@
             # self._audio_sink.stop()
             self._process.readyReadStandardError.disconnect()
             self._process.bytesWritten.disconnect()
+            self._process.readyReadStandardOutput.disconnect()
             # self._process.stateChanged.disconnect()
             self._process.kill()
             self._process.waitForFinished(-1)
@@ -140,19 +246,21 @@
     @property
     def process(self) -> QProcess:
         if self._process is None:
-            self._utterances_in_flight.clear()
+            self._utterances_being_spoken.clear()
+            self._utterances_being_synthesized.clear()
             self._errors_from_piper.clear()
             self._process = QProcess(self)
             self._pending_stderr_data = b''
-            self._waiting_for_utterance_to_start = False
             self._set_state(QTextToSpeech.State.Ready)
 
             model_path = '/t/en_US-libritts-high.onnx'  # TODO: Dont hardcode voice
             rate = 1.0  # TODO: Make rate configurable
-            cmdline = list(piper_cmdline()) + ['--model', model_path, '--output-raw', '--length_scale', str(rate)]
+            cmdline = list(piper_cmdline()) + [
+                '--model', model_path, '--output-raw', '--json-input', '--sentence-silence', '0', '--length_scale', str(rate)]
             self._process.setProgram(cmdline[0])
             self._process.setArguments(cmdline[1:])
-            self._process.readyReadStandardError.connect(self.piper_stderr_available)
+            self._process.readyReadStandardError.connect(self.piper_stderr_available, type=Qt.ConnectionType.QueuedConnection)
+            self._process.readyReadStandardOutput.connect(self.piper_stdout_available)
             self._process.bytesWritten.connect(self.bytes_written)
             # See https://www.riverbankcomputing.com/pipermail/pyqt/2024-September/046002.html
             # self._process.stateChanged.connect(self._update_status)
@@ -161,11 +269,20 @@
             fmt.setSampleRate(22050)  # TODO: Read this from voice JSON
             fmt.setChannelConfig(QAudioFormat.ChannelConfig.ChannelConfigMono)
             self._audio_sink = QAudioSink(fmt, self)  # TODO: Make audio device configurable
-            self._audio_sink.stateChanged.connect(self.audio_sink_state_changed)
+            self._audio_sink.stateChanged.connect(self._utterances_being_spoken.audio_state_changed)
             self._process.start()
-            self._audio_sink.start(self._process)
+            self._audio_sink.start(self._utterances_being_spoken)
         return self._process
 
+    def piper_stdout_available(self) -> None:
+        if self._utterances_being_synthesized:
+            u = self._utterances_being_synthesized[0]
+            while True:
+                ba = self.process.readAll()
+                if not len(ba):
+                    break
+                u.audio_data.append(ba)
+
     def piper_stderr_available(self) -> None:
         needs_status_update = False
         if self._process is not None:
@@ -175,12 +292,13 @@
                 if m := self._stderr_pat.search(line):
                     which, payload = m.group(1), m.group(2)
                     if which == b'info':
-                        if payload.startswith(b'Real-time factor:'):
-                            for u in self._utterances_in_flight:
-                                if not u.synthesized:
-                                    u.synthesized = True
-                                    needs_status_update = True
-                                    break
+                        if payload.startswith(b'Real-time factor:') and self._utterances_being_synthesized:
+                            u = self._utterances_being_synthesized.popleft()
+                            u.synthesized = True
+                            debug(f'Utterance {u.id} synthesized')
+                            needs_status_update = True
+                            self._utterances_being_spoken.add_utterance(u)
+                            self._write_current_utterance()
                     elif which == b'error':
                         self._errors_from_piper.append(payload.decode('utf-8', 'replace'))
             self._pending_stderr_data = lines[-1]
@@ -193,9 +311,10 @@
                 m = '\n'.join(self._errors_from_piper)
                 self._set_error(f'piper process failed with exit code: {self._process.exitCode()} and error messages: {m}')
                 return
-        state = self._audio_sink.state()
+        if self._state is QTextToSpeech.State.Error:
+            return
+        state = self._utterances_being_spoken.audio_state
         if state is QAudio.State.ActiveState:
-            self._waiting_for_utterance_to_start = False
             self._set_state(QTextToSpeech.State.Speaking)
         elif state is QAudio.State.SuspendedState:
             self._set_state(QTextToSpeech.State.Paused)
@@ -206,29 +325,23 @@
             if self._state is not QTextToSpeech.State.Error:
                 self._set_state(QTextToSpeech.State.Ready)
         elif state is QAudio.State.IdleState:
-            if not self._waiting_for_utterance_to_start:
-                if self._utterances_in_flight and (u := self._utterances_in_flight[0]) and u.synthesized:
-                    self._utterances_in_flight.popleft()
-                if self._utterances_in_flight:
-                    self._write_current_utterance()
-                else:
-                    self._set_state(QTextToSpeech.State.Ready)
+            if not self._utterances_being_synthesized and not self._utterances_being_spoken:
+                self._set_state(QTextToSpeech.State.Ready)
 
     def bytes_written(self, count: int) -> None:
         self._write_current_utterance()
 
     def _write_current_utterance(self) -> None:
-        if self._utterances_in_flight:
-            u = self._utterances_in_flight[0]
+        if self._utterances_being_synthesized:
+            u = self._utterances_being_synthesized[0]
             while len(u.left_to_write):
                 written = self.process.write(u.left_to_write)
                 if written < 0:
                     self._set_error('Failed to write to piper process with error: {self.process.errorString()}')
                     break
                 if not u.started and written:
-                    self._waiting_for_utterance_to_start = True
                     u.started = True
-                    self.saying.emit(u.start, u.length)
+                    debug(f'Utterance {u.id} synthesis started')
                 u.left_to_write = u.left_to_write.last(len(u.left_to_write) - written)
 
     def audio_sink_state_changed(self, state: QAudio.State) -> None:
@@ -246,10 +359,10 @@ def develop(): # {{{
     p = Piper()
     play_started = False
     def state_changed(s):
-        print(s, end='\r\n')
+        debug('TTS State:', s)
         nonlocal play_started
         if s is QTextToSpeech.State.Error:
-            print(p.error_message(), file=sys.stderr, end='\r\n')
+            debug(p.error_message(), file=sys.stderr)
             app.exit(1)
         elif s is QTextToSpeech.State.Speaking:
             play_started = True
@@ -267,10 +380,14 @@
            elif p.state is QTextToSpeech.State.Paused:
                p.resume()
 
-    text = "Hello, it is a beautiful day today, isn't it? Yes indeed, it is a very beautiful day!"
+    text = (
+        'First, relatively short sentence. '
+        'Second, much longer sentence which hopefully finishes synthesizing before the first finishes speaking. '
+        'Third, and final short sentence.'
+    )
 
     def saying(offset, length):
-        print('Saying:', repr(text[offset:offset+length]), end='\r\n')
+        debug('Saying:', repr(text[offset:offset+length]))
 
     p.state_changed.connect(state_changed)
     p.saying.connect(saying)