More work on piper TTS

We now synthesize text and buffer the audio data continuously for higher
performance.
This commit is contained in:
Kovid Goyal 2024-09-01 20:23:17 +05:30
parent 376cbd9ed5
commit 316755aa1c
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -2,20 +2,42 @@
# License: GPLv3 Copyright: 2024, Kovid Goyal <kovid at kovidgoyal.net> # License: GPLv3 Copyright: 2024, Kovid Goyal <kovid at kovidgoyal.net>
import atexit import atexit
import json
import os import os
import re import re
import sys import sys
from collections import deque from collections import deque
from dataclasses import dataclass from dataclasses import dataclass
from functools import lru_cache from functools import lru_cache
from itertools import count
from time import monotonic
from qt.core import QApplication, QAudio, QAudioFormat, QAudioSink, QByteArray, QObject, QProcess, QTextToSpeech, sip from qt.core import QApplication, QAudio, QAudioFormat, QAudioSink, QByteArray, QIODevice, QIODeviceBase, QObject, QProcess, Qt, QTextToSpeech, pyqtSignal, sip
from calibre.constants import bundled_binaries_dir, iswindows from calibre.constants import bundled_binaries_dir, get_windows_username, is_debugging, iswindows
from calibre.gui2.tts2.types import TTSBackend from calibre.gui2.tts2.types import TTSBackend
from calibre.ptempfile import base_dir
from calibre.spell.break_iterator import sentence_positions from calibre.spell.break_iterator import sentence_positions
@lru_cache(2)
def sentinel_path() -> str:
    """Return the path of the piper sentinel file for this process and user.

    The file name embeds the current process id followed by a user
    identifier: the Windows user name on Windows, the effective uid
    elsewhere. The file lives inside calibre's temporary base directory.
    The result is cached, so the path is computed once per process.
    """
    user_tag = get_windows_username() if iswindows else os.geteuid()
    return os.path.join(base_dir(), f'piper-sentinel-{os.getpid()}-{user_tag}')
def debug(*a, **kw):
    """Print a timestamped debug message, but only when calibre is in debug mode.

    Accepts the same arguments as print(). Each message is prefixed with the
    number of seconds elapsed since the first debug() call that printed.
    Lines end with CRLF unless the caller supplies its own ``end``.
    """
    if not is_debugging():
        return
    try:
        origin = debug.first
    except AttributeError:
        # First message: remember the reference time on the function object
        origin = debug.first = monotonic()
    kw.setdefault('end', '\r\n')
    print(f'[{monotonic() - origin:.2f}]', *a, **kw)
@lru_cache(2) @lru_cache(2)
def piper_cmdline() -> tuple[str, ...]: def piper_cmdline() -> tuple[str, ...]:
ext = '.exe' if iswindows else '' ext = '.exe' if iswindows else ''
@ -31,30 +53,109 @@ def piper_cmdline() -> tuple[str, ...]:
# A single sentence of the text being spoken, tracked from the moment it is
# written to the piper process until its synthesized audio has been queued.
@dataclass @dataclass
class Utterance: class Utterance:
# Sequence number taken from the counter passed to split_into_utterances()
id: int
# Offset of the sentence in the normalized text
start: int start: int
# Length of the sentence after trailing whitespace has been stripped
length: int length: int
# Total number of bytes in the payload sent to piper, including the separator
payload_size: int payload_size: int
# The part of the payload that has not yet been written to the piper process
left_to_write: QByteArray left_to_write: QByteArray
# Audio bytes read from piper's stdout while this utterance is being synthesized
audio_data: QByteArray
# Set once piper reports (on stderr) that it has finished synthesizing this utterance
synthesized: bool = False
# Set once the first bytes of the payload have been written to piper
started: bool = False started: bool = False
synthesized: bool = False
# Runs of two or more newlines in the input text are collapsed into this
# single character (U+2029) before the text is split into sentences.
PARAGRAPH_SEPARATOR = '\u2029' PARAGRAPH_SEPARATOR = '\u2029'
# Byte appended after every payload written to the piper process
UTTERANCE_SEPARATOR = b'\n' UTTERANCE_SEPARATOR = b'\n'
class UtteranceAudioQueue(QIODevice):
    """Sequential, read-only QIODevice that plays queued utterances back to back.

    Synthesized utterances are queued with add_utterance(). The audio data of
    the utterance currently being played is handed out through readData().
    audio_state_changed() is meant to be connected to the audio sink's
    stateChanged signal so that the next queued utterance is started when the
    sink goes from active to idle.
    """

    # Emitted with (start, length) of an utterance when its playback begins
    saying = pyqtSignal(int, int)
    # Emitted whenever the tracked audio state actually changes
    update_status = pyqtSignal()

    def __init__(self, parent: QObject | None = None):
        super().__init__(parent)
        self.utterances: deque[Utterance] = deque()
        # Audio bytes of the current utterance that have not been read yet
        self.current_audio_data = QByteArray()
        self.audio_state = QAudio.State.IdleState
        self.utterance_being_played: Utterance | None = None
        self.open(QIODeviceBase.OpenModeFlag.ReadOnly)

    def audio_state_changed(self, s: QAudio.State) -> None:
        """Record the new audio state and advance to the next utterance when playback goes idle."""
        debug('Audio state:', s)
        prev_state, self.audio_state = self.audio_state, s
        if s == prev_state:
            return
        if s == QAudio.State.IdleState and prev_state == QAudio.State.ActiveState:
            # The sink ran out of data: the current utterance is done
            if self.utterance_being_played:
                debug(f'Utterance {self.utterance_being_played.id} audio output finished')
            self.utterance_being_played = None
            self.start_utterance()
        self.update_status.emit()

    def add_utterance(self, u: Utterance) -> None:
        """Queue ``u`` for playback, starting it immediately if nothing is playing."""
        self.utterances.append(u)
        if not self.utterance_being_played:
            self.start_utterance()

    def start_utterance(self):
        """Make the next queued utterance, if any, the one being played."""
        if self.utterances:
            u = self.utterances.popleft()
            self.current_audio_data = u.audio_data
            self.utterance_being_played = u
            self.readyRead.emit()
            self.saying.emit(u.start, u.length)

    def _discard_pending(self) -> None:
        # Drop everything queued or partially played. utterance_being_played
        # must be reset as well, otherwise the queue keeps reporting itself
        # as busy and add_utterance() never starts playback again.
        self.utterances.clear()
        self.utterance_being_played = None
        self.current_audio_data = QByteArray()

    def close(self):
        """Discard all pending audio and close the device."""
        self._discard_pending()
        return super().close()

    def clear(self):
        """Discard all pending audio and reset the tracked audio state to idle."""
        self._discard_pending()
        self.audio_state = QAudio.State.IdleState

    def atEnd(self) -> bool:
        return not len(self.current_audio_data)

    def bytesAvailable(self) -> int:
        return len(self.current_audio_data)

    def __bool__(self) -> bool:
        # True while anything is queued or being played
        return bool(self.utterances) or self.utterance_being_played is not None

    def isSequential(self) -> bool:
        return True

    def seek(self, pos):
        # Sequential device: seeking is not supported
        return False

    def readData(self, maxlen: int) -> QByteArray:
        """Return up to ``maxlen`` bytes of the current utterance's audio data."""
        if maxlen < 1:
            return QByteArray()
        if maxlen >= len(self.current_audio_data):
            ans = self.current_audio_data
            self.current_audio_data = QByteArray()
        else:
            ans = self.current_audio_data.first(maxlen)
            self.current_audio_data = self.current_audio_data.last(len(self.current_audio_data) - maxlen)
        if len(self.current_audio_data):
            # More data is left over for the next read
            self.readyRead.emit()
        return ans
def split_into_utterances(text: str, counter: count, lang: str = 'en'):
    """Yield one Utterance per sentence of ``text``.

    Carriage returns are dropped, runs of two or more newlines become a
    paragraph separator and remaining newlines become spaces. Each sentence
    found by sentence_positions() is stripped of trailing whitespace and
    encoded as a JSON object ``{"text": ...}`` followed by the utterance
    separator. Utterance ids are drawn from ``counter``.
    """
    without_cr = text.replace('\r', '')
    text = re.sub(r'\n{2,}', PARAGRAPH_SEPARATOR, without_cr).replace('\n', ' ')
    for start, span in sentence_positions(text, lang):
        sentence = text[start:start + span].rstrip().replace('\n', ' ')
        payload = json.dumps({'text': sentence}).encode('utf-8')
        buf = QByteArray()
        buf.reserve(len(payload) + 1)
        buf.append(payload)
        buf.append(UTTERANCE_SEPARATOR)
        yield Utterance(
            id=next(counter),
            start=start,
            length=len(sentence),
            payload_size=len(buf),
            left_to_write=buf,
            audio_data=QByteArray(),
        )
class Piper(TTSBackend): class Piper(TTSBackend):
@ -65,12 +166,17 @@ class Piper(TTSBackend):
super().__init__(parent) super().__init__(parent)
self._process: QProcess | None = None self._process: QProcess | None = None
self._audio_sink: QAudioSink | None = None self._audio_sink: QAudioSink | None = None
self._utterances_in_flight: deque[Utterance] = deque()
self._utterances_being_synthesized: deque[Utterance] = deque()
self._utterance_counter = count(start=1)
self._utterances_being_spoken = UtteranceAudioQueue()
self._utterances_being_spoken.saying.connect(self.saying)
self._utterances_being_spoken.update_status.connect(self._update_status, type=Qt.ConnectionType.QueuedConnection)
self._state = QTextToSpeech.State.Ready self._state = QTextToSpeech.State.Ready
self._last_error = '' self._last_error = ''
self._errors_from_piper: list[str] = [] self._errors_from_piper: list[str] = []
self._pending_stderr_data = b'' self._pending_stderr_data = b''
self._waiting_for_utterance_to_start = False
self._stderr_pat = re.compile(rb'\[piper\] \[([a-zA-Z0-9_]+?)\] (.+)') self._stderr_pat = re.compile(rb'\[piper\] \[([a-zA-Z0-9_]+?)\] (.+)')
atexit.register(self.shutdown) atexit.register(self.shutdown)
@ -85,8 +191,7 @@ class Piper(TTSBackend):
else: else:
self._set_error(f'Failed to start piper process: {cmdline}') self._set_error(f'Failed to start piper process: {cmdline}')
return return
self._utterances_in_flight.extend(split_into_utterances(text)) # TODO: Use voice language self._utterances_being_synthesized.extend(split_into_utterances(text, self._utterance_counter)) # TODO: Use voice language
self._waiting_for_utterance_to_start = False
self._write_current_utterance() self._write_current_utterance()
def pause(self) -> None: def pause(self) -> None:
@ -99,7 +204,7 @@ class Piper(TTSBackend):
def stop(self) -> None: def stop(self) -> None:
if self._process is not None: if self._process is not None:
if self._state is not QTextToSpeech.State.Ready or self._utterances_in_flight: if self._state is not QTextToSpeech.State.Ready or self._utterances_being_synthesized or self._utterances_being_spoken:
self.shutdown() self.shutdown()
self.process self.process
@ -111,6 +216,7 @@ class Piper(TTSBackend):
# self._audio_sink.stop() # self._audio_sink.stop()
self._process.readyReadStandardError.disconnect() self._process.readyReadStandardError.disconnect()
self._process.bytesWritten.disconnect() self._process.bytesWritten.disconnect()
self._process.readyReadStandardOutput.disconnect()
# self._process.stateChanged.disconnect() # self._process.stateChanged.disconnect()
self._process.kill() self._process.kill()
self._process.waitForFinished(-1) self._process.waitForFinished(-1)
@ -140,19 +246,21 @@ class Piper(TTSBackend):
@property @property
def process(self) -> QProcess: def process(self) -> QProcess:
if self._process is None: if self._process is None:
self._utterances_in_flight.clear() self._utterances_being_spoken.clear()
self._utterances_being_synthesized.clear()
self._errors_from_piper.clear() self._errors_from_piper.clear()
self._process = QProcess(self) self._process = QProcess(self)
self._pending_stderr_data = b'' self._pending_stderr_data = b''
self._waiting_for_utterance_to_start = False
self._set_state(QTextToSpeech.State.Ready) self._set_state(QTextToSpeech.State.Ready)
model_path = '/t/en_US-libritts-high.onnx' # TODO: Dont hardcode voice model_path = '/t/en_US-libritts-high.onnx' # TODO: Dont hardcode voice
rate = 1.0 # TODO: Make rate configurable rate = 1.0 # TODO: Make rate configurable
cmdline = list(piper_cmdline()) + ['--model', model_path, '--output-raw', '--length_scale', str(rate)] cmdline = list(piper_cmdline()) + [
'--model', model_path, '--output-raw', '--json-input', '--sentence-silence', '0', '--length_scale', str(rate)]
self._process.setProgram(cmdline[0]) self._process.setProgram(cmdline[0])
self._process.setArguments(cmdline[1:]) self._process.setArguments(cmdline[1:])
self._process.readyReadStandardError.connect(self.piper_stderr_available) self._process.readyReadStandardError.connect(self.piper_stderr_available, type=Qt.ConnectionType.QueuedConnection)
self._process.readyReadStandardOutput.connect(self.piper_stdout_available)
self._process.bytesWritten.connect(self.bytes_written) self._process.bytesWritten.connect(self.bytes_written)
# See https://www.riverbankcomputing.com/pipermail/pyqt/2024-September/046002.html # See https://www.riverbankcomputing.com/pipermail/pyqt/2024-September/046002.html
# self._process.stateChanged.connect(self._update_status) # self._process.stateChanged.connect(self._update_status)
@ -161,11 +269,20 @@ class Piper(TTSBackend):
fmt.setSampleRate(22050) # TODO: Read this from voice JSON fmt.setSampleRate(22050) # TODO: Read this from voice JSON
fmt.setChannelConfig(QAudioFormat.ChannelConfig.ChannelConfigMono) fmt.setChannelConfig(QAudioFormat.ChannelConfig.ChannelConfigMono)
self._audio_sink = QAudioSink(fmt, self) # TODO: Make audio device configurable self._audio_sink = QAudioSink(fmt, self) # TODO: Make audio device configurable
self._audio_sink.stateChanged.connect(self.audio_sink_state_changed) self._audio_sink.stateChanged.connect(self._utterances_being_spoken.audio_state_changed)
self._process.start() self._process.start()
self._audio_sink.start(self._process) self._audio_sink.start(self._utterances_being_spoken)
return self._process return self._process
def piper_stdout_available(self) -> None:
    """Append audio available on piper's stdout to the utterance being synthesized.

    Nothing is read when no utterance is waiting for synthesis. Otherwise
    the process output is drained until a read returns no data, and every
    chunk is added to the audio buffer of the utterance at the head of the
    synthesis queue.
    """
    pending = self._utterances_being_synthesized
    if not pending:
        return
    current = pending[0]
    while len(chunk := self.process.readAll()):
        current.audio_data.append(chunk)
def piper_stderr_available(self) -> None: def piper_stderr_available(self) -> None:
needs_status_update = False needs_status_update = False
if self._process is not None: if self._process is not None:
@ -175,12 +292,13 @@ class Piper(TTSBackend):
if m := self._stderr_pat.search(line): if m := self._stderr_pat.search(line):
which, payload = m.group(1), m.group(2) which, payload = m.group(1), m.group(2)
if which == b'info': if which == b'info':
if payload.startswith(b'Real-time factor:'): if payload.startswith(b'Real-time factor:') and self._utterances_being_synthesized:
for u in self._utterances_in_flight: u = self._utterances_being_synthesized.popleft()
if not u.synthesized: u.synthesized = True
u.synthesized = True debug(f'Utterance {u.id} synthesized')
needs_status_update = True needs_status_update = True
break self._utterances_being_spoken.add_utterance(u)
self._write_current_utterance()
elif which == b'error': elif which == b'error':
self._errors_from_piper.append(payload.decode('utf-8', 'replace')) self._errors_from_piper.append(payload.decode('utf-8', 'replace'))
self._pending_stderr_data = lines[-1] self._pending_stderr_data = lines[-1]
@ -193,9 +311,10 @@ class Piper(TTSBackend):
m = '\n'.join(self._errors_from_piper) m = '\n'.join(self._errors_from_piper)
self._set_error(f'piper process failed with exit code: {self._process.exitCode()} and error messages: {m}') self._set_error(f'piper process failed with exit code: {self._process.exitCode()} and error messages: {m}')
return return
state = self._audio_sink.state() if self._state is QTextToSpeech.State.Error:
return
state = self._utterances_being_spoken.audio_state
if state is QAudio.State.ActiveState: if state is QAudio.State.ActiveState:
self._waiting_for_utterance_to_start = False
self._set_state(QTextToSpeech.State.Speaking) self._set_state(QTextToSpeech.State.Speaking)
elif state is QAudio.State.SuspendedState: elif state is QAudio.State.SuspendedState:
self._set_state(QTextToSpeech.State.Paused) self._set_state(QTextToSpeech.State.Paused)
@ -206,29 +325,23 @@ class Piper(TTSBackend):
if self._state is not QTextToSpeech.State.Error: if self._state is not QTextToSpeech.State.Error:
self._set_state(QTextToSpeech.State.Ready) self._set_state(QTextToSpeech.State.Ready)
elif state is QAudio.State.IdleState: elif state is QAudio.State.IdleState:
if not self._waiting_for_utterance_to_start: if not self._utterances_being_synthesized and not self._utterances_being_spoken:
if self._utterances_in_flight and (u := self._utterances_in_flight[0]) and u.synthesized: self._set_state(QTextToSpeech.State.Ready)
self._utterances_in_flight.popleft()
if self._utterances_in_flight:
self._write_current_utterance()
else:
self._set_state(QTextToSpeech.State.Ready)
# Connected to the piper process's bytesWritten signal: tries to write
# whatever is left of the current utterance. The reported byte count itself
# is not used.
def bytes_written(self, count: int) -> None: def bytes_written(self, count: int) -> None:
self._write_current_utterance() self._write_current_utterance()
def _write_current_utterance(self) -> None:
    """Write the unwritten part of the utterance at the head of the synthesis queue to piper.

    Does nothing when no utterance is waiting for synthesis. Keeps writing
    until the whole payload has been handed to the process. A write failure
    is reported through _set_error() and stops the loop.
    """
    if self._utterances_being_synthesized:
        u = self._utterances_being_synthesized[0]
        while len(u.left_to_write):
            written = self.process.write(u.left_to_write)
            if written < 0:
                # Must be an f-string, otherwise the placeholder text is
                # reported verbatim instead of the actual Qt error message
                self._set_error(f'Failed to write to piper process with error: {self.process.errorString()}')
                break
            if not u.started and written:
                u.started = True
                debug(f'Utterance {u.id} synthesis started')
            # Keep only the bytes that still have to be written
            u.left_to_write = u.left_to_write.last(len(u.left_to_write) - written)
def audio_sink_state_changed(self, state: QAudio.State) -> None: def audio_sink_state_changed(self, state: QAudio.State) -> None:
@ -246,10 +359,10 @@ def develop(): # {{{
p = Piper() p = Piper()
play_started = False play_started = False
def state_changed(s): def state_changed(s):
print(s, end='\r\n') debug('TTS State:', s)
nonlocal play_started nonlocal play_started
if s is QTextToSpeech.State.Error: if s is QTextToSpeech.State.Error:
print(p.error_message(), file=sys.stderr, end='\r\n') debug(p.error_message(), file=sys.stderr)
app.exit(1) app.exit(1)
elif s is QTextToSpeech.State.Speaking: elif s is QTextToSpeech.State.Speaking:
play_started = True play_started = True
@ -267,10 +380,14 @@ def develop(): # {{{
elif p.state is QTextToSpeech.State.Paused: elif p.state is QTextToSpeech.State.Paused:
p.resume() p.resume()
text = "Hello, it is a beautiful day today, isn't it? Yes indeed, it is a very beautiful day!" text = (
'First, relatively short sentence. '
'Second, much longer sentence which hopefully finishes synthesizing before the first finishes speaking. '
'Third, and final short sentence.'
)
def saying(offset, length): def saying(offset, length):
print('Saying:', repr(text[offset:offset+length]), end='\r\n') debug('Saying:', repr(text[offset:offset+length]))
p.state_changed.connect(state_changed) p.state_changed.connect(state_changed)
p.saying.connect(saying) p.saying.connect(saying)