More work on piper TTS

We now synthesize text and buffer the audio data continuously for higher
performance.
This commit is contained in:
Kovid Goyal 2024-09-01 20:23:17 +05:30
parent 376cbd9ed5
commit 316755aa1c
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -2,20 +2,42 @@
# License: GPLv3 Copyright: 2024, Kovid Goyal <kovid at kovidgoyal.net>
import atexit
import json
import os
import re
import sys
from collections import deque
from dataclasses import dataclass
from functools import lru_cache
from itertools import count
from time import monotonic
from qt.core import QApplication, QAudio, QAudioFormat, QAudioSink, QByteArray, QObject, QProcess, QTextToSpeech, sip
from qt.core import QApplication, QAudio, QAudioFormat, QAudioSink, QByteArray, QIODevice, QIODeviceBase, QObject, QProcess, Qt, QTextToSpeech, pyqtSignal, sip
from calibre.constants import bundled_binaries_dir, iswindows
from calibre.constants import bundled_binaries_dir, get_windows_username, is_debugging, iswindows
from calibre.gui2.tts2.types import TTSBackend
from calibre.ptempfile import base_dir
from calibre.spell.break_iterator import sentence_positions
@lru_cache(2)
def sentinel_path() -> str:
    """Return the path of the piper sentinel file for this process.

    The file name embeds the current process id followed by a per-user
    suffix: the Windows user name on Windows, the effective uid elsewhere.
    The file is located inside ``base_dir()``. The result is memoized.
    """
    user_suffix = get_windows_username() if iswindows else os.geteuid()
    return os.path.join(base_dir(), f'piper-sentinel-{os.getpid()}-{user_suffix}')
def debug(*a, **kw):
    """Print a debug message prefixed with the seconds elapsed since the first call.

    Does nothing unless ``is_debugging()`` is true. Arguments are forwarded
    to ``print()``; ``end`` defaults to ``'\\r\\n'`` when the caller does not
    supply it. The reference timestamp is stored on the function object
    itself as ``debug.first``.
    """
    if not is_debugging():
        return
    try:
        reference = debug.first
    except AttributeError:
        # First message: this moment becomes time zero for all later messages
        reference = debug.first = monotonic()
    kw.setdefault('end', '\r\n')
    elapsed = monotonic() - reference
    print(f'[{elapsed:.2f}]', *a, **kw)
@lru_cache(2)
def piper_cmdline() -> tuple[str, ...]:
ext = '.exe' if iswindows else ''
@ -31,30 +53,109 @@ def piper_cmdline() -> tuple[str, ...]:
@dataclass
class Utterance:
    # One sentence travelling through the synthesize -> play pipeline.

    # Sequence number taken from the shared utterance counter
    id: int
    # Offset and length of the sentence, as computed by split_into_utterances()
    start: int
    length: int
    # Size in bytes of the full payload (JSON text plus separator) sent to piper
    payload_size: int
    # The part of the payload not yet written to the piper process
    left_to_write: QByteArray
    # Raw audio received from piper's stdout for this utterance
    audio_data: QByteArray
    # Set once piper reports that synthesis of this utterance finished
    synthesized: bool = False
    # Set once the first bytes of the payload have been written to piper
    started: bool = False
PARAGRAPH_SEPARATOR = '\u2029'
UTTERANCE_SEPARATOR = b'\n'
def split_into_utterances(text: str, lang: str = 'en'):
class UtteranceAudioQueue(QIODevice):
    """Sequential, read-only QIODevice that serves queued utterance audio.

    Utterances are queued with add_utterance(). The audio data of one
    utterance at a time is exposed through readData(); when the tracked audio
    state goes from active to idle the utterance is considered finished and
    the next queued one is started.
    """

    # Emitted with (start, length) of an utterance when its audio becomes current
    saying = pyqtSignal(int, int)
    # Emitted after every actual change of the tracked audio state
    update_status = pyqtSignal()

    def __init__(self, parent: QObject | None = None):
        super().__init__(parent)
        # Utterances waiting for their turn to be played
        self.utterances: deque[Utterance] = deque()
        # Not yet consumed audio bytes of the utterance being played
        self.current_audio_data = QByteArray()
        # Last state passed to audio_state_changed()
        self.audio_state = QAudio.State.IdleState
        self.utterance_being_played: Utterance | None = None
        self.open(QIODeviceBase.OpenModeFlag.ReadOnly)

    def audio_state_changed(self, s: QAudio.State) -> None:
        """Record the new audio state and advance the queue when playback goes idle."""
        debug('Audio state:', s)
        prev_state, self.audio_state = self.audio_state, s
        if s == prev_state:
            return
        if s == QAudio.State.IdleState and prev_state == QAudio.State.ActiveState:
            # Active -> Idle: all data of the current utterance has been consumed
            if self.utterance_being_played:
                debug(f'Utterance {self.utterance_being_played.id} audio output finished')
            self.utterance_being_played = None
            self.start_utterance()
        self.update_status.emit()

    def add_utterance(self, u: Utterance) -> None:
        """Queue ``u`` and start playing it at once if nothing is being played."""
        self.utterances.append(u)
        if not self.utterance_being_played:
            self.start_utterance()

    def start_utterance(self):
        """Make the next queued utterance, if any, the one being played."""
        if self.utterances:
            u = self.utterances.popleft()
            self.current_audio_data = u.audio_data
            self.utterance_being_played = u
            self.readyRead.emit()
            self.saying.emit(u.start, u.length)

    def close(self):
        """Drop all queued and in-progress audio, then close the device."""
        self.utterances.clear()
        # Forget the utterance in progress as well, otherwise the queue keeps
        # reporting itself as non-empty after being closed
        self.utterance_being_played = None
        self.current_audio_data = QByteArray()
        return super().close()

    def clear(self):
        """Reset the queue to its initial, empty and idle, state."""
        self.utterances.clear()
        # Without this reset bool(self) stays True and add_utterance() would
        # never start a new utterance, as it only does so when nothing is
        # being played
        self.utterance_being_played = None
        self.current_audio_data = QByteArray()
        self.audio_state = QAudio.State.IdleState

    def atEnd(self) -> bool:
        return not len(self.current_audio_data)

    def bytesAvailable(self) -> int:
        return len(self.current_audio_data)

    def __bool__(self) -> bool:
        # True while anything is queued or being played
        return bool(self.utterances) or self.utterance_being_played is not None

    def isSequential(self) -> bool:
        return True

    def seek(self, pos):
        # Sequential device: seeking is not supported
        return False

    def readData(self, maxlen: int) -> QByteArray:
        """Return up to ``maxlen`` bytes of the current utterance's audio data."""
        if maxlen < 1:
            return QByteArray()
        if maxlen >= len(self.current_audio_data):
            # Hand over everything that is left
            ans = self.current_audio_data
            self.current_audio_data = QByteArray()
        else:
            ans = self.current_audio_data.first(maxlen)
            self.current_audio_data = self.current_audio_data.last(len(self.current_audio_data) - maxlen)
        if len(self.current_audio_data):
            # More data remains, announce it so that it is read as well
            self.readyRead.emit()
        return ans
def split_into_utterances(text: str, counter: count, lang: str = 'en'):
    """Yield one Utterance per sentence of ``text``.

    The text is normalized first: carriage returns are removed, runs of two
    or more newlines become PARAGRAPH_SEPARATOR and remaining newlines become
    spaces. Each sentence is sent to piper as a JSON object with a ``text``
    key, terminated by UTTERANCE_SEPARATOR.

    :param text: The text to split.
    :param counter: Source of utterance ids, advanced once per utterance.
    :param lang: Language passed to ``sentence_positions()``.

    NOTE(review): ``start`` is an offset into the normalized text. As the
    normalization can shorten the text, it may differ from the offset into
    the caller's original string -- confirm this is what consumers of the
    ``saying`` signal expect.
    """
    text = re.sub(r'\n{2,}', PARAGRAPH_SEPARATOR, text.replace('\r', '')).replace('\n', ' ')
    for start, length in sentence_positions(text, lang):
        # No newlines can remain after the normalization above, so only
        # trailing whitespace has to be removed here
        sentence = text[start:start+length].rstrip()
        payload = json.dumps({'text': sentence}).encode('utf-8')
        ba = QByteArray()
        ba.reserve(len(payload) + 1)
        ba.append(payload)
        ba.append(UTTERANCE_SEPARATOR)
        yield Utterance(
            id=next(counter), payload_size=len(ba), audio_data=QByteArray(), left_to_write=ba,
            start=start, length=len(sentence))
class Piper(TTSBackend):
@ -65,12 +166,17 @@ class Piper(TTSBackend):
super().__init__(parent)
self._process: QProcess | None = None
self._audio_sink: QAudioSink | None = None
self._utterances_in_flight: deque[Utterance] = deque()
self._utterances_being_synthesized: deque[Utterance] = deque()
self._utterance_counter = count(start=1)
self._utterances_being_spoken = UtteranceAudioQueue()
self._utterances_being_spoken.saying.connect(self.saying)
self._utterances_being_spoken.update_status.connect(self._update_status, type=Qt.ConnectionType.QueuedConnection)
self._state = QTextToSpeech.State.Ready
self._last_error = ''
self._errors_from_piper: list[str] = []
self._pending_stderr_data = b''
self._waiting_for_utterance_to_start = False
self._stderr_pat = re.compile(rb'\[piper\] \[([a-zA-Z0-9_]+?)\] (.+)')
atexit.register(self.shutdown)
@ -85,8 +191,7 @@ class Piper(TTSBackend):
else:
self._set_error(f'Failed to start piper process: {cmdline}')
return
self._utterances_in_flight.extend(split_into_utterances(text)) # TODO: Use voice language
self._waiting_for_utterance_to_start = False
self._utterances_being_synthesized.extend(split_into_utterances(text, self._utterance_counter)) # TODO: Use voice language
self._write_current_utterance()
def pause(self) -> None:
@ -99,7 +204,7 @@ class Piper(TTSBackend):
def stop(self) -> None:
    """Stop speaking, discarding everything being synthesized or spoken.

    Does nothing when no piper process exists. Otherwise, if the backend is
    not in the Ready state or any utterance is still being synthesized or
    spoken, the process is shut down and the ``process`` property is
    accessed again.
    """
    if self._process is None:
        return
    busy = (
        self._state is not QTextToSpeech.State.Ready
        or self._utterances_being_synthesized
        or self._utterances_being_spoken
    )
    if busy:
        self.shutdown()
        # Accessed for its side effect: the property creates a new process
        # when there is none (presumably the case after shutdown() -- confirm)
        self.process
@ -111,6 +216,7 @@ class Piper(TTSBackend):
# self._audio_sink.stop()
self._process.readyReadStandardError.disconnect()
self._process.bytesWritten.disconnect()
self._process.readyReadStandardOutput.disconnect()
# self._process.stateChanged.disconnect()
self._process.kill()
self._process.waitForFinished(-1)
@ -140,19 +246,21 @@ class Piper(TTSBackend):
@property
def process(self) -> QProcess:
if self._process is None:
self._utterances_in_flight.clear()
self._utterances_being_spoken.clear()
self._utterances_being_synthesized.clear()
self._errors_from_piper.clear()
self._process = QProcess(self)
self._pending_stderr_data = b''
self._waiting_for_utterance_to_start = False
self._set_state(QTextToSpeech.State.Ready)
model_path = '/t/en_US-libritts-high.onnx' # TODO: Dont hardcode voice
rate = 1.0 # TODO: Make rate configurable
cmdline = list(piper_cmdline()) + ['--model', model_path, '--output-raw', '--length_scale', str(rate)]
cmdline = list(piper_cmdline()) + [
'--model', model_path, '--output-raw', '--json-input', '--sentence-silence', '0', '--length_scale', str(rate)]
self._process.setProgram(cmdline[0])
self._process.setArguments(cmdline[1:])
self._process.readyReadStandardError.connect(self.piper_stderr_available)
self._process.readyReadStandardError.connect(self.piper_stderr_available, type=Qt.ConnectionType.QueuedConnection)
self._process.readyReadStandardOutput.connect(self.piper_stdout_available)
self._process.bytesWritten.connect(self.bytes_written)
# See https://www.riverbankcomputing.com/pipermail/pyqt/2024-September/046002.html
# self._process.stateChanged.connect(self._update_status)
@ -161,11 +269,20 @@ class Piper(TTSBackend):
fmt.setSampleRate(22050) # TODO: Read this from voice JSON
fmt.setChannelConfig(QAudioFormat.ChannelConfig.ChannelConfigMono)
self._audio_sink = QAudioSink(fmt, self) # TODO: Make audio device configurable
self._audio_sink.stateChanged.connect(self.audio_sink_state_changed)
self._audio_sink.stateChanged.connect(self._utterances_being_spoken.audio_state_changed)
self._process.start()
self._audio_sink.start(self._process)
self._audio_sink.start(self._utterances_being_spoken)
return self._process
def piper_stdout_available(self) -> None:
    """Append audio available on piper's stdout to the utterance being synthesized.

    The data is credited to the first utterance in the synthesis queue. When
    that queue is empty nothing is read. The process is accessed via
    ``_process`` rather than the ``process`` property, so that handling this
    signal can never create a new piper process as a side effect; this
    matches the guard used by piper_stderr_available().
    """
    if self._process is None or not self._utterances_being_synthesized:
        return
    u = self._utterances_being_synthesized[0]
    # Drain everything currently buffered, an empty read means we are done
    while len(ba := self._process.readAll()):
        u.audio_data.append(ba)
def piper_stderr_available(self) -> None:
needs_status_update = False
if self._process is not None:
@ -175,12 +292,13 @@ class Piper(TTSBackend):
if m := self._stderr_pat.search(line):
which, payload = m.group(1), m.group(2)
if which == b'info':
if payload.startswith(b'Real-time factor:'):
for u in self._utterances_in_flight:
if not u.synthesized:
u.synthesized = True
needs_status_update = True
break
if payload.startswith(b'Real-time factor:') and self._utterances_being_synthesized:
u = self._utterances_being_synthesized.popleft()
u.synthesized = True
debug(f'Utterance {u.id} synthesized')
needs_status_update = True
self._utterances_being_spoken.add_utterance(u)
self._write_current_utterance()
elif which == b'error':
self._errors_from_piper.append(payload.decode('utf-8', 'replace'))
self._pending_stderr_data = lines[-1]
@ -193,9 +311,10 @@ class Piper(TTSBackend):
m = '\n'.join(self._errors_from_piper)
self._set_error(f'piper process failed with exit code: {self._process.exitCode()} and error messages: {m}')
return
state = self._audio_sink.state()
if self._state is QTextToSpeech.State.Error:
return
state = self._utterances_being_spoken.audio_state
if state is QAudio.State.ActiveState:
self._waiting_for_utterance_to_start = False
self._set_state(QTextToSpeech.State.Speaking)
elif state is QAudio.State.SuspendedState:
self._set_state(QTextToSpeech.State.Paused)
@ -206,29 +325,23 @@ class Piper(TTSBackend):
if self._state is not QTextToSpeech.State.Error:
self._set_state(QTextToSpeech.State.Ready)
elif state is QAudio.State.IdleState:
if not self._waiting_for_utterance_to_start:
if self._utterances_in_flight and (u := self._utterances_in_flight[0]) and u.synthesized:
self._utterances_in_flight.popleft()
if self._utterances_in_flight:
self._write_current_utterance()
else:
self._set_state(QTextToSpeech.State.Ready)
if not self._utterances_being_synthesized and not self._utterances_being_spoken:
self._set_state(QTextToSpeech.State.Ready)
def bytes_written(self, count: int) -> None:
    """Slot connected to the piper process's ``bytesWritten`` signal.

    ``count`` is not used; whatever is left of the utterance currently being
    written is simply pushed to the process again.
    """
    self._write_current_utterance()
def _write_current_utterance(self) -> None:
    """Write the unsent part of the first utterance being synthesized to piper.

    Does nothing when no utterance is waiting to be synthesized. On a write
    failure the backend is put into the error state with a message that
    includes the process's error string.
    """
    if not self._utterances_being_synthesized:
        return
    u = self._utterances_being_synthesized[0]
    while len(u.left_to_write):
        written = self.process.write(u.left_to_write)
        if written < 0:
            # The f prefix is required for the error string to be interpolated
            self._set_error(f'Failed to write to piper process with error: {self.process.errorString()}')
            break
        if written == 0:
            # Nothing was accepted: stop instead of looping forever
            break
        if not u.started:
            u.started = True
            debug(f'Utterance {u.id} synthesis started')
        u.left_to_write = u.left_to_write.last(len(u.left_to_write) - written)
def audio_sink_state_changed(self, state: QAudio.State) -> None:
@ -246,10 +359,10 @@ def develop(): # {{{
p = Piper()
play_started = False
def state_changed(s):
print(s, end='\r\n')
debug('TTS State:', s)
nonlocal play_started
if s is QTextToSpeech.State.Error:
print(p.error_message(), file=sys.stderr, end='\r\n')
debug(p.error_message(), file=sys.stderr)
app.exit(1)
elif s is QTextToSpeech.State.Speaking:
play_started = True
@ -267,10 +380,14 @@ def develop(): # {{{
elif p.state is QTextToSpeech.State.Paused:
p.resume()
text = "Hello, it is a beautiful day today, isn't it? Yes indeed, it is a very beautiful day!"
text = (
'First, relatively short sentence. '
'Second, much longer sentence which hopefully finishes synthesizing before the first finishes speaking. '
'Third, and final short sentence.'
)
def saying(offset, length):
print('Saying:', repr(text[offset:offset+length]), end='\r\n')
debug('Saying:', repr(text[offset:offset+length]))
p.state_changed.connect(state_changed)
p.saying.connect(saying)