mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
More work on piper TTS
We now synthesize text and buffer the audio data continuously for higher performance.
This commit is contained in:
parent
376cbd9ed5
commit
316755aa1c
@ -2,20 +2,42 @@
|
||||
# License: GPLv3 Copyright: 2024, Kovid Goyal <kovid at kovidgoyal.net>
|
||||
|
||||
import atexit
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from collections import deque
|
||||
from dataclasses import dataclass
|
||||
from functools import lru_cache
|
||||
from itertools import count
|
||||
from time import monotonic
|
||||
|
||||
from qt.core import QApplication, QAudio, QAudioFormat, QAudioSink, QByteArray, QObject, QProcess, QTextToSpeech, sip
|
||||
from qt.core import QApplication, QAudio, QAudioFormat, QAudioSink, QByteArray, QIODevice, QIODeviceBase, QObject, QProcess, Qt, QTextToSpeech, pyqtSignal, sip
|
||||
|
||||
from calibre.constants import bundled_binaries_dir, iswindows
|
||||
from calibre.constants import bundled_binaries_dir, get_windows_username, is_debugging, iswindows
|
||||
from calibre.gui2.tts2.types import TTSBackend
|
||||
from calibre.ptempfile import base_dir
|
||||
from calibre.spell.break_iterator import sentence_positions
|
||||
|
||||
|
||||
@lru_cache(2)
def sentinel_path() -> str:
    '''Return the path of a sentinel file unique to this process and user,
    located in calibre's temporary base directory.'''
    # Include the user identity so that concurrent calibre instances run by
    # different users on the same machine do not collide.
    user_part = get_windows_username() if iswindows else os.geteuid()
    return os.path.join(base_dir(), f'piper-sentinel-{os.getpid()}-{user_part}')
|
||||
def debug(*a, **kw):
    '''Print a timestamped debug message, but only when calibre is running in
    debug mode. Timestamps are seconds since the first call to debug().'''
    if is_debugging():
        if not hasattr(debug, 'first'):
            # Lazily record the epoch for relative timestamps
            debug.first = monotonic()
        # Idiom fix: use setdefault() instead of kw['end'] = kw.get('end', ...).
        # \r\n because the terminal may be in raw mode during TTS development.
        kw.setdefault('end', '\r\n')
        print(f'[{monotonic() - debug.first:.2f}]', *a, **kw)
|
||||
@lru_cache(2)
|
||||
def piper_cmdline() -> tuple[str, ...]:
|
||||
ext = '.exe' if iswindows else ''
|
||||
@ -31,30 +53,109 @@ def piper_cmdline() -> tuple[str, ...]:
|
||||
|
||||
@dataclass
class Utterance:
    # Unique, monotonically increasing id from Piper's utterance counter
    id: int
    # Offset of this utterance's sentence in the original text
    start: int
    # Length (in characters) of the trimmed sentence
    length: int
    # Total number of bytes that will be written to the piper process
    payload_size: int
    # JSON payload (plus trailing separator) not yet written to piper's stdin
    left_to_write: QByteArray
    # Raw audio produced by piper for this utterance, appended to incrementally
    audio_data: QByteArray

    # True once at least one byte of the payload has been written to piper
    started: bool = False
    # True once piper reports synthesis of this utterance as finished
    synthesized: bool = False
||||
# Unicode paragraph separator, substituted for blank-line paragraph breaks
PARAGRAPH_SEPARATOR = '\u2029'
# piper reads one payload per line from stdin, so newline ends an utterance
UTTERANCE_SEPARATOR = b'\n'
||||
def split_into_utterances(text: str, lang: str = 'en'):
|
||||
class UtteranceAudioQueue(QIODevice):

    '''A sequential, read-only QIODevice from which a QAudioSink pulls PCM
    data. Audio for synthesized utterances is queued here and fed to the sink
    one utterance at a time.'''

    saying = pyqtSignal(int, int)  # (start, length) of the utterance whose playback just began
    update_status = pyqtSignal()  # emitted when the sink state changes; consumer re-derives TTS state

    def __init__(self, parent: QObject | None = None):
        super().__init__(parent)
        self.utterances: deque[Utterance] = deque()
        self.current_audio_data = QByteArray()
        # Mirrors the QAudioSink state, updated via audio_state_changed()
        self.audio_state = QAudio.State.IdleState
        self.utterance_being_played: Utterance | None = None
        self.open(QIODeviceBase.OpenModeFlag.ReadOnly)

    def audio_state_changed(self, s: QAudio.State) -> None:
        # Slot for QAudioSink.stateChanged: track the sink's state and advance
        # to the next queued utterance when the current one finishes playing.
        debug('Audio state:', s)
        prev_state, self.audio_state = self.audio_state, s
        if s == prev_state:
            return
        if s == QAudio.State.IdleState and prev_state == QAudio.State.ActiveState:
            # Active -> Idle means the sink ran out of data, i.e. the current
            # utterance's audio has been played to completion
            if self.utterance_being_played:
                debug(f'Utterance {self.utterance_being_played.id} audio output finished')
            self.utterance_being_played = None
            self.start_utterance()
        self.update_status.emit()

    def add_utterance(self, u: Utterance) -> None:
        # Queue a synthesized utterance; start it at once if nothing is playing
        self.utterances.append(u)
        if not self.utterance_being_played:
            self.start_utterance()

    def start_utterance(self):
        # Begin playback of the next queued utterance, if any
        if self.utterances:
            u = self.utterances.popleft()
            self.current_audio_data = u.audio_data
            self.utterance_being_played = u
            # Wake the sink so it pulls the newly available data
            self.readyRead.emit()
            self.saying.emit(u.start, u.length)

    def close(self):
        self.utterances.clear()
        self.current_audio_data = QByteArray()
        return super().close()

    def clear(self):
        # Drop all queued audio and reset state without closing the device.
        # NOTE(review): utterance_being_played is not reset here — confirm
        # callers always tear down/replace the sink alongside clear().
        self.utterances.clear()
        self.current_audio_data = QByteArray()
        self.audio_state = QAudio.State.IdleState

    def atEnd(self) -> bool:
        return not len(self.current_audio_data)

    def bytesAvailable(self) -> int:
        return len(self.current_audio_data)

    def __bool__(self) -> bool:
        # True while anything is queued or currently being played
        return bool(self.utterances) or self.utterance_being_played is not None

    def isSequential(self) -> bool:
        # Sequential device: no random access, seek() is meaningless
        return True

    def seek(self, pos):
        return False

    def readData(self, maxlen: int) -> QByteArray:
        # Called by the QAudioSink to pull up to maxlen bytes of PCM data
        # from the current utterance only.
        if maxlen < 1:
            return QByteArray()
        if maxlen >= len(self.current_audio_data):
            ans = self.current_audio_data
            self.current_audio_data = QByteArray()
        else:
            ans = self.current_audio_data.first(maxlen)
            self.current_audio_data = self.current_audio_data.last(len(self.current_audio_data) - maxlen)
        if len(self.current_audio_data):
            # More data remains: prompt the sink to keep reading
            self.readyRead.emit()
        return ans
|
||||
|
||||
def split_into_utterances(text: str, counter: count, lang: str = 'en'):
    '''Yield an Utterance per sentence in text, with the JSON payload that
    piper expects on stdin already encoded into left_to_write.

    :param text: The full text to speak
    :param counter: Source of unique utterance ids (itertools.count)
    :param lang: Language used for sentence segmentation
    '''
    # Normalize line structure: blank-line paragraph breaks become a single
    # PARAGRAPH_SEPARATOR, remaining newlines become spaces
    text = re.sub(r'\n{2,}', PARAGRAPH_SEPARATOR, text.replace('\r', '')).replace('\n', ' ')
    for start, length in sentence_positions(text, lang):
        sentence = text[start:start+length].rstrip().replace('\n', ' ')
        # Report the length of the trimmed sentence actually sent to piper
        length = len(sentence)
        payload = json.dumps({'text': sentence}).encode('utf-8')
        ba = QByteArray()
        # Pre-size for payload plus the trailing separator byte
        ba.reserve(len(payload) + 1)
        ba.append(payload)
        # Newline marks end-of-utterance for piper's --json-input mode
        ba.append(UTTERANCE_SEPARATOR)
        yield Utterance(id=next(counter), payload_size=len(ba), audio_data=QByteArray(), left_to_write=ba, start=start, length=length)
|
||||
|
||||
class Piper(TTSBackend):
|
||||
@ -65,12 +166,17 @@ class Piper(TTSBackend):
|
||||
super().__init__(parent)
|
||||
self._process: QProcess | None = None
|
||||
self._audio_sink: QAudioSink | None = None
|
||||
self._utterances_in_flight: deque[Utterance] = deque()
|
||||
|
||||
self._utterances_being_synthesized: deque[Utterance] = deque()
|
||||
self._utterance_counter = count(start=1)
|
||||
self._utterances_being_spoken = UtteranceAudioQueue()
|
||||
self._utterances_being_spoken.saying.connect(self.saying)
|
||||
self._utterances_being_spoken.update_status.connect(self._update_status, type=Qt.ConnectionType.QueuedConnection)
|
||||
self._state = QTextToSpeech.State.Ready
|
||||
self._last_error = ''
|
||||
self._errors_from_piper: list[str] = []
|
||||
self._pending_stderr_data = b''
|
||||
self._waiting_for_utterance_to_start = False
|
||||
|
||||
self._stderr_pat = re.compile(rb'\[piper\] \[([a-zA-Z0-9_]+?)\] (.+)')
|
||||
atexit.register(self.shutdown)
|
||||
|
||||
@ -85,8 +191,7 @@ class Piper(TTSBackend):
|
||||
else:
|
||||
self._set_error(f'Failed to start piper process: {cmdline}')
|
||||
return
|
||||
self._utterances_in_flight.extend(split_into_utterances(text)) # TODO: Use voice language
|
||||
self._waiting_for_utterance_to_start = False
|
||||
self._utterances_being_synthesized.extend(split_into_utterances(text, self._utterance_counter)) # TODO: Use voice language
|
||||
self._write_current_utterance()
|
||||
|
||||
def pause(self) -> None:
|
||||
@ -99,7 +204,7 @@ class Piper(TTSBackend):
|
||||
|
||||
def stop(self) -> None:
|
||||
if self._process is not None:
|
||||
if self._state is not QTextToSpeech.State.Ready or self._utterances_in_flight:
|
||||
if self._state is not QTextToSpeech.State.Ready or self._utterances_being_synthesized or self._utterances_being_spoken:
|
||||
self.shutdown()
|
||||
self.process
|
||||
|
||||
@ -111,6 +216,7 @@ class Piper(TTSBackend):
|
||||
# self._audio_sink.stop()
|
||||
self._process.readyReadStandardError.disconnect()
|
||||
self._process.bytesWritten.disconnect()
|
||||
self._process.readyReadStandardOutput.disconnect()
|
||||
# self._process.stateChanged.disconnect()
|
||||
self._process.kill()
|
||||
self._process.waitForFinished(-1)
|
||||
@ -140,19 +246,21 @@ class Piper(TTSBackend):
|
||||
@property
|
||||
def process(self) -> QProcess:
|
||||
if self._process is None:
|
||||
self._utterances_in_flight.clear()
|
||||
self._utterances_being_spoken.clear()
|
||||
self._utterances_being_synthesized.clear()
|
||||
self._errors_from_piper.clear()
|
||||
self._process = QProcess(self)
|
||||
self._pending_stderr_data = b''
|
||||
self._waiting_for_utterance_to_start = False
|
||||
self._set_state(QTextToSpeech.State.Ready)
|
||||
|
||||
model_path = '/t/en_US-libritts-high.onnx' # TODO: Dont hardcode voice
|
||||
rate = 1.0 # TODO: Make rate configurable
|
||||
cmdline = list(piper_cmdline()) + ['--model', model_path, '--output-raw', '--length_scale', str(rate)]
|
||||
cmdline = list(piper_cmdline()) + [
|
||||
'--model', model_path, '--output-raw', '--json-input', '--sentence-silence', '0', '--length_scale', str(rate)]
|
||||
self._process.setProgram(cmdline[0])
|
||||
self._process.setArguments(cmdline[1:])
|
||||
self._process.readyReadStandardError.connect(self.piper_stderr_available)
|
||||
self._process.readyReadStandardError.connect(self.piper_stderr_available, type=Qt.ConnectionType.QueuedConnection)
|
||||
self._process.readyReadStandardOutput.connect(self.piper_stdout_available)
|
||||
self._process.bytesWritten.connect(self.bytes_written)
|
||||
# See https://www.riverbankcomputing.com/pipermail/pyqt/2024-September/046002.html
|
||||
# self._process.stateChanged.connect(self._update_status)
|
||||
@ -161,11 +269,20 @@ class Piper(TTSBackend):
|
||||
fmt.setSampleRate(22050) # TODO: Read this from voice JSON
|
||||
fmt.setChannelConfig(QAudioFormat.ChannelConfig.ChannelConfigMono)
|
||||
self._audio_sink = QAudioSink(fmt, self) # TODO: Make audio device configurable
|
||||
self._audio_sink.stateChanged.connect(self.audio_sink_state_changed)
|
||||
self._audio_sink.stateChanged.connect(self._utterances_being_spoken.audio_state_changed)
|
||||
self._process.start()
|
||||
self._audio_sink.start(self._process)
|
||||
self._audio_sink.start(self._utterances_being_spoken)
|
||||
return self._process
|
||||
|
||||
def piper_stdout_available(self) -> None:
|
||||
if self._utterances_being_synthesized:
|
||||
u = self._utterances_being_synthesized[0]
|
||||
while True:
|
||||
ba = self.process.readAll()
|
||||
if not len(ba):
|
||||
break
|
||||
u.audio_data.append(ba)
|
||||
|
||||
def piper_stderr_available(self) -> None:
|
||||
needs_status_update = False
|
||||
if self._process is not None:
|
||||
@ -175,12 +292,13 @@ class Piper(TTSBackend):
|
||||
if m := self._stderr_pat.search(line):
|
||||
which, payload = m.group(1), m.group(2)
|
||||
if which == b'info':
|
||||
if payload.startswith(b'Real-time factor:'):
|
||||
for u in self._utterances_in_flight:
|
||||
if not u.synthesized:
|
||||
u.synthesized = True
|
||||
needs_status_update = True
|
||||
break
|
||||
if payload.startswith(b'Real-time factor:') and self._utterances_being_synthesized:
|
||||
u = self._utterances_being_synthesized.popleft()
|
||||
u.synthesized = True
|
||||
debug(f'Utterance {u.id} synthesized')
|
||||
needs_status_update = True
|
||||
self._utterances_being_spoken.add_utterance(u)
|
||||
self._write_current_utterance()
|
||||
elif which == b'error':
|
||||
self._errors_from_piper.append(payload.decode('utf-8', 'replace'))
|
||||
self._pending_stderr_data = lines[-1]
|
||||
@ -193,9 +311,10 @@ class Piper(TTSBackend):
|
||||
m = '\n'.join(self._errors_from_piper)
|
||||
self._set_error(f'piper process failed with exit code: {self._process.exitCode()} and error messages: {m}')
|
||||
return
|
||||
state = self._audio_sink.state()
|
||||
if self._state is QTextToSpeech.State.Error:
|
||||
return
|
||||
state = self._utterances_being_spoken.audio_state
|
||||
if state is QAudio.State.ActiveState:
|
||||
self._waiting_for_utterance_to_start = False
|
||||
self._set_state(QTextToSpeech.State.Speaking)
|
||||
elif state is QAudio.State.SuspendedState:
|
||||
self._set_state(QTextToSpeech.State.Paused)
|
||||
@ -206,29 +325,23 @@ class Piper(TTSBackend):
|
||||
if self._state is not QTextToSpeech.State.Error:
|
||||
self._set_state(QTextToSpeech.State.Ready)
|
||||
elif state is QAudio.State.IdleState:
|
||||
if not self._waiting_for_utterance_to_start:
|
||||
if self._utterances_in_flight and (u := self._utterances_in_flight[0]) and u.synthesized:
|
||||
self._utterances_in_flight.popleft()
|
||||
if self._utterances_in_flight:
|
||||
self._write_current_utterance()
|
||||
else:
|
||||
self._set_state(QTextToSpeech.State.Ready)
|
||||
if not self._utterances_being_synthesized and not self._utterances_being_spoken:
|
||||
self._set_state(QTextToSpeech.State.Ready)
|
||||
|
||||
    def bytes_written(self, count: int) -> None:
        # Slot for QProcess.bytesWritten: keep streaming the pending utterance
        # payload into piper's stdin. NOTE(review): the parameter shadows the
        # module-level itertools.count import; it is unused here.
        self._write_current_utterance()
||||
def _write_current_utterance(self) -> None:
|
||||
if self._utterances_in_flight:
|
||||
u = self._utterances_in_flight[0]
|
||||
if self._utterances_being_synthesized:
|
||||
u = self._utterances_being_synthesized[0]
|
||||
while len(u.left_to_write):
|
||||
written = self.process.write(u.left_to_write)
|
||||
if written < 0:
|
||||
self._set_error('Failed to write to piper process with error: {self.process.errorString()}')
|
||||
break
|
||||
if not u.started and written:
|
||||
self._waiting_for_utterance_to_start = True
|
||||
u.started = True
|
||||
self.saying.emit(u.start, u.length)
|
||||
debug(f'Utterance {u.id} synthesis started')
|
||||
u.left_to_write = u.left_to_write.last(len(u.left_to_write) - written)
|
||||
|
||||
def audio_sink_state_changed(self, state: QAudio.State) -> None:
|
||||
@ -246,10 +359,10 @@ def develop(): # {{{
|
||||
p = Piper()
|
||||
play_started = False
|
||||
def state_changed(s):
|
||||
print(s, end='\r\n')
|
||||
debug('TTS State:', s)
|
||||
nonlocal play_started
|
||||
if s is QTextToSpeech.State.Error:
|
||||
print(p.error_message(), file=sys.stderr, end='\r\n')
|
||||
debug(p.error_message(), file=sys.stderr)
|
||||
app.exit(1)
|
||||
elif s is QTextToSpeech.State.Speaking:
|
||||
play_started = True
|
||||
@ -267,10 +380,14 @@ def develop(): # {{{
|
||||
elif p.state is QTextToSpeech.State.Paused:
|
||||
p.resume()
|
||||
|
||||
text = "Hello, it is a beautiful day today, isn't it? Yes indeed, it is a very beautiful day!"
|
||||
text = (
|
||||
'First, relatively short sentence. '
|
||||
'Second, much longer sentence which hopefully finishes synthesizing before the first finishes speaking. '
|
||||
'Third, and final short sentence.'
|
||||
)
|
||||
|
||||
def saying(offset, length):
|
||||
print('Saying:', repr(text[offset:offset+length]), end='\r\n')
|
||||
debug('Saying:', repr(text[offset:offset+length]))
|
||||
|
||||
p.state_changed.connect(state_changed)
|
||||
p.saying.connect(saying)
|
||||
|
Loading…
x
Reference in New Issue
Block a user