mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
More work on piper TTS backend
This commit is contained in:
parent
943096857b
commit
b1688f9880
@ -6,11 +6,14 @@ import os
|
||||
import re
|
||||
import sys
|
||||
from collections import deque
|
||||
from dataclasses import dataclass
|
||||
from functools import lru_cache
|
||||
|
||||
from qt.core import QApplication, QAudio, QAudioFormat, QAudioSink, QObject, QProcess, QTextToSpeech, pyqtSignal, sip
|
||||
from qt.core import QApplication, QAudio, QAudioFormat, QAudioSink, QByteArray, QObject, QProcess, QTextToSpeech, sip
|
||||
|
||||
from calibre.constants import bundled_binaries_dir, iswindows
|
||||
from calibre.gui2.tts2.types import TTSBackend
|
||||
from calibre.spell.break_iterator import sentence_positions
|
||||
|
||||
|
||||
@lru_cache(2)
|
||||
@ -26,56 +29,79 @@ def piper_cmdline() -> tuple[str, ...]:
|
||||
return ()
|
||||
|
||||
|
||||
@dataclass
class Utterance:
    """One sentence queued for synthesis by piper.

    Instances are built with keyword arguments through the
    dataclass-generated ``__init__``. No hand-written ``__init__`` may be
    defined here: ``@dataclass`` never replaces an explicit one, and the
    keyword construction would then fail with ``TypeError``.
    """

    # Offset and length of the sentence; reported through the ``saying`` signal
    start: int
    length: int
    # Size in bytes of the encoded sentence, trailing separator included
    payload_size: int
    # Bytes that still have to be handed to the piper process
    left_to_write: QByteArray

    # Read by the status logic to decide when the utterance can be dropped
    synthesized: bool = False
    # Set once the first bytes of the utterance have been written to piper
    started: bool = False
|
||||
|
||||
|
||||
class PiperIPC(QObject):
|
||||
PARAGRAPH_SEPARATOR = '\u2029'
|
||||
UTTERANCE_SEPARATOR = b'\n'
|
||||
|
||||
state_changed = pyqtSignal(QTextToSpeech.State)
|
||||
|
||||
def __init__(self, parent=None):
|
||||
def split_into_utterances(text: str, lang: str = 'en'):
    """Yield an ``Utterance`` for every sentence found in *text*.

    Line breaks are normalized before sentence detection: a run of line-break
    characters holding two or more newlines becomes PARAGRAPH_SEPARATOR, any
    other line-break character becomes a space. The normalization keeps the
    length of the text unchanged, so the ``start`` and ``length`` of every
    utterance index directly into the *text* supplied by the caller.

    :param text: The text to be spoken.
    :param lang: Language code handed on to ``sentence_positions()``.
    """
    def same_length_break(match):
        run = match.group()
        head = PARAGRAPH_SEPARATOR if run.count('\n') > 1 else ' '
        # Pad with spaces (the separator is a single character) so that the
        # offsets of everything after this run do not shift
        return head + ' ' * (len(run) - 1)

    normalized = re.sub(r'[\r\n]+', same_length_break, text)
    for start, length in sentence_positions(normalized, lang):
        sentence = normalized[start:start + length].rstrip()
        if not sentence:
            # Whitespace only: the payload would be nothing but the separator
            continue
        payload = sentence.encode('utf-8')
        ba = QByteArray()
        ba.reserve(len(payload) + 1)
        ba.append(payload)
        ba.append(UTTERANCE_SEPARATOR)
        yield Utterance(payload_size=len(ba), left_to_write=ba, start=start, length=length)
|
||||
|
||||
|
||||
class Piper(TTSBackend):
|
||||
|
||||
engine_name: str = 'piper'
|
||||
|
||||
def __init__(self, engine_name: str = '', parent: QObject|None = None):
    """Create an idle backend.

    No piper process and no audio sink exist yet; both start out as None.
    *engine_name* is accepted but not used in this method.
    """
    super().__init__(parent)

    # Public-facing state and error reporting
    self._state = QTextToSpeech.State.Ready
    self._last_error = ''

    # Parsing of what piper prints on stderr
    self._stderr_pat = re.compile(rb'\[piper\] \[([a-zA-Z0-9_]+?)\] (.+)')
    self._pending_stderr_data = b''
    self._errors_from_piper: list[str] = []

    # Queue of work for the piper process
    self._utterance_id_counter = 0
    self._utterances_in_flight: deque[Utterance] = deque()
    self._write_buf: deque[memoryview] = deque()
    self._waiting_for_utterance_to_start = False

    # Created on demand
    self._process: QProcess | None = None
    self._audio_sink: QAudioSink | None = None

    # Make sure shutdown() runs when the interpreter exits
    atexit.register(self.shutdown)
|
||||
|
||||
def say(self, text: str) -> None:
    """Speak *text*, replacing whatever is currently being spoken.

    Does nothing when an earlier error is recorded in ``self._last_error``.
    Otherwise any speech in progress is stopped, the piper process is waited
    for, and *text* is split into sentence utterances that are fed to piper
    one at a time. A process that fails to start is reported through
    ``_set_error()``.

    :param text: The text to be spoken.
    """
    if self._last_error:
        return
    self.stop()
    if not self.process.waitForStarted():
        cmdline = [self.process.program()] + self.process.arguments()
        if self.process.error() is QProcess.ProcessError.TimedOut:
            self._set_error(f'Timed out waiting for piper process {cmdline} to start')
        else:
            self._set_error(f'Failed to start piper process: {cmdline}')
        return
    # Plain text, one sentence per line: the piper command line in effect does
    # not pass --json-input, so no JSON envelope is written
    self._utterances_in_flight.extend(split_into_utterances(text))  # TODO: Use voice language
    self._waiting_for_utterance_to_start = False
    self._write_current_utterance()
|
||||
|
||||
def pause(self) -> None:
    """Suspend audio output; a no-op while no audio sink exists."""
    sink = self._audio_sink
    if sink is not None:
        sink.suspend()
|
||||
|
||||
def resume(self) -> None:
    """Resume suspended audio output; a no-op while no audio sink exists."""
    sink = self._audio_sink
    if sink is not None:
        sink.resume()
|
||||
|
||||
def stop(self) -> None:
    """Abort any speech in progress.

    Nothing happens when no process exists, or when the backend is Ready with
    no utterances queued. Otherwise the backend is shut down and the
    ``process`` property is read again.
    """
    if self._process is None:
        return
    if self._state is QTextToSpeech.State.Ready and not self._utterances_in_flight:
        return
    self.shutdown()
    # Bare property access, kept for its side effect: the getter builds a new
    # QProcess whenever self._process is None.
    # NOTE(review): presumably shutdown() has discarded the old process by
    # this point - confirm against shutdown().
    self.process
|
||||
|
||||
def shutdown(self) -> None:
|
||||
if self._process is not None:
|
||||
self._audio_sink.stateChanged.disconnect()
|
||||
@ -110,22 +136,19 @@ class PiperIPC(QObject):
|
||||
self._last_error = msg
|
||||
self._set_state(QTextToSpeech.State.Error)
|
||||
|
||||
def _write(self, payload: bytes) -> None:
    """Write *payload* to the piper process, queueing whatever was not accepted.

    A negative result from ``write()`` is reported through ``_set_error()``.
    After a short write the unwritten tail is appended to ``self._write_buf``.

    :param payload: The bytes to send to piper.
    """
    written = self.process.write(payload)
    if written < 0:
        # Must be an f-string, otherwise the placeholder is emitted literally
        self._set_error(f'Failed to write to piper process with error: {self.process.errorString()}')
    elif written < len(payload):
        # Slicing a memoryview keeps the tail without copying it
        self._write_buf.append(memoryview(payload)[written:])
|
||||
|
||||
@property
|
||||
def process(self) -> QProcess:
|
||||
if self._process is None:
|
||||
self._errors_from_piper: list[str] = []
|
||||
self._utterances_in_flight.clear()
|
||||
self._errors_from_piper.clear()
|
||||
self._process = QProcess(self)
|
||||
self._pending_stderr_data = b''
|
||||
self._waiting_for_utterance_to_start = False
|
||||
self._set_state(QTextToSpeech.State.Ready)
|
||||
|
||||
model_path = '/t/en_US-libritts-high.onnx' # TODO: Dont hardcode voice
|
||||
rate = 1.0 # TODO: Make rate configurable
|
||||
cmdline = list(piper_cmdline()) + ['--model', model_path, '--output-raw', '--json-input', '--length_scale', str(rate)]
|
||||
cmdline = list(piper_cmdline()) + ['--model', model_path, '--output-raw', '--length_scale', str(rate)]
|
||||
self._process.setProgram(cmdline[0])
|
||||
self._process.setArguments(cmdline[1:])
|
||||
self._process.readyReadStandardError.connect(self.piper_stderr_available)
|
||||
@ -163,13 +186,6 @@ class PiperIPC(QObject):
|
||||
if needs_status_update:
|
||||
self._update_status()
|
||||
|
||||
@property
def all_synthesized(self) -> bool:
    """Whether every utterance in flight is synthesized (True when none are queued)."""
    return all(u.synthesized for u in self._utterances_in_flight)
|
||||
|
||||
def _update_status(self):
|
||||
if self._process is not None and self._process.state() is QProcess.ProcessState.NotRunning:
|
||||
if self._process.exitStatus() is not QProcess.ExitStatus.NormalExit or self._process.exitCode():
|
||||
@ -178,6 +194,7 @@ class PiperIPC(QObject):
|
||||
return
|
||||
state = self._audio_sink.state()
|
||||
if state is QAudio.State.ActiveState:
|
||||
self._waiting_for_utterance_to_start = False
|
||||
self._set_state(QTextToSpeech.State.Speaking)
|
||||
elif state is QAudio.State.SuspendedState:
|
||||
self._set_state(QTextToSpeech.State.Paused)
|
||||
@ -188,29 +205,36 @@ class PiperIPC(QObject):
|
||||
if self._state is not QTextToSpeech.State.Error:
|
||||
self._set_state(QTextToSpeech.State.Ready)
|
||||
elif state is QAudio.State.IdleState:
|
||||
if self.all_synthesized:
|
||||
self._set_state(QTextToSpeech.State.Ready)
|
||||
if not self._waiting_for_utterance_to_start:
|
||||
if self._utterances_in_flight and (u := self._utterances_in_flight[0]) and u.synthesized:
|
||||
self._utterances_in_flight.popleft()
|
||||
if self._utterances_in_flight:
|
||||
self._write_current_utterance()
|
||||
else:
|
||||
self._set_state(QTextToSpeech.State.Speaking)
|
||||
self._set_state(QTextToSpeech.State.Ready)
|
||||
|
||||
def bytes_written(self, count: int) -> None:
    """Carry on feeding the current utterance to piper.

    :param count: Not used; the amount still pending is tracked on the
        utterance itself.
    """
    self._write_current_utterance()

def _write_current_utterance(self) -> None:
    """Write as much as possible of the first utterance in flight to piper.

    The first successful write marks the utterance as started and emits
    ``saying`` with its offset and length. A failed write is reported through
    ``_set_error()``. Whatever could not be written stays in
    ``left_to_write`` for a later call.
    """
    if not self._utterances_in_flight:
        return
    u = self._utterances_in_flight[0]
    while (remaining := len(u.left_to_write)) > 0:
        written = self.process.write(u.left_to_write)
        if written < 0:
            # Must be an f-string, otherwise the placeholder is emitted literally
            self._set_error(f'Failed to write to piper process with error: {self.process.errorString()}')
            break
        if written == 0:
            # Nothing was accepted: leave the data for the next call instead
            # of spinning in this loop
            break
        if not u.started:
            self._waiting_for_utterance_to_start = True
            u.started = True
            self.saying.emit(u.start, u.length)
        # Keep only the unwritten tail
        u.left_to_write = u.left_to_write.last(remaining - written)
|
||||
|
||||
def audio_sink_state_changed(self, state: QAudio.State) -> None:
    """React to a state change of the audio sink by refreshing the backend status."""
    # *state* is not used here: _update_status() is called without it
    self._update_status()
|
||||
|
||||
|
||||
def develop():
|
||||
def develop(): # {{{
|
||||
import tty
|
||||
|
||||
from qt.core import QSocketNotifier
|
||||
@ -218,12 +242,11 @@ def develop():
|
||||
from calibre.gui2 import must_use_qt
|
||||
must_use_qt()
|
||||
app = QApplication.instance()
|
||||
p = PiperIPC()
|
||||
p = Piper()
|
||||
play_started = False
|
||||
to_play = "Yes indeed, it is a very beautiful day today."
|
||||
def state_changed(s):
|
||||
print(s, end='\r\n')
|
||||
nonlocal play_started, to_play
|
||||
nonlocal play_started
|
||||
if s is QTextToSpeech.State.Error:
|
||||
print(p.error_message(), file=sys.stderr, end='\r\n')
|
||||
app.exit(1)
|
||||
@ -231,10 +254,6 @@ def develop():
|
||||
play_started = True
|
||||
elif s is QTextToSpeech.State.Ready:
|
||||
if play_started:
|
||||
if to_play:
|
||||
p.say(to_play)
|
||||
to_play = ''
|
||||
else:
|
||||
app.quit()
|
||||
|
||||
def input_ready():
|
||||
@ -247,13 +266,19 @@ def develop():
|
||||
elif p.state is QTextToSpeech.State.Paused:
|
||||
p.resume()
|
||||
|
||||
text = "Hello, it is a beautiful day today, isn't it? Yes indeed, it is a very beautiful day!"
|
||||
|
||||
def saying(offset, length):
|
||||
print('Saying:', repr(text[offset:offset+length]), end='\r\n')
|
||||
|
||||
p.state_changed.connect(state_changed)
|
||||
p.saying.connect(saying)
|
||||
attr = tty.setraw(sys.stdin.fileno())
|
||||
os.set_blocking(sys.stdin.fileno(), False)
|
||||
sn = QSocketNotifier(sys.stdin.fileno(), QSocketNotifier.Type.Read, p)
|
||||
sn.activated.connect(input_ready)
|
||||
try:
|
||||
p.say("Hello, it is a beautiful day today, isn't it?")
|
||||
p.say(text)
|
||||
app.exec()
|
||||
finally:
|
||||
import termios
|
||||
@ -262,3 +287,4 @@ def develop():
|
||||
|
||||
if __name__ == '__main__':
|
||||
develop()
|
||||
# }}}
|
||||
|
@ -10,6 +10,7 @@ from calibre.utils.icu import _icu
|
||||
from calibre.utils.localization import lang_as_iso639_1
|
||||
|
||||
_iterators = {}
|
||||
_sentence_iterators = {}
|
||||
_lock = Lock()
|
||||
|
||||
|
||||
@ -20,6 +21,13 @@ def get_iterator(lang):
|
||||
return it
|
||||
|
||||
|
||||
def get_sentence_iterator(lang):
    """Return the cached sentence break iterator for *lang*, creating it on first use.

    The iterator is built for the ISO 639-1 form of *lang* when
    ``lang_as_iso639_1()`` yields one, and for *lang* itself otherwise.
    """
    try:
        return _sentence_iterators[lang]
    except KeyError:
        pass
    locale = lang_as_iso639_1(lang) or lang
    breaker = _icu.BreakIterator(_icu.UBRK_SENTENCE, locale)
    _sentence_iterators[lang] = breaker
    return breaker
|
||||
|
||||
|
||||
def split_into_words(text, lang='en'):
|
||||
with _lock:
|
||||
it = get_iterator(lang)
|
||||
@ -34,6 +42,20 @@ def split_into_words_and_positions(text, lang='en'):
|
||||
return it.split2()
|
||||
|
||||
|
||||
def sentence_positions(text, lang='en'):
    """Return what the sentence iterator's ``split2()`` reports for *text*.

    The shared iterator for *lang* is loaded with *text* and queried while
    ``_lock`` is held.
    """
    with _lock:
        breaker = get_sentence_iterator(lang)
        breaker.set_text(text)
        positions = breaker.split2()
    return positions
|
||||
|
||||
|
||||
def split_into_sentences(text, lang='en'):
    """Return the sentences of *text* as a tuple of strings.

    Each (position, size) pair reported by the sentence iterator for *lang*
    is turned into the matching slice of *text*; all of it happens while
    ``_lock`` is held.
    """
    with _lock:
        breaker = get_sentence_iterator(lang)
        breaker.set_text(text)
        sentences = [text[begin:begin + size] for begin, size in breaker.split2()]
    return tuple(sentences)
|
||||
|
||||
|
||||
def index_of(needle, haystack, lang='en'):
|
||||
with _lock:
|
||||
it = get_iterator(lang)
|
||||
|
Loading…
x
Reference in New Issue
Block a user