mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
More work on piper TTS backend
This commit is contained in:
parent
943096857b
commit
b1688f9880
@ -6,11 +6,14 @@ import os
|
|||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
from collections import deque
|
from collections import deque
|
||||||
|
from dataclasses import dataclass
|
||||||
from functools import lru_cache
|
from functools import lru_cache
|
||||||
|
|
||||||
from qt.core import QApplication, QAudio, QAudioFormat, QAudioSink, QObject, QProcess, QTextToSpeech, pyqtSignal, sip
|
from qt.core import QApplication, QAudio, QAudioFormat, QAudioSink, QByteArray, QObject, QProcess, QTextToSpeech, sip
|
||||||
|
|
||||||
from calibre.constants import bundled_binaries_dir, iswindows
|
from calibre.constants import bundled_binaries_dir, iswindows
|
||||||
|
from calibre.gui2.tts2.types import TTSBackend
|
||||||
|
from calibre.spell.break_iterator import sentence_positions
|
||||||
|
|
||||||
|
|
||||||
@lru_cache(2)
|
@lru_cache(2)
|
||||||
@ -26,56 +29,79 @@ def piper_cmdline() -> tuple[str, ...]:
|
|||||||
return ()
|
return ()
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class Utterance:
    """One sentence queued for synthesis by the piper process."""

    # Offset and length of the sentence, as produced by sentence_positions()
    # for the text the utterance was split from.
    start: int
    length: int
    # Size in bytes of the encoded sentence plus its trailing separator.
    payload_size: int
    # Bytes of the payload that still have to be written to piper's stdin.
    left_to_write: QByteArray

    # NOTE(review): presumably set once piper has finished producing audio for
    # this utterance; the code that sets it is outside this view -- confirm.
    synthesized: bool = False
    # Set when the first bytes of the payload have been written to piper.
    started: bool = False
|
|
||||||
|
|
||||||
|
|
||||||
class PiperIPC(QObject):
|
PARAGRAPH_SEPARATOR = '\u2029'
|
||||||
|
UTTERANCE_SEPARATOR = b'\n'
|
||||||
|
|
||||||
state_changed = pyqtSignal(QTextToSpeech.State)
|
|
||||||
|
|
||||||
def __init__(self, parent=None):
|
def split_into_utterances(text: str, lang: str = 'en'):
    """Yield one Utterance per sentence found in ``text``.

    Carriage returns are dropped, runs of two or more newlines become a single
    PARAGRAPH_SEPARATOR and every remaining newline becomes a space, so that
    no sentence contains the newline used as UTTERANCE_SEPARATOR.

    :param text: The text to speak.
    :param lang: Language passed to sentence_positions() for sentence breaking.
    """
    # NOTE(review): the offsets yielded below index the *normalised* text. If
    # the input contained '\r' or blank lines they no longer line up with the
    # caller's original string -- confirm whether consumers of ``start`` and
    # ``length`` rely on original offsets.
    text = re.sub(r'\n{2,}', PARAGRAPH_SEPARATOR, text.replace('\r', '')).replace('\n', ' ')
    for start, length in sentence_positions(text, lang):
        # No newline can survive the normalisation above, so the sentence only
        # needs its trailing whitespace stripped.
        sentence = text[start:start+length].rstrip()
        payload = sentence.encode('utf-8')
        ba = QByteArray()
        ba.reserve(len(payload) + 1)  # room for the payload and the separator
        ba.append(payload)
        ba.append(UTTERANCE_SEPARATOR)
        yield Utterance(payload_size=len(ba), left_to_write=ba, start=start, length=length)
|
||||||
|
|
||||||
|
|
||||||
|
class Piper(TTSBackend):
|
||||||
|
|
||||||
|
engine_name: str = 'piper'
|
||||||
|
|
||||||
|
def __init__(self, engine_name: str = '', parent: QObject|None = None):
    """Set up an idle backend; no piper process is started here.

    :param engine_name: Accepted but not used by this constructor.
    :param parent: Parent object forwarded to the base class constructor.
    """
    super().__init__(parent)
    # Both are created on demand, see the ``process`` property.
    self._process: QProcess | None = None
    self._audio_sink: QAudioSink | None = None
    # Sentences queued for speaking; the left-most one is the current one.
    self._utterances_in_flight: deque[Utterance] = deque()
    self._state = QTextToSpeech.State.Ready
    self._last_error = ''
    self._errors_from_piper: list[str] = []
    self._pending_stderr_data = b''
    # True between writing the start of an utterance and the audio sink
    # becoming active for it.
    self._waiting_for_utterance_to_start = False
    # Matches piper log lines of the form "[piper] [level] message".
    self._stderr_pat = re.compile(rb'\[piper\] \[([a-zA-Z0-9_]+?)\] (.+)')
    atexit.register(self.shutdown)
|
||||||
|
|
||||||
def say(self, text: str) -> None:
    """Stop whatever is being spoken and start speaking ``text``.

    Does nothing if an error has already been recorded. If the piper process
    fails to start, the error is recorded via ``_set_error`` and nothing is
    queued.

    :param text: The text to speak; it is split into one utterance per sentence.
    """
    if self._last_error:
        return
    self.stop()
    if not self.process.waitForStarted():
        cmdline = [self.process.program()] + self.process.arguments()
        if self.process.error() is QProcess.ProcessError.TimedOut:
            self._set_error(f'Timed out waiting for piper process {cmdline} to start')
        else:
            self._set_error(f'Failed to start piper process: {cmdline}')
        return
    self._utterances_in_flight.extend(split_into_utterances(text))  # TODO: Use voice language
    self._waiting_for_utterance_to_start = False
    self._write_current_utterance()
|
|
||||||
|
|
||||||
def pause(self) -> None:
    """Suspend audio output; a no-op when no audio sink exists."""
    sink = self._audio_sink
    if sink is None:
        return
    sink.suspend()
|
||||||
|
|
||||||
def resume(self) -> None:
    """Resume suspended audio output; a no-op when no audio sink exists."""
    sink = self._audio_sink
    if sink is None:
        return
    sink.resume()
|
||||||
|
|
||||||
|
def stop(self) -> None:
    """Abort any speech in progress.

    Nothing happens when no process exists, or when the backend is Ready with
    an empty utterance queue. Otherwise the backend is shut down and the
    ``process`` property is read again.
    """
    if self._process is None:
        return
    if self._state is not QTextToSpeech.State.Ready or self._utterances_in_flight:
        self.shutdown()
        # The property creates a new process when none exists.
        # NOTE(review): this assumes shutdown() resets self._process to None;
        # its body is not fully visible here -- confirm.
        self.process
|
||||||
|
|
||||||
def shutdown(self) -> None:
|
def shutdown(self) -> None:
|
||||||
if self._process is not None:
|
if self._process is not None:
|
||||||
self._audio_sink.stateChanged.disconnect()
|
self._audio_sink.stateChanged.disconnect()
|
||||||
@ -110,22 +136,19 @@ class PiperIPC(QObject):
|
|||||||
self._last_error = msg
|
self._last_error = msg
|
||||||
self._set_state(QTextToSpeech.State.Error)
|
self._set_state(QTextToSpeech.State.Error)
|
||||||
|
|
||||||
def _write(self, payload: bytes) -> None:
|
|
||||||
written = self.process.write(payload)
|
|
||||||
if written < 0:
|
|
||||||
self._set_error('Failed to write to piper process with error: {self.process.errorString()}')
|
|
||||||
elif written < len(payload):
|
|
||||||
self._write_buf.append(memoryview(payload)[written:])
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def process(self) -> QProcess:
|
def process(self) -> QProcess:
|
||||||
if self._process is None:
|
if self._process is None:
|
||||||
self._errors_from_piper: list[str] = []
|
self._utterances_in_flight.clear()
|
||||||
|
self._errors_from_piper.clear()
|
||||||
self._process = QProcess(self)
|
self._process = QProcess(self)
|
||||||
self._pending_stderr_data = b''
|
self._pending_stderr_data = b''
|
||||||
|
self._waiting_for_utterance_to_start = False
|
||||||
|
self._set_state(QTextToSpeech.State.Ready)
|
||||||
|
|
||||||
model_path = '/t/en_US-libritts-high.onnx' # TODO: Dont hardcode voice
|
model_path = '/t/en_US-libritts-high.onnx' # TODO: Dont hardcode voice
|
||||||
rate = 1.0 # TODO: Make rate configurable
|
rate = 1.0 # TODO: Make rate configurable
|
||||||
cmdline = list(piper_cmdline()) + ['--model', model_path, '--output-raw', '--json-input', '--length_scale', str(rate)]
|
cmdline = list(piper_cmdline()) + ['--model', model_path, '--output-raw', '--length_scale', str(rate)]
|
||||||
self._process.setProgram(cmdline[0])
|
self._process.setProgram(cmdline[0])
|
||||||
self._process.setArguments(cmdline[1:])
|
self._process.setArguments(cmdline[1:])
|
||||||
self._process.readyReadStandardError.connect(self.piper_stderr_available)
|
self._process.readyReadStandardError.connect(self.piper_stderr_available)
|
||||||
@ -163,13 +186,6 @@ class PiperIPC(QObject):
|
|||||||
if needs_status_update:
|
if needs_status_update:
|
||||||
self._update_status()
|
self._update_status()
|
||||||
|
|
||||||
@property
|
|
||||||
def all_synthesized(self) -> bool:
|
|
||||||
for u in self._utterances_in_flight:
|
|
||||||
if not u.synthesized:
|
|
||||||
return False
|
|
||||||
return True
|
|
||||||
|
|
||||||
def _update_status(self):
|
def _update_status(self):
|
||||||
if self._process is not None and self._process.state() is QProcess.ProcessState.NotRunning:
|
if self._process is not None and self._process.state() is QProcess.ProcessState.NotRunning:
|
||||||
if self._process.exitStatus() is not QProcess.ExitStatus.NormalExit or self._process.exitCode():
|
if self._process.exitStatus() is not QProcess.ExitStatus.NormalExit or self._process.exitCode():
|
||||||
@ -178,6 +194,7 @@ class PiperIPC(QObject):
|
|||||||
return
|
return
|
||||||
state = self._audio_sink.state()
|
state = self._audio_sink.state()
|
||||||
if state is QAudio.State.ActiveState:
|
if state is QAudio.State.ActiveState:
|
||||||
|
self._waiting_for_utterance_to_start = False
|
||||||
self._set_state(QTextToSpeech.State.Speaking)
|
self._set_state(QTextToSpeech.State.Speaking)
|
||||||
elif state is QAudio.State.SuspendedState:
|
elif state is QAudio.State.SuspendedState:
|
||||||
self._set_state(QTextToSpeech.State.Paused)
|
self._set_state(QTextToSpeech.State.Paused)
|
||||||
@ -188,29 +205,36 @@ class PiperIPC(QObject):
|
|||||||
if self._state is not QTextToSpeech.State.Error:
|
if self._state is not QTextToSpeech.State.Error:
|
||||||
self._set_state(QTextToSpeech.State.Ready)
|
self._set_state(QTextToSpeech.State.Ready)
|
||||||
elif state is QAudio.State.IdleState:
|
elif state is QAudio.State.IdleState:
|
||||||
if self.all_synthesized:
|
if not self._waiting_for_utterance_to_start:
|
||||||
self._set_state(QTextToSpeech.State.Ready)
|
if self._utterances_in_flight and (u := self._utterances_in_flight[0]) and u.synthesized:
|
||||||
else:
|
self._utterances_in_flight.popleft()
|
||||||
self._set_state(QTextToSpeech.State.Speaking)
|
if self._utterances_in_flight:
|
||||||
|
self._write_current_utterance()
|
||||||
|
else:
|
||||||
|
self._set_state(QTextToSpeech.State.Ready)
|
||||||
|
|
||||||
def bytes_written(self, count: int) -> None:
    """Carry on writing the current utterance.

    NOTE(review): presumably connected to the piper process's bytesWritten
    signal; the connection is outside this view -- confirm.

    :param count: Accepted for the slot signature; not used.
    """
    self._write_current_utterance()

def _write_current_utterance(self) -> None:
    """Write as much as possible of the left-most queued utterance to piper.

    The first successful write of an utterance marks it as started, sets
    ``_waiting_for_utterance_to_start`` and emits ``saying`` with the
    utterance's start and length. A failed write is recorded via
    ``_set_error`` and ends the attempt.
    """
    if not self._utterances_in_flight:
        return
    u = self._utterances_in_flight[0]
    while len(u.left_to_write):
        written = self.process.write(u.left_to_write)
        if written < 0:
            # The f prefix is required, otherwise the placeholder is emitted
            # literally instead of the process's error string.
            self._set_error(f'Failed to write to piper process with error: {self.process.errorString()}')
            break
        if written == 0:
            # Nothing was accepted: stop rather than spin in this loop; a
            # later call retries with the same remaining bytes.
            break
        if not u.started:
            self._waiting_for_utterance_to_start = True
            u.started = True
            self.saying.emit(u.start, u.length)
        # Keep only the tail that has not been written yet.
        u.left_to_write = u.left_to_write.last(len(u.left_to_write) - written)
|
||||||
|
|
||||||
def audio_sink_state_changed(self, state: QAudio.State) -> None:
    """Recompute the backend state after the audio sink changed state.

    :param state: The new sink state; unused, ``_update_status`` is called
        without it.
    """
    self._update_status()
|
||||||
|
|
||||||
|
|
||||||
def develop():
|
def develop(): # {{{
|
||||||
import tty
|
import tty
|
||||||
|
|
||||||
from qt.core import QSocketNotifier
|
from qt.core import QSocketNotifier
|
||||||
@ -218,12 +242,11 @@ def develop():
|
|||||||
from calibre.gui2 import must_use_qt
|
from calibre.gui2 import must_use_qt
|
||||||
must_use_qt()
|
must_use_qt()
|
||||||
app = QApplication.instance()
|
app = QApplication.instance()
|
||||||
p = PiperIPC()
|
p = Piper()
|
||||||
play_started = False
|
play_started = False
|
||||||
to_play = "Yes indeed, it is a very beautiful day today."
|
|
||||||
def state_changed(s):
|
def state_changed(s):
|
||||||
print(s, end='\r\n')
|
print(s, end='\r\n')
|
||||||
nonlocal play_started, to_play
|
nonlocal play_started
|
||||||
if s is QTextToSpeech.State.Error:
|
if s is QTextToSpeech.State.Error:
|
||||||
print(p.error_message(), file=sys.stderr, end='\r\n')
|
print(p.error_message(), file=sys.stderr, end='\r\n')
|
||||||
app.exit(1)
|
app.exit(1)
|
||||||
@ -231,11 +254,7 @@ def develop():
|
|||||||
play_started = True
|
play_started = True
|
||||||
elif s is QTextToSpeech.State.Ready:
|
elif s is QTextToSpeech.State.Ready:
|
||||||
if play_started:
|
if play_started:
|
||||||
if to_play:
|
app.quit()
|
||||||
p.say(to_play)
|
|
||||||
to_play = ''
|
|
||||||
else:
|
|
||||||
app.quit()
|
|
||||||
|
|
||||||
def input_ready():
|
def input_ready():
|
||||||
q = sys.stdin.buffer.read()
|
q = sys.stdin.buffer.read()
|
||||||
@ -247,13 +266,19 @@ def develop():
|
|||||||
elif p.state is QTextToSpeech.State.Paused:
|
elif p.state is QTextToSpeech.State.Paused:
|
||||||
p.resume()
|
p.resume()
|
||||||
|
|
||||||
|
text = "Hello, it is a beautiful day today, isn't it? Yes indeed, it is a very beautiful day!"
|
||||||
|
|
||||||
|
def saying(offset, length):
|
||||||
|
print('Saying:', repr(text[offset:offset+length]), end='\r\n')
|
||||||
|
|
||||||
p.state_changed.connect(state_changed)
|
p.state_changed.connect(state_changed)
|
||||||
|
p.saying.connect(saying)
|
||||||
attr = tty.setraw(sys.stdin.fileno())
|
attr = tty.setraw(sys.stdin.fileno())
|
||||||
os.set_blocking(sys.stdin.fileno(), False)
|
os.set_blocking(sys.stdin.fileno(), False)
|
||||||
sn = QSocketNotifier(sys.stdin.fileno(), QSocketNotifier.Type.Read, p)
|
sn = QSocketNotifier(sys.stdin.fileno(), QSocketNotifier.Type.Read, p)
|
||||||
sn.activated.connect(input_ready)
|
sn.activated.connect(input_ready)
|
||||||
try:
|
try:
|
||||||
p.say("Hello, it is a beautiful day today, isn't it?")
|
p.say(text)
|
||||||
app.exec()
|
app.exec()
|
||||||
finally:
|
finally:
|
||||||
import termios
|
import termios
|
||||||
@ -262,3 +287,4 @@ def develop():
|
|||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
develop()
|
develop()
|
||||||
|
# }}}
|
||||||
|
@ -10,6 +10,7 @@ from calibre.utils.icu import _icu
|
|||||||
from calibre.utils.localization import lang_as_iso639_1
|
from calibre.utils.localization import lang_as_iso639_1
|
||||||
|
|
||||||
_iterators = {}
|
_iterators = {}
|
||||||
|
_sentence_iterators = {}
|
||||||
_lock = Lock()
|
_lock = Lock()
|
||||||
|
|
||||||
|
|
||||||
@ -20,6 +21,13 @@ def get_iterator(lang):
|
|||||||
return it
|
return it
|
||||||
|
|
||||||
|
|
||||||
|
def get_sentence_iterator(lang):
    """Return the cached sentence break iterator for ``lang``, creating it on first use.

    The cache is keyed by ``lang`` as given. The iterator itself is built with
    ``lang_as_iso639_1(lang)``, falling back to ``lang`` when that is falsy.
    This function takes no lock itself; the callers visible in this file call
    it while holding ``_lock``.
    """
    try:
        return _sentence_iterators[lang]
    except KeyError:
        it = _sentence_iterators[lang] = _icu.BreakIterator(_icu.UBRK_SENTENCE, lang_as_iso639_1(lang) or lang)
        return it
|
||||||
|
|
||||||
|
|
||||||
def split_into_words(text, lang='en'):
|
def split_into_words(text, lang='en'):
|
||||||
with _lock:
|
with _lock:
|
||||||
it = get_iterator(lang)
|
it = get_iterator(lang)
|
||||||
@ -34,6 +42,20 @@ def split_into_words_and_positions(text, lang='en'):
|
|||||||
return it.split2()
|
return it.split2()
|
||||||
|
|
||||||
|
|
||||||
|
def sentence_positions(text, lang='en'):
    """Return the sentence boundaries of ``text`` as reported by the iterator's ``split2()``.

    Callers in this codebase unpack each item as a ``(start, length)`` pair.
    The shared iterator is used under ``_lock``.
    """
    with _lock:
        sentence_iterator = get_sentence_iterator(lang)
        sentence_iterator.set_text(text)
        return sentence_iterator.split2()
|
||||||
|
|
||||||
|
|
||||||
|
def split_into_sentences(text, lang='en'):
    """Return a tuple with the sentences of ``text``, in order.

    Each sentence is the slice of ``text`` described by one ``(position,
    size)`` pair from the iterator's ``split2()``. The tuple is built while
    ``_lock`` is held, because the iterator is shared.
    """
    with _lock:
        sentence_iterator = get_sentence_iterator(lang)
        sentence_iterator.set_text(text)
        return tuple(text[pos:pos+size] for pos, size in sentence_iterator.split2())
|
||||||
|
|
||||||
|
|
||||||
def index_of(needle, haystack, lang='en'):
|
def index_of(needle, haystack, lang='en'):
|
||||||
with _lock:
|
with _lock:
|
||||||
it = get_iterator(lang)
|
it = get_iterator(lang)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user