diff --git a/src/calibre/gui2/tts/download.py b/src/calibre/gui2/tts/download.py index 0778351c0e..55ac5b5c41 100644 --- a/src/calibre/gui2/tts/download.py +++ b/src/calibre/gui2/tts/download.py @@ -171,9 +171,9 @@ def download_resources( from calibre import browser print(title) print(message) + br = browser() for url, (path, name) in urls.items(): print(_('Downloading {}...').format(name)) - br = browser() data = br.open_novisit(url).read() with open(path, 'wb') as f: f.write(data) diff --git a/src/calibre/gui2/tts/piper.py b/src/calibre/gui2/tts/piper.py index 09291e8075..50d67e1104 100644 --- a/src/calibre/gui2/tts/piper.py +++ b/src/calibre/gui2/tts/piper.py @@ -9,11 +9,14 @@ import sys from collections import deque from contextlib import suppress from dataclasses import dataclass +from functools import lru_cache from itertools import count +from queue import Empty, Queue +from threading import Event from time import monotonic +from typing import BinaryIO, Iterable, Iterator from qt.core import ( - QApplication, QAudio, QAudioFormat, QAudioSink, @@ -32,7 +35,7 @@ from qt.core import ( from calibre.constants import cache_dir, is_debugging, piper_cmdline from calibre.gui2 import error_dialog -from calibre.gui2.tts.types import EngineSpecificSettings, Quality, TTSBackend, Voice, widget_parent +from calibre.gui2.tts.types import TTS_EMBEDED_CONFIG, EngineSpecificSettings, Quality, TTSBackend, Voice, widget_parent from calibre.spell.break_iterator import PARAGRAPH_SEPARATOR, split_into_sentences_for_tts from calibre.utils.localization import canonicalize_lang, get_lang from calibre.utils.resources import get_path as P @@ -46,6 +49,97 @@ def debug(*a, **kw): print(f'[{monotonic() - debug.first:.2f}]', *a, **kw) +def audio_format(audio_rate: int = 22050) -> QAudioFormat: + fmt = QAudioFormat() + fmt.setSampleFormat(QAudioFormat.SampleFormat.Int16) + fmt.setSampleRate(audio_rate) + fmt.setChannelConfig(QAudioFormat.ChannelConfig.ChannelConfigMono) + return fmt + + +def piper_process_metadata(model_path, config_path, s: EngineSpecificSettings, voice: Voice) -> tuple[int, list[str]]: + if not model_path: + raise Exception('Could not download voice data') + if 'metadata' not in voice.engine_data: + with open(config_path) as f: + voice.engine_data['metadata'] = json.load(f) + audio_rate = voice.engine_data['metadata']['audio']['sample_rate'] + length_scale = max(0.1, 1 + -1 * s.rate) # maps -1 to 1 to 2 to 0.1 + cmdline = list(piper_cmdline()) + [ + '--model', model_path, '--config', config_path, '--output-raw', '--json-input', + '--sentence-silence', str(s.sentence_delay), '--length_scale', str(length_scale)] + if is_debugging(): + cmdline.append('--debug') + return audio_rate, cmdline + + +def piper_cache_dir() -> str: + return os.path.join(cache_dir(), 'piper-voices') + + +def paths_for_voice(voice: Voice) -> tuple[str, str]: + fname = voice.engine_data['model_filename'] + model_path = os.path.join(piper_cache_dir(), fname) + config_path = os.path.join(os.path.dirname(model_path), fname + '.json') + return model_path, config_path + + +def load_voice_metadata() -> tuple[dict[str, Voice], tuple[Voice, ...], dict[str, Voice]]: + d = json.loads(P('piper-voices.json', data=True)) + ans = [] + lang_voices_map = {} + _voice_name_map = {} + downloaded = set() + with suppress(OSError): + downloaded = set(os.listdir(piper_cache_dir())) + for bcp_code, voice_map in d['lang_map'].items(): + lang, sep, country = bcp_code.partition('_') + lang = canonicalize_lang(lang) or lang + voices_for_lang = lang_voices_map.setdefault(lang, []) + for voice_name, qual_map in voice_map.items(): + best_qual = voice = None + for qual, e in qual_map.items(): + q = Quality.from_piper_quality(qual) + if best_qual is None or q.value < best_qual.value: + best_qual = q + mf = f'{bcp_code}-{voice_name}-{qual}.onnx' + voice = Voice(bcp_code + ':' + voice_name, lang, country, human_name=voice_name, quality=q, engine_data={ + 'model_url': e['model'], 'config_url': e['config'], + 'model_filename': mf, 'is_downloaded': mf in downloaded, + }) + if voice: + ans.append(voice) + _voice_name_map[voice.name] = voice + voices_for_lang.append(voice) + _voices = tuple(ans) + _voice_for_lang = {} + for lang, voices in lang_voices_map.items(): + voices.sort(key=lambda v: v.quality.value) + _voice_for_lang[lang] = voices[0] + if lang == 'eng': + for v in voices: + if v.human_name == 'libritts': + _voice_for_lang[lang] = v + break + return _voice_name_map, _voices, _voice_for_lang + + +def download_voice(voice: Voice, download_even_if_exists: bool = False, parent: QObject | None = None, headless: bool = False) -> tuple[str, str]: + model_path, config_path = paths_for_voice(voice) + if os.path.exists(model_path) and os.path.exists(config_path): + if not download_even_if_exists: + return model_path, config_path + os.makedirs(os.path.dirname(model_path), exist_ok=True) + from calibre.gui2.tts.download import download_resources + ok = download_resources(_('Downloading voice for Read aloud'), _('Downloading neural network for the {} voice').format(voice.human_name), { + voice.engine_data['model_url']: (model_path, _('Neural network data')), + voice.engine_data['config_url']: (config_path, _('Neural network metadata')), + }, parent=widget_parent(parent), headless=headless, + ) + voice.engine_data['is_downloaded'] = bool(ok) + return (model_path, config_path) if ok else ('', '') + + @dataclass class Utterance: id: int @@ -156,13 +250,34 @@ def split_into_utterances(text: str, counter: count, lang: str = 'en'): yield u +@lru_cache(2) +def stderr_pat(): + return re.compile(rb'\[piper\] \[([a-zA-Z0-9_]+?)\] (.+)') + + +def detect_end_of_data(data: bytes, callback): + lines = data.split(b'\n') + for line in lines[:-1]: + if m := stderr_pat().search(line): + which, payload = m.group(1), m.group(2) + if which == b'info': + debug(f'[piper-info] {payload.decode("utf-8", "replace")}') + if payload.startswith(b'Real-time factor:'): + callback(True, None) + elif which == b'error': + callback(False, payload.decode('utf-8', 'replace')) + elif which == b'debug': + debug(f'[piper-debug] {payload.decode("utf-8", "replace")}') + return lines[-1] + + class Piper(TTSBackend): engine_name: str = 'piper' filler_char: str = PARAGRAPH_SEPARATOR _synthesis_done = pyqtSignal() - def __init__(self, engine_name: str = '', parent: QObject|None = None): + def __init__(self, engine_name: str = '', parent: QObject | None = None): super().__init__(parent) self._process: QProcess | None = None self._audio_sink: QAudioSink | None = None @@ -179,7 +294,6 @@ class Piper(TTSBackend): self._errors_from_piper: list[str] = [] self._pending_stderr_data = b'' - self._stderr_pat = re.compile(rb'\[piper\] \[([a-zA-Z0-9_]+?)\] (.+)') self._synthesis_done.connect(self._utterance_synthesized, type=Qt.ConnectionType.QueuedConnection) atexit.register(self.shutdown) @@ -188,16 +302,21 @@ class Piper(TTSBackend): self._load_voice_metadata() return {'': self._voices} - def say(self, text: str) -> None: - if self._last_error: - return - self.stop() + def _wait_for_process_to_start(self) -> bool: if not self.process.waitForStarted(): cmdline = [self.process.program()] + self.process.arguments() if self.process.error() is QProcess.ProcessError.TimedOut: self._set_error(f'Timed out waiting for piper process {cmdline} to start') else: self._set_error(f'Failed to start piper process: {cmdline}') + return False + return True + + def say(self, text: str) -> None: + if self._last_error: + return + self.stop() + if not self._wait_for_process_to_start(): return lang = 'en' if self._current_voice and self._current_voice.language_code: @@ -265,17 +384,10 @@ class Piper(TTSBackend): try: self._load_voice_metadata() s = EngineSpecificSettings.create_from_config(self.engine_name) - length_scale = max(0.1, 1 + -1 * s.rate) # maps -1 to 1 to 2 to 0.1 voice = self._voice_name_map.get(s.voice_name) or self._default_voice model_path, config_path = self._ensure_voice_is_downloaded(voice) except AttributeError as e: raise Exception(str(e)) from e - if not model_path: - raise Exception('Could not download voice data') - if 'metadata' not in voice.engine_data: - with open(config_path) as f: - voice.engine_data['metadata'] = json.load(f) - audio_rate = voice.engine_data['metadata']['audio']['sample_rate'] self._current_voice = voice self._utterances_being_spoken.clear() self._utterances_being_synthesized.clear() @@ -284,11 +396,7 @@ class Piper(TTSBackend): self._pending_stderr_data = b'' self._set_state(QTextToSpeech.State.Ready) - cmdline = list(piper_cmdline()) + [ - '--model', model_path, '--config', config_path, '--output-raw', '--json-input', - '--sentence-silence', str(s.sentence_delay), '--length_scale', str(length_scale)] - if is_debugging(): - cmdline.append('--debug') + audio_rate, cmdline = piper_process_metadata(model_path, config_path, s, voice) self._process.setProgram(cmdline[0]) self._process.setArguments(cmdline[1:]) debug('Running piper:', cmdline) @@ -296,10 +404,7 @@ class Piper(TTSBackend): self._process.readyReadStandardOutput.connect(self.piper_stdout_available) self._process.bytesWritten.connect(self.bytes_written) self._process.stateChanged.connect(self._update_status) - fmt = QAudioFormat() - fmt.setSampleFormat(QAudioFormat.SampleFormat.Int16) - fmt.setSampleRate(audio_rate) - fmt.setChannelConfig(QAudioFormat.ChannelConfig.ChannelConfigMono) + fmt = audio_format(audio_rate) dev = None if s.audio_device_id: for q in QMediaDevices.audioOutputs(): @@ -332,20 +437,14 @@ class Piper(TTSBackend): def piper_stderr_available(self) -> None: if self._process is not None: + def callback(ok, payload): + if ok: + if self._utterances_being_synthesized: + self._synthesis_done.emit() + else: + self._errors_from_piper.append(payload.decode('utf-8', 'replace')) data = self._pending_stderr_data + bytes(self._process.readAllStandardError()) - lines = data.split(b'\n') - for line in lines[:-1]: - if m := self._stderr_pat.search(line): - which, payload = m.group(1), m.group(2) - if which == b'info': - debug(f'[piper-info] {payload.decode("utf-8", "replace")}') - if payload.startswith(b'Real-time factor:') and self._utterances_being_synthesized: - self._synthesis_done.emit() - elif which == b'error': - self._errors_from_piper.append(payload.decode('utf-8', 'replace')) - elif which == b'debug': - debug(f'[piper-debug] {payload.decode("utf-8", "replace")}') - self._pending_stderr_data = lines[-1] + self._pending_stderr_data = detect_end_of_data(data, callback) def _utterance_synthesized(self): self.piper_stdout_available() # just in case @@ -402,42 +501,7 @@ class Piper(TTSBackend): def _load_voice_metadata(self) -> None: if self._voices is not None: return - d = json.loads(P('piper-voices.json', data=True)) - ans = [] - lang_voices_map = {} - self._voice_name_map = {} - downloaded = set() - with suppress(OSError): - downloaded = set(os.listdir(self.cache_dir)) - for bcp_code, voice_map in d['lang_map'].items(): - lang, sep, country = bcp_code.partition('_') - lang = canonicalize_lang(lang) or lang - voices_for_lang = lang_voices_map.setdefault(lang, []) - for voice_name, qual_map in voice_map.items(): - best_qual = voice = None - for qual, e in qual_map.items(): - q = Quality.from_piper_quality(qual) - if best_qual is None or q.value < best_qual.value: - best_qual = q - mf = f'{bcp_code}-{voice_name}-{qual}.onnx' - voice = Voice(bcp_code + ':' + voice_name, lang, country, human_name=voice_name, quality=q, engine_data={ - 'model_url': e['model'], 'config_url': e['config'], - 'model_filename': mf, 'is_downloaded': mf in downloaded, - }) - if voice: - ans.append(voice) - self._voice_name_map[voice.name] = voice - voices_for_lang.append(voice) - self._voices = tuple(ans) - self._voice_for_lang = {} - for lang, voices in lang_voices_map.items(): - voices.sort(key=lambda v: v.quality.value) - self._voice_for_lang[lang] = voices[0] - if lang == 'eng': - for v in voices: - if v.human_name == 'libritts': - self._voice_for_lang[lang] = v - break + self._voice_name_map, self._voices, self._voice_for_lang = load_voice_metadata() @property def _default_voice(self) -> Voice: @@ -448,18 +512,12 @@ class Piper(TTSBackend): @property def cache_dir(self) -> str: - return os.path.join(cache_dir(), 'piper-voices') - - def _paths_for_voice(self, voice: Voice) -> tuple[str, str]: - fname = voice.engine_data['model_filename'] - model_path = os.path.join(self.cache_dir, fname) - config_path = os.path.join(os.path.dirname(model_path), fname + '.json') - return model_path, config_path + return piper_cache_dir() def is_voice_downloaded(self, v: Voice) -> bool: if not v or not v.name: v = self._default_voice - for path in self._paths_for_voice(v): + for path in paths_for_voice(v): if not os.path.exists(path): return False return True @@ -467,25 +525,13 @@ class Piper(TTSBackend): def delete_voice(self, v: Voice) -> None: if not v.name: v = self._default_voice - for path in self._paths_for_voice(v): + for path in paths_for_voice(v): with suppress(FileNotFoundError): os.remove(path) v.engine_data['is_downloaded'] = False def _download_voice(self, voice: Voice, download_even_if_exists: bool = False) -> tuple[str, str]: - model_path, config_path = self._paths_for_voice(voice) - if os.path.exists(model_path) and os.path.exists(config_path): - if not download_even_if_exists: - return model_path, config_path - os.makedirs(os.path.dirname(model_path), exist_ok=True) - from calibre.gui2.tts.download import download_resources - ok = download_resources(_('Downloading voice for Read aloud'), _('Downloading neural network for the {} voice').format(voice.human_name), { - voice.engine_data['model_url']: (model_path, _('Neural network data')), - voice.engine_data['config_url']: (config_path, _('Neural network metadata')), - }, parent=widget_parent(self), headless=getattr(QApplication.instance(), 'headless', False) - ) - voice.engine_data['is_downloaded'] = bool(ok) - return (model_path, config_path) if ok else ('', '') + return download_voice(voice, download_even_if_exists, parent=self, headless=False) def download_voice(self, v: Voice) -> None: if not v.name: @@ -511,6 +557,122 @@ class Piper(TTSBackend): return True +class PiperEmbedded: + + def __init__(self): + self._embedded_settings = EngineSpecificSettings.create_from_config(self.engine_name, TTS_EMBEDED_CONFIG) + self._voice_name_map, self._voices, self._voice_for_lang = load_voice_metadata() + lang = get_lang() + lang = canonicalize_lang(lang) or lang + self._default_voice = self._voice_for_lang.get(lang) or self._voice_for_lang['eng'] + self._current_voice = self._process = self._process_shutdown_event = None + self._current_audio_format = QAudioFormat() + + def resolve_voice(self, lang: str, voice_name: str) -> Voice: + from calibre.utils.localization import canonicalize_lang, get_lang + lang = canonicalize_lang(lang or get_lang() or 'en') + if voice_name and voice_name in self._voice_name_map: + voice = self._voice_name_map[voice_name] + elif (voice_name := self._embedded_settings.preferred_voices.get(lang, '')) and voice_name in self._voice_name_map: + voice = self._voice_name_map[voice_name] + else: + voice = self._voice_for_lang.get(lang) or self._default_voice + return voice + + def text_to_raw_audio_data( + self, texts: Iterable[str], lang: str = '', voice_name: str = '', format: QAudioFormat | None = None, timeout: float = 10., + ) -> Iterator[bytes]: + if format is None: + format = audio_format() + voice = self.resolve_voice(lang, voice_name) + if voice is not self._current_voice: + self._current_voice = voice + self.shutdown() + self.ensure_process_started() + piper_done, errors_from_piper = [], [] + last_output_at = monotonic() + + def callback(ok, payload): + nonlocal last_output_at + if ok: + piper_done.append(True) + last_output_at = monotonic() + else: + errors_from_piper.append(payload.decode('utf-8', 'replace')) + + for text in texts: + if not text: + yield b'' + self._process.stdin.write(text.encode('utf-8', 'replace')) + stderr_data = b'' + buf, piper_done, errors_from_piper = [], [], [] + last_output_at = monotonic() + while not piper_done: + try: + is_stdout, exception, data = self._from_process_queue.get(True, 1.0) + except Empty: + if self._process.poll() is not None: + raise Exception(f'piper process died with error code: {self._process.poll()} and stderr: {"".join(errors_from_piper)}') + if monotonic() - last_output_at > timeout: + raise TimeoutError(f'piper process produced no output for {timeout} seconds. stderr: {"".join(errors_from_piper)}') + else: + if exception is not None: + raise exception + if is_stdout: + buf.append(data) + else: + stderr_data = detect_end_of_data(stderr_data + data, callback) + needs_conversion = format != self._current_audio_format + needs_conversion + + def ensure_voices_downloaded(self, specs: Iterable[tuple[str, str]], parent: QObject = None) -> None: + for lang, voice_name in specs: + voice = self.resolve_voice(lang, voice_name) + download_voice(voice, parent=parent, headless=parent is None) + + def shutdown(self): + if self._process is not None: + self._process_shutdown_event.set() + self._to_process_queue.put(None) + self._process_shutdown_event = None + self._process.stdin.close() + self._process.kill() + self._process.wait() + self._process.stdout.close() + self._process.stderr.close() + self._process = None + self._stdout_reader.join() + self._stderr_reader.join() + __del__ = shutdown + + def ensure_process_started(self): + if self._process is not None: + return + model_path, config_path = download_voice(self._current_voice, headless=True) + audio_rate, cmdline = piper_process_metadata(model_path, config_path, self._embedded_settings, self._current_voice) + self._current_audio_format = audio_format(audio_rate) + import subprocess + from threading import Thread + self._process_shutdown_event = Event() + self._stdout_reader = Thread(target=self.reader, args=(self._process_shutdown_event, self._process.stdout, True), daemon=True) + self._stderr_reader = Thread(target=self.reader, args=(self._process_shutdown_event, self._process.stderr, False), daemon=True) + self._from_process_queue = Queue() + self._process = subprocess.Popen(cmdline, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE) + self._stdout_reader.start() + self._stderr_reader.start() + + def reader(self, shutdown_event: Event, pipe: BinaryIO, is_stdout: bool = False) -> None: + while not shutdown_event.is_set(): + try: + data = pipe.read() + except Exception as e: + if not shutdown_event.is_set(): + self._from_process_queue.put((is_stdout, e, b'')) + break + else: + self._from_process_queue.put((is_stdout, None, data)) + + def develop(): # {{{ from qt.core import QSocketNotifier