diff --git a/src/calibre/gui2/tts2/speechd.py b/src/calibre/gui2/tts2/speechd.py index 706579bbfd..422651cbb8 100644 --- a/src/calibre/gui2/tts2/speechd.py +++ b/src/calibre/gui2/tts2/speechd.py @@ -1,46 +1,70 @@ #!/usr/bin/env python # License: GPLv3 Copyright: 2024, Kovid Goyal -from qt.core import QObject, QTextToSpeech, pyqtSignal -from speechd.client import DataMode, Priority, SpawnError, SSIPClient, SSIPCommunicationError +from contextlib import suppress +from qt.core import QObject, Qt, QTextToSpeech, pyqtSignal +from speechd.client import CallbackType, DataMode, Priority, SpawnError, SSIPClient, SSIPCommunicationError + +from calibre import prepare_string_for_xml from calibre.gui2.tts2.types import EngineSpecificSettings, Voice from calibre.utils.localization import canonicalize_lang +MARK_TEMPLATE = '' + +def add_markup(text_parts, mark_template=MARK_TEMPLATE, escape_marked_text=prepare_string_for_xml, chunk_size=0): + buf = [] + size = 0 + for x in text_parts: + if isinstance(x, int): + item = mark_template.format(x) + else: + item = escape_marked_text(x) + sz = len(item) + if chunk_size and size + sz > chunk_size: + yield ''.join(buf).strip() + size = 0 + buf = [] + size += sz + buf.append(item) + if size: + yield ''.join(buf).strip() + + +def wrap_in_ssml(text): + return ('\n' + + text + '') + class SpeechdTTSBackend(QObject): saying = pyqtSignal(int, int) state_changed = pyqtSignal(QTextToSpeech.State) + _event_signal = pyqtSignal(object, object) + def __init__(self, engine_name: str = '', settings: EngineSpecificSettings = EngineSpecificSettings(), parent: QObject|None = None): super().__init__(parent) self._last_error = '' self._state = QTextToSpeech.State.Ready self._voices = None self._system_default_output_module = None - self.ssip_client: SSIPClient | None = None + self._current_settings = EngineSpecificSettings() + self._status = {'synthesizing': False, 'paused': False} + self._next_begin_is_for_resume = False + self._ssip_client: SSIPClient | None = None + self._event_signal.connect(self._update_status, type=Qt.ConnectionType.QueuedConnection) + self._current_marked_text = self._last_mark = None self.apply_settings(engine_name, settings) @property def available_voices(self) -> dict[str, tuple[Voice, ...]]: - if self._voices is None: - def v(x) -> Voice: - name, langcode, variant = x - return Voice(name, canonicalize_lang(langcode) or 'und', human_name=name, notes=variant) - - if self._ensure_state(): - ans = {} - try: - om = self.ssip_client.get_output_module() - for omq in self.ssip_client.list_output_modules(): - self.ssip_client.set_output_module(omq) - ans[omq] = tuple(map(v, self.ssip_client.list_synthesis_voices())) - self.ssip_client.set_output_module(om) - self._voices = ans - except Exception as e: - self._set_error(str(e)) - return self._voices or {} + if self._voices is None: + try: + self._voices = self._get_all_voices_for_all_output_modules() + except Exception as e: + self._set_error(str(e)) + return self._voices or {} def apply_settings(self, engine_name: str, settings: EngineSpecificSettings) -> None: try: @@ -48,14 +72,60 @@ class SpeechdTTSBackend(QObject): except Exception as err: self._set_error(str(err)) + def change_rate(self, steps: int = 1) -> bool: + current = self._current_settings.rate + new_rate = max(-1, min(current + 0.2 * steps, 1)) + if current == new_rate: + return False + try: + self._ssip_client.set_rate(int(max(-1, min(new_rate, 1)) * 100)) + except Exception as e: + self._set_error(str(e)) + return False + self._current_settings = self._current_settings._replace(rate=new_rate) + return True + + def stop(self) -> None: + self._current_marked_text = self._last_mark = None + self._next_cancel_is_for_pause = self._next_begin_is_for_resume = False + if self._ssip_client is not None: + try: + self._ssip_client.stop() + except Exception as e: + self._set_error(str(e)) + + def speak_simple_text(self, text: str) -> None: + self.stop() + self._current_marked_text = self._last_mark = None + self._speak(prepare_string_for_xml(text)) + + def speak_marked_text(self, marked_text: list[str | int]) -> None: + self.stop() + text = ''.join(add_markup(marked_text)) + self._current_marked_text = text + self._last_mark = None + self._speak(text) + + def __del__(self): + if self._ssip_client is not None: + with suppress(Exception): + self._ssip_client.cancel() + self._ssip_client.close() + self._ssip_client = None + shutdown = __del__ + + def _set_state(self, s: QTextToSpeech.State) -> None: + self._state = s + self.state_changed.emit(s) + def _set_error(self, msg: str) -> None: self._last_error = msg - self._set_state(QTextToSpeech.Error) + self._set_state(QTextToSpeech.State.Error) def _create_ssip_client(self) -> bool: try: - self.ssip_client = SSIPClient('calibre') - self.ssip_client.set_priority(Priority.TEXT) + self._ssip_client = SSIPClient('calibre') + self._ssip_client.set_priority(Priority.TEXT) return True except SSIPCommunicationError as err: ex = err.additional_exception() @@ -70,38 +140,85 @@ class SpeechdTTSBackend(QObject): return False def _ensure_state(self) -> bool: - if self.ssip_client is None: - if not self.create_ssip_client(): + if self._ssip_client is None: + if not self._create_ssip_client(): return False if self._system_default_output_module is None: - self._system_default_output_module = self.ssip_client.get_output_module() + self._system_default_output_module = self._ssip_client.get_output_module() if self._system_default_output_module == '(null)': - mods = self.ssip_client.list_output_modules() + mods = self._ssip_client.list_output_modules() if not mods: - self._last_error = _('Speech dispatcher on this system is not configured with any available voices. Install some voices first.') + self._set_error(_( + 'Speech dispatcher on this system is not configured with any available output modules. Install some output modules first.')) return False self._system_default_output_module = mods[0] - self._set_use_ssml(True) + return self._set_use_ssml(True) def _set_use_ssml(self, on: bool) -> bool: mode = DataMode.SSML if on else DataMode.TEXT try: - self.ssip_client.set_data_mode(mode) + self._ssip_client.set_data_mode(mode) return True except SSIPCommunicationError: - self.ssip_client.close() - self.ssip_client = None + self._ssip_client.close() + self._ssip_client = None self._set_error(_('Failed to set support for SSML to: {}').format(on)) + return False def _apply_settings(self, settings: EngineSpecificSettings) -> bool: if not self._ensure_state(): return False - self.ssip_client.set_pitch_range(int(max(-1, min(settings.pitch, 1)) * 100)) - self.ssip_client.set_rate(int(max(-1, min(settings.rate, 1)) * 100)) + self._ssip_client.set_pitch_range(int(max(-1, min(settings.pitch, 1)) * 100)) + self._ssip_client.set_rate(int(max(-1, min(settings.rate, 1)) * 100)) if settings.volume is not None: - self.ssip_client.set_volume(-100 + int(max(0, min(settings.volume, 1)) * 200)) + self._ssip_client.set_volume(-100 + int(max(0, min(settings.volume, 1)) * 200)) om = settings.output_module or self._system_default_output_module - self.ssip_client.set_output_module(om) + self._ssip_client.set_output_module(om) if settings.voice_name: - self.ssip_client.set_synthesis_voice(settings.voice_name) + self._ssip_client.set_synthesis_voice(settings.voice_name) + self._current_settings = settings return True + + def _get_all_voices_for_all_output_modules(self) -> dict[str, Voice]: + ans = {} + def v(x) -> Voice: + name, langcode, variant = x + return Voice(name, canonicalize_lang(langcode) or 'und', human_name=name, notes=variant) + + if self._ensure_state(): + om = self._ssip_client.get_output_module() + for omq in self._ssip_client.list_output_modules(): + self._ssip_client.set_output_module(omq) + ans[omq] = tuple(map(v, self._ssip_client.list_synthesis_voices())) + self._ssip_client.set_output_module(om) + return ans + + def _update_status(self, callback_type, index_mark=None): + event = None + if callback_type is CallbackType.INDEX_MARK: + mark = int(index_mark) + self._last_mark = mark + self.saying.emit(mark, mark) + elif callback_type is CallbackType.BEGIN: + self._status = {'synthesizing': True, 'paused': False} + self._set_state(QTextToSpeech.State.Speaking) + self._next_begin_is_for_resume = False + elif callback_type is CallbackType.END: + self._status = {'synthesizing': False, 'paused': False} + self._set_state(QTextToSpeech.State.Ready) + elif callback_type is CallbackType.CANCEL: + if self._next_cancel_is_for_pause: + self._status = {'synthesizing': True, 'paused': True} + self._set_state(QTextToSpeech.State.Paused) + else: + self._status = {'synthesizing': False, 'paused': False} + self._set_state(QTextToSpeech.State.Ready) + self._next_cancel_is_for_pause = False + return event + + def _speak_callback(self, callback_type: CallbackType, index_mark=None): + self._event_signal.emit(callback_type, index_mark) + + def _speak(self, text: str) -> None: + if self._ensure_state(): + self._ssip_client.speak(wrap_in_ssml(text), self._speak_callback) diff --git a/src/calibre/gui2/tts2/types.py b/src/calibre/gui2/tts2/types.py index 6e4dac45cd..807b31cfed 100644 --- a/src/calibre/gui2/tts2/types.py +++ b/src/calibre/gui2/tts2/types.py @@ -8,7 +8,8 @@ from typing import Literal, NamedTuple from qt.core import QLocale, QObject, QTextToSpeech, QVoice -from calibre.constants import islinux +from calibre.constants import islinux, iswindows +from calibre.utils.config_base import tweaks from calibre.utils.localization import canonicalize_lang @@ -74,10 +75,10 @@ def available_engines() -> dict[str, EngineMetadata]: def qt_engine_metadata(name: str, allows_choosing_audio_device: bool = False) -> EngineMetadata: e.setEngine(name) - cap = e.engineCapabilities() - return EngineMetadata( - name, TrackingCapability.WordByWord if cap & QTextToSpeech.Capability.WordByWordProgress else TrackingCapability.NoTracking, - allows_choosing_audio_device, cap & QTextToSpeech.Capability.Synthesize) + cap = int(e.engineCapabilities().value) + return EngineMetadata(name, + TrackingCapability.WordByWord if cap & int(QTextToSpeech.Capability.WordByWordProgress.value) else TrackingCapability.NoTracking, + allows_choosing_audio_device, bool(cap & int(QTextToSpeech.Capability.Synthesize.value))) for x in QTextToSpeech.availableEngines(): if x == 'winrt': @@ -102,8 +103,11 @@ def available_engines() -> dict[str, EngineMetadata]: def create_tts_backend(engine_name: str = '', settings: EngineSpecificSettings = EngineSpecificSettings(), parent: QObject|None = None): - if engine_name == '' and islinux: - engine_name = 'speechd' + if engine_name == '': + if iswindows and tweaks.get('prefer_winsapi'): + engine_name = 'sapi' + elif islinux: + engine_name = 'speechd' if engine_name not in available_engines(): engine_name = '' @@ -147,6 +151,7 @@ def develop(engine_name=''): def state_changed(state): nonlocal speech_started + print('State changed:', state) if state == QTextToSpeech.State.Speaking: speech_started = True elif state == QTextToSpeech.State.Error: