Basic functionality implemented in speechd backend

This commit is contained in:
Kovid Goyal 2024-08-27 15:32:04 +05:30
parent fd0c64bbc6
commit 16f7ddb416
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 166 additions and 44 deletions

View File

@ -1,43 +1,67 @@
#!/usr/bin/env python #!/usr/bin/env python
# License: GPLv3 Copyright: 2024, Kovid Goyal <kovid at kovidgoyal.net> # License: GPLv3 Copyright: 2024, Kovid Goyal <kovid at kovidgoyal.net>
from qt.core import QObject, QTextToSpeech, pyqtSignal from contextlib import suppress
from speechd.client import DataMode, Priority, SpawnError, SSIPClient, SSIPCommunicationError
from qt.core import QObject, Qt, QTextToSpeech, pyqtSignal
from speechd.client import CallbackType, DataMode, Priority, SpawnError, SSIPClient, SSIPCommunicationError
from calibre import prepare_string_for_xml
from calibre.gui2.tts2.types import EngineSpecificSettings, Voice from calibre.gui2.tts2.types import EngineSpecificSettings, Voice
from calibre.utils.localization import canonicalize_lang from calibre.utils.localization import canonicalize_lang
# Template for an index mark; formatted with the integer mark id.
MARK_TEMPLATE = '<mark name="{}"/>'


def add_markup(text_parts, mark_template=MARK_TEMPLATE, escape_marked_text=prepare_string_for_xml, chunk_size=0):
    """Yield markup strings built from a mixed sequence of text and marks.

    Integers in ``text_parts`` are rendered through ``mark_template``; every
    other item is passed through ``escape_marked_text``. When ``chunk_size``
    is non-zero, the buffered output is flushed before an item that would
    push the buffered length past ``chunk_size``. Items are never split, so
    a single oversized item still forms a chunk of its own. Each yielded
    chunk has surrounding whitespace stripped.
    """
    buf = []
    size = 0
    for x in text_parts:
        item = mark_template.format(x) if isinstance(x, int) else escape_marked_text(x)
        sz = len(item)
        # Flush only when something is buffered; otherwise an oversized
        # first item would emit a spurious empty chunk.
        if chunk_size and size and size + sz > chunk_size:
            yield ''.join(buf).strip()
            size = 0
            buf = []
        size += sz
        buf.append(item)
    if size:
        yield ''.join(buf).strip()
def wrap_in_ssml(text):
    """Return ``text`` wrapped as the single sentence of an SSML 1.0 document."""
    header = '<?xml version="1.0"?>\n<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis"><s>'
    footer = '</s></speak>'
    return ''.join((header, text, footer))
class SpeechdTTSBackend(QObject): class SpeechdTTSBackend(QObject):
saying = pyqtSignal(int, int) saying = pyqtSignal(int, int)
state_changed = pyqtSignal(QTextToSpeech.State) state_changed = pyqtSignal(QTextToSpeech.State)
_event_signal = pyqtSignal(object, object)
def __init__(self, engine_name: str = '', settings: EngineSpecificSettings = EngineSpecificSettings(), parent: QObject|None = None): def __init__(self, engine_name: str = '', settings: EngineSpecificSettings = EngineSpecificSettings(), parent: QObject|None = None):
super().__init__(parent) super().__init__(parent)
self._last_error = '' self._last_error = ''
self._state = QTextToSpeech.State.Ready self._state = QTextToSpeech.State.Ready
self._voices = None self._voices = None
self._system_default_output_module = None self._system_default_output_module = None
self.ssip_client: SSIPClient | None = None self._current_settings = EngineSpecificSettings()
self._status = {'synthesizing': False, 'paused': False}
self._next_begin_is_for_resume = False
self._ssip_client: SSIPClient | None = None
self._event_signal.connect(self._update_status, type=Qt.ConnectionType.QueuedConnection)
self._current_marked_text = self._last_mark = None
self.apply_settings(engine_name, settings) self.apply_settings(engine_name, settings)
@property @property
def available_voices(self) -> dict[str, tuple[Voice, ...]]: def available_voices(self) -> dict[str, tuple[Voice, ...]]:
if self._voices is None: if self._voices is None:
def v(x) -> Voice:
name, langcode, variant = x
return Voice(name, canonicalize_lang(langcode) or 'und', human_name=name, notes=variant)
if self._ensure_state():
ans = {}
try: try:
om = self.ssip_client.get_output_module() self._voices = self._get_all_voices_for_all_output_modules()
for omq in self.ssip_client.list_output_modules():
self.ssip_client.set_output_module(omq)
ans[omq] = tuple(map(v, self.ssip_client.list_synthesis_voices()))
self.ssip_client.set_output_module(om)
self._voices = ans
except Exception as e: except Exception as e:
self._set_error(str(e)) self._set_error(str(e))
return self._voices or {} return self._voices or {}
@ -48,14 +72,60 @@ class SpeechdTTSBackend(QObject):
except Exception as err: except Exception as err:
self._set_error(str(err)) self._set_error(str(err))
def change_rate(self, steps: int = 1) -> bool:
    """Nudge the speech rate by ``steps`` increments of 0.2, clamped to [-1, 1].

    Returns True when the new rate was sent to the client and stored in the
    current settings. Returns False when the clamped rate equals the current
    one, or when the client call raised (the error is reported through
    ``_set_error``).
    """
    old_rate = self._current_settings.rate
    target = min(old_rate + 0.2 * steps, 1)
    target = max(-1, target)
    if target == old_rate:
        return False
    try:
        # The client receives the rate scaled by 100 and truncated to int.
        self._ssip_client.set_rate(int(target * 100))
    except Exception as err:
        self._set_error(str(err))
        return False
    self._current_settings = self._current_settings._replace(rate=target)
    return True
def stop(self) -> None:
    """Clear mark tracking and pending pause/resume flags, then stop the client.

    Does nothing further when no client exists. An exception raised by the
    client's ``stop()`` is reported through ``_set_error`` instead of
    propagating.
    """
    self._last_mark = None
    self._current_marked_text = None
    self._next_begin_is_for_resume = False
    self._next_cancel_is_for_pause = False
    client = self._ssip_client
    if client is None:
        return
    try:
        client.stop()
    except Exception as err:
        self._set_error(str(err))
def speak_simple_text(self, text: str) -> None:
    """Stop any current speech, then speak ``text`` (XML-escaped) with no mark tracking."""
    self.stop()
    self._last_mark = None
    self._current_marked_text = None
    escaped = prepare_string_for_xml(text)
    self._speak(escaped)
def speak_marked_text(self, marked_text: list[str | int]) -> None:
    """Stop any current speech, then speak text interleaved with index marks.

    ``marked_text`` is converted with ``add_markup`` and joined into one
    string, which is remembered in ``_current_marked_text`` before being
    spoken. ``_last_mark`` is reset to None.
    """
    self.stop()
    markup = ''.join(add_markup(marked_text))
    self._last_mark = None
    self._current_marked_text = markup
    self._speak(markup)
def __del__(self):
    """Best-effort teardown of the speech-dispatcher client; also exposed as ``shutdown``.

    Safe to call repeatedly, and on an instance whose ``_ssip_client``
    attribute was never set. Errors from the client are deliberately ignored.
    """
    client = getattr(self, '_ssip_client', None)
    if client is None:
        return
    self._ssip_client = None
    # Suppress each call separately so that a failing cancel() cannot skip
    # close() and leave the connection open.
    with suppress(Exception):
        client.cancel()
    with suppress(Exception):
        client.close()


shutdown = __del__
def _set_state(self, s: QTextToSpeech.State) -> None:
    """Store ``s`` as the current state, then announce it via ``state_changed``."""
    # Assign before emitting so the stored state is already current when
    # the signal fires.
    self._state = s
    self.state_changed.emit(s)
def _set_error(self, msg: str) -> None: def _set_error(self, msg: str) -> None:
self._last_error = msg self._last_error = msg
self._set_state(QTextToSpeech.Error) self._set_state(QTextToSpeech.State.Error)
def _create_ssip_client(self) -> bool: def _create_ssip_client(self) -> bool:
try: try:
self.ssip_client = SSIPClient('calibre') self._ssip_client = SSIPClient('calibre')
self.ssip_client.set_priority(Priority.TEXT) self._ssip_client.set_priority(Priority.TEXT)
return True return True
except SSIPCommunicationError as err: except SSIPCommunicationError as err:
ex = err.additional_exception() ex = err.additional_exception()
@ -70,38 +140,85 @@ class SpeechdTTSBackend(QObject):
return False return False
def _ensure_state(self) -> bool: def _ensure_state(self) -> bool:
if self.ssip_client is None: if self._ssip_client is None:
if not self.create_ssip_client(): if not self._create_ssip_client():
return False return False
if self._system_default_output_module is None: if self._system_default_output_module is None:
self._system_default_output_module = self.ssip_client.get_output_module() self._system_default_output_module = self._ssip_client.get_output_module()
if self._system_default_output_module == '(null)': if self._system_default_output_module == '(null)':
mods = self.ssip_client.list_output_modules() mods = self._ssip_client.list_output_modules()
if not mods: if not mods:
self._last_error = _('Speech dispatcher on this system is not configured with any available voices. Install some voices first.') self._set_error(_(
'Speech dispatcher on this system is not configured with any available output modules. Install some output modules first.'))
return False return False
self._system_default_output_module = mods[0] self._system_default_output_module = mods[0]
self._set_use_ssml(True) return self._set_use_ssml(True)
def _set_use_ssml(self, on: bool) -> bool: def _set_use_ssml(self, on: bool) -> bool:
mode = DataMode.SSML if on else DataMode.TEXT mode = DataMode.SSML if on else DataMode.TEXT
try: try:
self.ssip_client.set_data_mode(mode) self._ssip_client.set_data_mode(mode)
return True return True
except SSIPCommunicationError: except SSIPCommunicationError:
self.ssip_client.close() self._ssip_client.close()
self.ssip_client = None self._ssip_client = None
self._set_error(_('Failed to set support for SSML to: {}').format(on)) self._set_error(_('Failed to set support for SSML to: {}').format(on))
return False
def _apply_settings(self, settings: EngineSpecificSettings) -> bool: def _apply_settings(self, settings: EngineSpecificSettings) -> bool:
if not self._ensure_state(): if not self._ensure_state():
return False return False
self.ssip_client.set_pitch_range(int(max(-1, min(settings.pitch, 1)) * 100)) self._ssip_client.set_pitch_range(int(max(-1, min(settings.pitch, 1)) * 100))
self.ssip_client.set_rate(int(max(-1, min(settings.rate, 1)) * 100)) self._ssip_client.set_rate(int(max(-1, min(settings.rate, 1)) * 100))
if settings.volume is not None: if settings.volume is not None:
self.ssip_client.set_volume(-100 + int(max(0, min(settings.volume, 1)) * 200)) self._ssip_client.set_volume(-100 + int(max(0, min(settings.volume, 1)) * 200))
om = settings.output_module or self._system_default_output_module om = settings.output_module or self._system_default_output_module
self.ssip_client.set_output_module(om) self._ssip_client.set_output_module(om)
if settings.voice_name: if settings.voice_name:
self.ssip_client.set_synthesis_voice(settings.voice_name) self._ssip_client.set_synthesis_voice(settings.voice_name)
self._current_settings = settings
return True return True
def _get_all_voices_for_all_output_modules(self) -> dict[str, tuple[Voice, ...]]:
    """Return the voices offered by every output module, keyed by module name.

    Returns an empty dict when ``_ensure_state`` fails. Listing a module's
    voices requires switching the client to that module. The module that
    was active on entry is restored afterwards, even if a query raises.
    Exceptions from the client are not caught here.
    """
    ans = {}

    def v(x) -> Voice:
        # Each entry from the client unpacks as (name, language code, variant).
        name, langcode, variant = x
        return Voice(name, canonicalize_lang(langcode) or 'und', human_name=name, notes=variant)

    if self._ensure_state():
        om = self._ssip_client.get_output_module()
        try:
            for omq in self._ssip_client.list_output_modules():
                self._ssip_client.set_output_module(omq)
                ans[omq] = tuple(map(v, self._ssip_client.list_synthesis_voices()))
        finally:
            self._ssip_client.set_output_module(om)
    return ans
def _update_status(self, callback_type, index_mark=None):
    """Translate a client callback event into backend status, state and signals.

    INDEX_MARK records the mark and emits ``saying(mark, mark)``. BEGIN
    moves to Speaking. END moves to Ready. CANCEL moves to Paused when the
    cancel was requested as a pause, otherwise to Ready. Any other callback
    type is ignored. Always returns None.
    """
    if callback_type is CallbackType.INDEX_MARK:
        mark = int(index_mark)
        self._last_mark = mark
        self.saying.emit(mark, mark)
    elif callback_type is CallbackType.BEGIN:
        self._status = {'synthesizing': True, 'paused': False}
        self._set_state(QTextToSpeech.State.Speaking)
        self._next_begin_is_for_resume = False
    elif callback_type is CallbackType.END:
        self._status = {'synthesizing': False, 'paused': False}
        self._set_state(QTextToSpeech.State.Ready)
    elif callback_type is CallbackType.CANCEL:
        # This flag is assigned in stop() but is not among the attributes
        # set in the visible __init__, so tolerate it being absent.
        if getattr(self, '_next_cancel_is_for_pause', False):
            self._status = {'synthesizing': True, 'paused': True}
            self._set_state(QTextToSpeech.State.Paused)
        else:
            self._status = {'synthesizing': False, 'paused': False}
            self._set_state(QTextToSpeech.State.Ready)
        # NOTE(review): indentation was lost in this view; the reset is
        # assumed to apply to both CANCEL outcomes — confirm against the
        # repository.
        self._next_cancel_is_for_pause = False
def _speak_callback(self, callback_type: CallbackType, index_mark=None):
    """Relay a callback from the client's ``speak()`` through ``_event_signal``."""
    # NOTE(review): presumably called from a non-GUI thread by the client;
    # the signal is connected to _update_status with a queued connection.
    self._event_signal.emit(callback_type, index_mark)
def _speak(self, text: str) -> None:
    """Wrap ``text`` in SSML and send it to the client, if the client is usable.

    Does nothing when ``_ensure_state`` returns False. Client events are
    delivered to ``_speak_callback``.
    """
    if not self._ensure_state():
        return
    ssml = wrap_in_ssml(text)
    self._ssip_client.speak(ssml, self._speak_callback)

View File

@ -8,7 +8,8 @@ from typing import Literal, NamedTuple
from qt.core import QLocale, QObject, QTextToSpeech, QVoice from qt.core import QLocale, QObject, QTextToSpeech, QVoice
from calibre.constants import islinux from calibre.constants import islinux, iswindows
from calibre.utils.config_base import tweaks
from calibre.utils.localization import canonicalize_lang from calibre.utils.localization import canonicalize_lang
@ -74,10 +75,10 @@ def available_engines() -> dict[str, EngineMetadata]:
def qt_engine_metadata(name: str, allows_choosing_audio_device: bool = False) -> EngineMetadata: def qt_engine_metadata(name: str, allows_choosing_audio_device: bool = False) -> EngineMetadata:
e.setEngine(name) e.setEngine(name)
cap = e.engineCapabilities() cap = int(e.engineCapabilities().value)
return EngineMetadata( return EngineMetadata(name,
name, TrackingCapability.WordByWord if cap & QTextToSpeech.Capability.WordByWordProgress else TrackingCapability.NoTracking, TrackingCapability.WordByWord if cap & int(QTextToSpeech.Capability.WordByWordProgress.value) else TrackingCapability.NoTracking,
allows_choosing_audio_device, cap & QTextToSpeech.Capability.Synthesize) allows_choosing_audio_device, bool(cap & int(QTextToSpeech.Capability.Synthesize.value)))
for x in QTextToSpeech.availableEngines(): for x in QTextToSpeech.availableEngines():
if x == 'winrt': if x == 'winrt':
@ -102,7 +103,10 @@ def available_engines() -> dict[str, EngineMetadata]:
def create_tts_backend(engine_name: str = '', settings: EngineSpecificSettings = EngineSpecificSettings(), parent: QObject|None = None): def create_tts_backend(engine_name: str = '', settings: EngineSpecificSettings = EngineSpecificSettings(), parent: QObject|None = None):
if engine_name == '' and islinux: if engine_name == '':
if iswindows and tweaks.get('prefer_winsapi'):
engine_name = 'sapi'
elif islinux:
engine_name = 'speechd' engine_name = 'speechd'
if engine_name not in available_engines(): if engine_name not in available_engines():
engine_name = '' engine_name = ''
@ -147,6 +151,7 @@ def develop(engine_name=''):
def state_changed(state): def state_changed(state):
nonlocal speech_started nonlocal speech_started
print('State changed:', state)
if state == QTextToSpeech.State.Speaking: if state == QTextToSpeech.State.Speaking:
speech_started = True speech_started = True
elif state == QTextToSpeech.State.Error: elif state == QTextToSpeech.State.Error: