Start work on speechd backend

This commit is contained in:
Kovid Goyal 2024-08-25 21:35:51 +05:30
parent 1b6465d2ac
commit 640193a52f
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
3 changed files with 185 additions and 57 deletions

View File

@ -1,13 +1,11 @@
#!/usr/bin/env python #!/usr/bin/env python
# License: GPLv3 Copyright: 2024, Kovid Goyal <kovid at kovidgoyal.net> # License: GPLv3 Copyright: 2024, Kovid Goyal <kovid at kovidgoyal.net>
import sys
from typing import NamedTuple from typing import NamedTuple
from qt.core import QMediaDevices, QObject, QTextToSpeech, pyqtSignal from qt.core import QMediaDevices, QObject, QTextToSpeech, pyqtSignal
from calibre.constants import islinux from calibre.gui2.tts2.types import EngineSpecificSettings, Voice, qvoice_to_voice
from calibre.gui2.tts2.types import EngineSpecificSettings
class Pos(NamedTuple): class Pos(NamedTuple):
@ -58,8 +56,15 @@ class QtTTSBackend(QObject):
def __init__(self, engine_name: str = '', settings: EngineSpecificSettings = EngineSpecificSettings(), parent: QObject|None = None): def __init__(self, engine_name: str = '', settings: EngineSpecificSettings = EngineSpecificSettings(), parent: QObject|None = None):
super().__init__(parent) super().__init__(parent)
self.tracker = Tracker() self.tracker = Tracker()
self._voices = None
self.apply_settings(engine_name, settings) self.apply_settings(engine_name, settings)
@property
def available_voices(self) -> dict[str, tuple[Voice, ...]]:
    """Return the voices offered by the Qt engine, keyed by output module.

    The voice list is converted with ``qvoice_to_voice`` on first access and
    cached in ``self._voices``. All voices are stored under the single
    key ``''``.
    """
    voices = self._voices
    if voices is None:
        voices = self._voices = tuple(qvoice_to_voice(qv) for qv in self.tts.availableVoices())
    return {'': voices}
def apply_settings(self, engine_name: str, settings: EngineSpecificSettings) -> None: def apply_settings(self, engine_name: str, settings: EngineSpecificSettings) -> None:
s = {} s = {}
if settings.audio_device_id: if settings.audio_device_id:
@ -108,53 +113,3 @@ class QtTTSBackend(QObject):
x = self.tracker.mark_word(start, length) x = self.tracker.mark_word(start, length)
if x is not None: if x is not None:
self.saying.emit(x[0], x[1]) self.saying.emit(x[0], x[1])
def develop():
    """Speak a fixed sample text with the Qt backend and print each spoken region.

    Starts a calibre ``Application``, speaks ``marked_text`` and runs the
    event loop. The loop is quit when the backend returns to Ready after
    having started speaking, or exited with code 1 on an Error state or on
    a shutdown signal.
    """
    # {{{
    marked_text = [2, 'Demonstration', ' ', 16, 'of', ' ', 19, 'DOCX', ' ', 24, 'support', ' ', 32, 'in', ' ', 35, 'calibre', '\n\t', 44, 'This', ' ', 49, 'document', ' ', 58, 'demonstrates', ' ', 71, 'the', ' ', 75, 'ability', ' ', 83, 'of', ' ', 86, 'the', ' ', 90, 'calibre', ' ', 98, 'DOCX', ' ', 103, 'Input', ' ', 109, 'plugin', ' ', 116, 'to', ' ', 119, 'convert', ' ', 127, 'the', ' ', 131, 'various', ' ', 139, 'typographic', ' ', 151, 'features', ' ', 160, 'in', ' ', 163, 'a', ' ', 165, 'Microsoft', ' ', 175, 'Word', ' ', 180, '(2007', ' ', 186, 'and', ' ', 190, 'newer)', ' ', 197, 'document.', ' ', 207, 'Convert', ' ', 215, 'this', ' ', 220, 'document', ' ', 229, 'to', ' ', 232, 'a', ' ', 234, 'modern', ' ', 241, 'ebook', ' ', 247, 'format,', ' ', 255, 'such', ' ', 260, 'as', ' ', 263, 'AZW3', ' ', 268, 'for', ' ', 272, 'Kindles', ' ', 280, 'or', ' ', 283, 'EPUB', ' ', 288, 'for', ' ', 292, 'other', ' ', 298, 'ebook', ' ', 304, 'readers,', ' ', 313, 'to', ' ', 316, 'see', ' ', 320, 'it', ' ', 323, 'in', ' ', 326, 'action.', '\n\t', 335, 'There', ' ', 341, 'is', ' ', 344, 'support', ' ', 352, 'for', ' ', 356, 'images,', ' ', 364, 'tables,', ' ', 372, 'lists,', ' ', 379, 'footnotes,', ' ', 390, 'endnotes,', ' ', 400, 'links,', ' ', 407, 'dropcaps', ' ', 416, 'and', ' ', 420, 'various', ' ', 428, 'types', ' ', 434, 'of', ' ', 437, 'text', ' ', 442, 'and', ' ', 446, 'paragraph', ' ', 456, 'level', ' ', 462, 'formatting.', '\n\t', 475, 'To', ' ', 478, 'see', ' ', 482, 'the', ' ', 486, 'DOCX', ' ', 491, 'conversion', ' ', 502, 'in', ' ', 505, 'action,', ' ', 513, 'simply', ' ', 520, 'add', ' ', 524, 'this', ' ', 529, 'file', ' ', 534, 'to', ' ', 537, 'calibre', ' ', 545, 'using', ' ', 551, 'the', ' ', 555, '“Add', ' ', 560, 'Books”', ' ', 567, 'button', ' ', 574, 'and', ' ', 578, 'then', ' ', 583, 'click', ' ', 589, '“Convert”.', ' ', 601, 'Set', ' ', 605, 'the', ' ', 609, 'output', ' ', 616, 'format', ' ', 623, 'in', ' ', 626, 'the', ' ', 630, 'top', ' ', 634, 
                   'right', ' ', 640, 'corner', ' ', 647, 'of', ' ', 650, 'the', ' ', 654, 'conversion', ' ', 665, 'dialog', ' ', 672, 'to', ' ', 675, 'EPUB', ' ', 680, 'or', ' ', 683, 'AZW3', ' ', 688, 'and', ' ', 692, 'click', ' ', 698, '“OK”.', '\n\t\xa0\n\t'] # noqa }}}
    from calibre.gui2 import Application
    app = Application([])
    app.shutdown_signal_received.connect(lambda: app.exit(1))

    # On Linux the demo uses the 'flite' engine; elsewhere the default engine.
    engine_name = 'flite' if islinux else ''
    tts = QtTTSBackend(engine_name=engine_name)
    speech_started = False

    def print_saying(s, e):
        # Integers in marked_text are position marks, strings are the text
        # between them. Collect the strings from mark ``s`` up to the first
        # mark that is >= ``e``.
        spoken = []
        collecting = False
        for item in marked_text:
            if not isinstance(item, int):
                if collecting:
                    spoken.append(item)
                continue
            if collecting:
                if item >= e:
                    break
            elif item == s:
                collecting = True
            elif item > e:
                break
        print('Saying:', repr(''.join(spoken)))

    def state_changed(state):
        nonlocal speech_started
        states = QTextToSpeech.State
        if state == states.Speaking:
            speech_started = True
            return
        if state == states.Error:
            print(tts.error_message(), file=sys.stderr)
            app.exit(1)
            return
        # Ready before any speech started is the initial idle state; only a
        # Ready that follows Speaking means the utterance has finished.
        if state == states.Ready and speech_started:
            app.quit()

    tts.saying.connect(print_saying)
    tts.state_changed.connect(state_changed)
    tts.speak_marked_text(marked_text)
    app.exec()
# Run the demo only when this module is executed directly as a script.
if __name__ == '__main__':
    develop()

View File

@ -0,0 +1,107 @@
#!/usr/bin/env python
# License: GPLv3 Copyright: 2024, Kovid Goyal <kovid at kovidgoyal.net>
from qt.core import QObject, QTextToSpeech, pyqtSignal
from speechd.client import DataMode, Priority, SpawnError, SSIPClient, SSIPCommunicationError
from calibre.gui2.tts2.types import EngineSpecificSettings, Voice
from calibre.utils.localization import canonicalize_lang
class SpeechdTTSBackend(QObject):
    """Text-to-speech backend that drives speech-dispatcher via an ``SSIPClient``.

    The SSIP connection is created lazily by ``_ensure_state()``. Failures are
    recorded in ``_last_error`` and reported by moving to the
    ``QTextToSpeech.State.Error`` state (see ``_set_error``).
    """

    # Declared so callers can connect to it like on the Qt backend; nothing in
    # this class emits it yet.
    saying = pyqtSignal(int, int)
    # Emitted by _set_state() whenever the backend state changes.
    state_changed = pyqtSignal(QTextToSpeech.State)

    def __init__(self, engine_name: str = '', settings: EngineSpecificSettings = EngineSpecificSettings(), parent: QObject|None = None):
        super().__init__(parent)
        self._last_error = ''
        self._state = QTextToSpeech.State.Ready
        self._voices = None  # cache for available_voices, filled on first successful query
        self._system_default_output_module = None
        self.ssip_client: SSIPClient | None = None
        self.apply_settings(engine_name, settings)

    @property
    def available_voices(self) -> dict[str, tuple[Voice, ...]]:
        """Return the synthesis voices of every output module, keyed by module name.

        The result is cached after the first successful query. If the client
        cannot be set up or the query fails, the error is recorded and an
        empty dict is returned (nothing is cached, so a later call retries).
        """
        if self._voices is None and self._ensure_state():
            def to_voice(x) -> Voice:
                name, langcode, variant = x
                return Voice(name, canonicalize_lang(langcode) or 'und', human_name=name, notes=variant)

            voices = {}
            try:
                original_module = self.ssip_client.get_output_module()
                try:
                    # Voices can only be listed for the currently selected
                    # module, so switch to each module in turn.
                    for module in self.ssip_client.list_output_modules():
                        self.ssip_client.set_output_module(module)
                        voices[module] = tuple(map(to_voice, self.ssip_client.list_synthesis_voices()))
                finally:
                    # Restore the previously selected module even when
                    # enumeration fails part-way through.
                    self.ssip_client.set_output_module(original_module)
            except Exception as e:
                self._set_error(str(e))
            else:
                self._voices = voices
        return self._voices or {}

    def apply_settings(self, engine_name: str, settings: EngineSpecificSettings) -> None:
        """Apply ``settings`` to the SSIP client, recording any failure as an error.

        ``engine_name`` is accepted for signature parity with the other
        backends and is not used here.
        """
        try:
            self._apply_settings(settings)
        except Exception as err:
            self._set_error(str(err))

    def error_message(self) -> str:
        """Return the message of the most recently recorded error ('' if none)."""
        return self._last_error

    def _set_state(self, state: QTextToSpeech.State) -> None:
        """Store ``state`` and emit ``state_changed`` if it differs from the current one."""
        if self._state != state:
            self._state = state
            self.state_changed.emit(state)

    def _set_error(self, msg: str) -> None:
        """Record ``msg`` as the last error and switch to the Error state."""
        self._last_error = msg
        self._set_state(QTextToSpeech.State.Error)

    def _create_ssip_client(self) -> bool:
        """Connect to speech-dispatcher; return True on success, False after recording an error."""
        try:
            self.ssip_client = SSIPClient('calibre')
            self.ssip_client.set_priority(Priority.TEXT)
            return True
        except SSIPCommunicationError as err:
            # A SpawnError wrapped in the communication error is reported
            # with the same "not installed" message as a bare SpawnError.
            ex = err.additional_exception()
            if isinstance(ex, SpawnError):
                self._set_error(_('Could not find speech-dispatcher on your system. Please install it.'))
            else:
                self._set_error(str(err))
        except SpawnError:
            self._set_error(_('Could not find speech-dispatcher on your system. Please install it.'))
        except Exception as err:
            self._set_error(str(err))
        return False

    def _ensure_state(self) -> bool:
        """Make sure a usable SSIP client exists and is in SSML mode.

        Also determines the system default output module on first use.
        Returns True when the client is ready, False otherwise (an error
        has been recorded in that case).
        """
        if self.ssip_client is None:
            if not self._create_ssip_client():
                return False
        if self._system_default_output_module is None:
            self._system_default_output_module = self.ssip_client.get_output_module()
            if self._system_default_output_module == '(null)':
                # No default is reported: fall back to the first listed module.
                mods = self.ssip_client.list_output_modules()
                if not mods:
                    self._set_error(_('Speech dispatcher on this system is not configured with any available voices. Install some voices first.'))
                    return False
                self._system_default_output_module = mods[0]
        return self._set_use_ssml(True)

    def _set_use_ssml(self, on: bool) -> bool:
        """Switch the client between SSML and plain-text data mode.

        On a communication failure the client is closed and dropped, an
        error is recorded and False is returned.
        """
        mode = DataMode.SSML if on else DataMode.TEXT
        try:
            self.ssip_client.set_data_mode(mode)
            return True
        except SSIPCommunicationError:
            self.ssip_client.close()
            self.ssip_client = None
            self._set_error(_('Failed to set support for SSML to: {}').format(on))
            return False

    def _apply_settings(self, settings: EngineSpecificSettings) -> bool:
        """Push pitch, rate, volume, output module and voice to the SSIP client.

        Returns False without changing anything if the client is not usable.
        Exceptions raised by the client propagate to the caller.
        """
        if not self._ensure_state():
            return False
        # pitch and rate are clamped to [-1, 1] and scaled to [-100, 100].
        self.ssip_client.set_pitch_range(int(max(-1, min(settings.pitch, 1)) * 100))
        self.ssip_client.set_rate(int(max(-1, min(settings.rate, 1)) * 100))
        if settings.volume is not None:
            # volume is clamped to [0, 1] and mapped onto [-100, 100].
            self.ssip_client.set_volume(-100 + int(max(0, min(settings.volume, 1)) * 200))
        om = settings.output_module or self._system_default_output_module
        self.ssip_client.set_output_module(om)
        if settings.voice_name:
            self.ssip_client.set_synthesis_voice(settings.voice_name)
        return True

View File

@ -5,8 +5,9 @@ from enum import Enum, auto
from functools import lru_cache from functools import lru_cache
from typing import Literal, NamedTuple from typing import Literal, NamedTuple
from qt.core import QLocale, QTextToSpeech, QVoice from qt.core import QLocale, QObject, QTextToSpeech, QVoice
from calibre.constants import islinux
from calibre.utils.localization import canonicalize_lang from calibre.utils.localization import canonicalize_lang
@ -21,6 +22,10 @@ class EngineMetadata(NamedTuple):
tracking_capability: TrackingCapability = TrackingCapability.NoTracking tracking_capability: TrackingCapability = TrackingCapability.NoTracking
allows_choosing_audio_device: bool = True allows_choosing_audio_device: bool = True
can_synthesize_audio_data: bool = True can_synthesize_audio_data: bool = True
has_multiple_output_modules: bool = False
can_change_rate: bool = True
can_change_pitch: bool = True
can_change_volume: bool = True
class Quality(Enum): class Quality(Enum):
@ -32,8 +37,8 @@ class Quality(Enum):
class Voice(NamedTuple): class Voice(NamedTuple):
name: str name: str
language_code: str language_code: str
country_code: str
country_code: str = ''
human_name: str = '' human_name: str = ''
notes: str = '' notes: str = ''
gender: QVoice.Gender = QVoice.Gender.Unknown gender: QVoice.Gender = QVoice.Gender.Unknown
@ -58,6 +63,7 @@ class EngineSpecificSettings(NamedTuple):
rate: float = 0 # -1 to 1 0 is normal speech rate: float = 0 # -1 to 1 0 is normal speech
pitch: float = 0 # -1 to 1 0 is normal speech pitch: float = 0 # -1 to 1 0 is normal speech
volume: float | None = None # 0 to 1, None is platform default volume volume: float | None = None # 0 to 1, None is platform default volume
output_module: str = ''
@ -86,6 +92,66 @@ def available_engines() -> dict[str, EngineMetadata]:
elif x == 'flite': elif x == 'flite':
ans[x] = qt_engine_metadata(x, True) ans[x] = qt_engine_metadata(x, True)
elif x == 'speechd': elif x == 'speechd':
# TODO: Replace this with our own speechd client that supports word tracking ans[x] = EngineMetadata(x, TrackingCapability.WordByWord, allows_choosing_audio_device=False, has_multiple_output_modules=True)
ans[x] = qt_engine_metadata(x)
return ans return ans
def create_tts_backend(engine_name: str = '', settings: EngineSpecificSettings = EngineSpecificSettings(), parent: QObject|None = None):
    """Create the TTS backend object for ``engine_name``.

    An empty name selects 'speechd' on Linux. A name that is not among
    ``available_engines()`` is replaced by '' before the backend is chosen.
    Returns a ``SpeechdTTSBackend`` for 'speechd' and a ``QtTTSBackend``
    for everything else.
    """
    if islinux and engine_name == '':
        engine_name = 'speechd'
    if engine_name not in available_engines():
        engine_name = ''
    # Backend modules are imported lazily so only the selected one is loaded.
    if engine_name != 'speechd':
        from calibre.gui2.tts2.qt import QtTTSBackend
        return QtTTSBackend(engine_name, settings, parent)
    from calibre.gui2.tts2.speechd import SpeechdTTSBackend
    return SpeechdTTSBackend(engine_name, settings, parent)
def develop(engine_name=''):
    """Speak a fixed sample text with the chosen backend and print each spoken region.

    Starts a calibre ``Application``, creates the backend with
    ``create_tts_backend(engine_name=...)``, speaks ``marked_text`` and runs
    the event loop. The loop is quit when the backend returns to Ready after
    having started speaking, or exited with code 1 on an Error state or on
    a shutdown signal.
    """
    # {{{
    marked_text = [2, 'Demonstration', ' ', 16, 'of', ' ', 19, 'DOCX', ' ', 24, 'support', ' ', 32, 'in', ' ', 35, 'calibre', '\n\t', 44, 'This', ' ', 49, 'document', ' ', 58, 'demonstrates', ' ', 71, 'the', ' ', 75, 'ability', ' ', 83, 'of', ' ', 86, 'the', ' ', 90, 'calibre', ' ', 98, 'DOCX', ' ', 103, 'Input', ' ', 109, 'plugin', ' ', 116, 'to', ' ', 119, 'convert', ' ', 127, 'the', ' ', 131, 'various', ' ', 139, 'typographic', ' ', 151, 'features', ' ', 160, 'in', ' ', 163, 'a', ' ', 165, 'Microsoft', ' ', 175, 'Word', ' ', 180, '(2007', ' ', 186, 'and', ' ', 190, 'newer)', ' ', 197, 'document.', ' ', 207, 'Convert', ' ', 215, 'this', ' ', 220, 'document', ' ', 229, 'to', ' ', 232, 'a', ' ', 234, 'modern', ' ', 241, 'ebook', ' ', 247, 'format,', ' ', 255, 'such', ' ', 260, 'as', ' ', 263, 'AZW3', ' ', 268, 'for', ' ', 272, 'Kindles', ' ', 280, 'or', ' ', 283, 'EPUB', ' ', 288, 'for', ' ', 292, 'other', ' ', 298, 'ebook', ' ', 304, 'readers,', ' ', 313, 'to', ' ', 316, 'see', ' ', 320, 'it', ' ', 323, 'in', ' ', 326, 'action.', '\n\t', 335, 'There', ' ', 341, 'is', ' ', 344, 'support', ' ', 352, 'for', ' ', 356, 'images,', ' ', 364, 'tables,', ' ', 372, 'lists,', ' ', 379, 'footnotes,', ' ', 390, 'endnotes,', ' ', 400, 'links,', ' ', 407, 'dropcaps', ' ', 416, 'and', ' ', 420, 'various', ' ', 428, 'types', ' ', 434, 'of', ' ', 437, 'text', ' ', 442, 'and', ' ', 446, 'paragraph', ' ', 456, 'level', ' ', 462, 'formatting.', '\n\t', 475, 'To', ' ', 478, 'see', ' ', 482, 'the', ' ', 486, 'DOCX', ' ', 491, 'conversion', ' ', 502, 'in', ' ', 505, 'action,', ' ', 513, 'simply', ' ', 520, 'add', ' ', 524, 'this', ' ', 529, 'file', ' ', 534, 'to', ' ', 537, 'calibre', ' ', 545, 'using', ' ', 551, 'the', ' ', 555, '“Add', ' ', 560, 'Books”', ' ', 567, 'button', ' ', 574, 'and', ' ', 578, 'then', ' ', 583, 'click', ' ', 589, '“Convert”.', ' ', 601, 'Set', ' ', 605, 'the', ' ', 609, 'output', ' ', 616, 'format', ' ', 623, 'in', ' ', 626, 'the', ' ', 630, 'top', ' ', 634, 
                   'right', ' ', 640, 'corner', ' ', 647, 'of', ' ', 650, 'the', ' ', 654, 'conversion', ' ', 665, 'dialog', ' ', 672, 'to', ' ', 675, 'EPUB', ' ', 680, 'or', ' ', 683, 'AZW3', ' ', 688, 'and', ' ', 692, 'click', ' ', 698, '“OK”.', '\n\t\xa0\n\t'] # noqa }}}
    import sys

    from calibre.gui2 import Application
    app = Application([])
    app.shutdown_signal_received.connect(lambda: app.exit(1))
    tts = create_tts_backend(engine_name=engine_name)
    speech_started = False

    def print_saying(s, e):
        # Integers in marked_text are position marks, strings are the text
        # between them. Collect the strings from mark ``s`` up to the first
        # mark that is >= ``e``.
        spoken = []
        collecting = False
        for item in marked_text:
            if not isinstance(item, int):
                if collecting:
                    spoken.append(item)
                continue
            if collecting:
                if item >= e:
                    break
            elif item == s:
                collecting = True
            elif item > e:
                break
        print('Saying:', repr(''.join(spoken)))

    def state_changed(state):
        nonlocal speech_started
        states = QTextToSpeech.State
        if state == states.Speaking:
            speech_started = True
            return
        if state == states.Error:
            print(tts.error_message(), file=sys.stderr)
            app.exit(1)
            return
        # Ready before any speech started is the initial idle state; only a
        # Ready that follows Speaking means the utterance has finished.
        if state == states.Ready and speech_started:
            app.quit()

    tts.saying.connect(print_saying)
    tts.state_changed.connect(state_changed)
    tts.speak_marked_text(marked_text)
    app.exec()
# Run the demo with the default engine when this module is executed directly.
if __name__ == '__main__':
    develop()