From 640193a52fa05cdefc667dd6be41f06e2eb3a10f Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sun, 25 Aug 2024 21:35:51 +0530
Subject: [PATCH] Start work on speechd backend

---
 src/calibre/gui2/tts2/qt.py      |  61 +++---------------
 src/calibre/gui2/tts2/speechd.py | 107 +++++++++++++++++++++++++++++++
 src/calibre/gui2/tts2/types.py   |  74 +++++++++++++++++++--
 3 files changed, 185 insertions(+), 57 deletions(-)
 create mode 100644 src/calibre/gui2/tts2/speechd.py
diff --git a/src/calibre/gui2/tts2/qt.py b/src/calibre/gui2/tts2/qt.py
index 3a41c99081..6f55a0e91d 100644
--- a/src/calibre/gui2/tts2/qt.py
+++ b/src/calibre/gui2/tts2/qt.py
@@ -1,13 +1,11 @@
 #!/usr/bin/env python
 # License: GPLv3 Copyright: 2024, Kovid Goyal <kovid at kovidgoyal.net>
 
-import sys
 from typing import NamedTuple
 
 from qt.core import QMediaDevices, QObject, QTextToSpeech, pyqtSignal
 
-from calibre.constants import islinux
-from calibre.gui2.tts2.types import EngineSpecificSettings
+from calibre.gui2.tts2.types import EngineSpecificSettings, Voice, qvoice_to_voice
 
 
 class Pos(NamedTuple):
@@ -58,8 +56,15 @@ class QtTTSBackend(QObject):
     def __init__(self, engine_name: str = '', settings: EngineSpecificSettings = EngineSpecificSettings(), parent: QObject|None = None):
         super().__init__(parent)
         self.tracker = Tracker()
+        self._voices = None
         self.apply_settings(engine_name, settings)
 
+    @property
+    def available_voices(self) -> dict[str, tuple[Voice, ...]]:
+        if self._voices is None:  # first access: convert the engine's voices once; NOTE(review): confirm the cache is reset if the engine changes
+            self._voices = tuple(map(qvoice_to_voice, self.tts.availableVoices()))
+        return {'': self._voices}  # all voices are grouped under the single '' key
+
     def apply_settings(self, engine_name: str, settings: EngineSpecificSettings) -> None:
         s = {}
         if settings.audio_device_id:
@@ -108,53 +113,3 @@ class QtTTSBackend(QObject):
         x = self.tracker.mark_word(start, length)
         if x is not None:
             self.saying.emit(x[0], x[1])
-
-
-def develop():
-    # {{{
-    marked_text = [2, 'Demonstration', ' ', 16, 'of', ' ', 19, 'DOCX', ' ', 24, 'support', ' ', 32, 'in', ' ', 35, 'calibre', '\n\t', 44, 'This', ' ', 49, 'document', ' ', 58, 'demonstrates', ' ', 71, 'the', ' ', 75, 'ability', ' ', 83, 'of', ' ', 86, 'the', ' ', 90, 'calibre', ' ', 98, 'DOCX', ' ', 103, 'Input', ' ', 109, 'plugin', ' ', 116, 'to', ' ', 119, 'convert', ' ', 127, 'the', ' ', 131, 'various', ' ', 139, 'typographic', ' ', 151, 'features', ' ', 160, 'in', ' ', 163, 'a', ' ', 165, 'Microsoft', ' ', 175, 'Word', ' ', 180, '(2007', ' ', 186, 'and', ' ', 190, 'newer)', ' ', 197, 'document.', ' ', 207, 'Convert', ' ', 215, 'this', ' ', 220, 'document', ' ', 229, 'to', ' ', 232, 'a', ' ', 234, 'modern', ' ', 241, 'ebook', ' ', 247, 'format,', ' ', 255, 'such', ' ', 260, 'as', ' ', 263, 'AZW3', ' ', 268, 'for', ' ', 272, 'Kindles', ' ', 280, 'or', ' ', 283, 'EPUB', ' ', 288, 'for', ' ', 292, 'other', ' ', 298, 'ebook', ' ', 304, 'readers,', ' ', 313, 'to', ' ', 316, 'see', ' ', 320, 'it', ' ', 323, 'in', ' ', 326, 'action.', '\n\t', 335, 'There', ' ', 341, 'is', ' ', 344, 'support', ' ', 352, 'for', ' ', 356, 'images,', ' ', 364, 'tables,', ' ', 372, 'lists,', ' ', 379, 'footnotes,', ' ', 390, 'endnotes,', ' ', 400, 'links,', ' ', 407, 'dropcaps', ' ', 416, 'and', ' ', 420, 'various', ' ', 428, 'types', ' ', 434, 'of', ' ', 437, 'text', ' ', 442, 'and', ' ', 446, 'paragraph', ' ', 456, 'level', ' ', 462, 'formatting.', '\n\t', 475, 'To', ' ', 478, 'see', ' ', 482, 'the', ' ', 486, 'DOCX', ' ', 491, 'conversion', ' ', 502, 'in', ' ', 505, 'action,', ' ', 513, 'simply', ' ', 520, 'add', ' ', 524, 'this', ' ', 529, 'file', ' ', 534, 'to', ' ', 537, 'calibre', ' ', 545, 'using', ' ', 551, 'the', ' ', 555, '“Add', ' ', 560, 'Books”', ' ', 567, 'button', ' ', 574, 'and', ' ', 578, 'then', ' ', 583, 'click', ' ', 589, '“Convert”.', '  ', 601, 'Set', ' ', 605, 'the', ' ', 609, 'output', ' ', 616, 'format', ' ', 623, 'in', ' ', 626, 'the', ' ', 630, 'top', ' ', 634, 
'right', ' ', 640, 'corner', ' ', 647, 'of', ' ', 650, 'the', ' ', 654, 'conversion', ' ', 665, 'dialog', ' ', 672, 'to', ' ', 675, 'EPUB', ' ', 680, 'or', ' ', 683, 'AZW3', ' ', 688, 'and', ' ', 692, 'click', ' ', 698, '“OK”.', '\n\t\xa0\n\t']  # noqa }}}
-
-    from calibre.gui2 import Application
-    app = Application([])
-    app.shutdown_signal_received.connect(lambda: app.exit(1))
-    engine_name = ''
-    if islinux:
-        engine_name = 'flite'
-    tts = QtTTSBackend(engine_name=engine_name)
-    speech_started = False
-
-    def print_saying(s, e):
-        bits = []
-        in_region = False
-        for x in marked_text:
-            if isinstance(x, int):
-                if in_region:
-                    if x >= e:
-                        break
-                else:
-                    if x == s:
-                        in_region = True
-                    elif x > e:
-                        break
-            elif in_region:
-                bits.append(x)
-        print('Saying:', repr(''.join(bits)))
-
-    def state_changed(state):
-        nonlocal speech_started
-        if state == QTextToSpeech.State.Speaking:
-            speech_started = True
-        elif state == QTextToSpeech.State.Error:
-            print(tts.error_message(), file=sys.stderr)
-            app.exit(1)
-        elif state == QTextToSpeech.State.Ready:
-            if speech_started:
-                app.quit()
-    tts.saying.connect(print_saying)
-    tts.state_changed.connect(state_changed)
-    tts.speak_marked_text(marked_text)
-    app.exec()
-
-
-if __name__ == '__main__':
-    develop()
diff --git a/src/calibre/gui2/tts2/speechd.py b/src/calibre/gui2/tts2/speechd.py
new file mode 100644
index 0000000000..706579bbfd
--- /dev/null
+++ b/src/calibre/gui2/tts2/speechd.py
@@ -0,0 +1,107 @@
+#!/usr/bin/env python
+# License: GPLv3 Copyright: 2024, Kovid Goyal <kovid at kovidgoyal.net>
+
+from qt.core import QObject, QTextToSpeech, pyqtSignal
+from speechd.client import DataMode, Priority, SpawnError, SSIPClient, SSIPCommunicationError
+
+from calibre.gui2.tts2.types import EngineSpecificSettings, Voice
+from calibre.utils.localization import canonicalize_lang
+
+
+class SpeechdTTSBackend(QObject):
+    ''' Text-to-speech backend that drives speech-dispatcher through its SSIP client (work in progress). '''
+    saying = pyqtSignal(int, int)
+    state_changed = pyqtSignal(QTextToSpeech.State)
+
+    def __init__(self, engine_name: str = '', settings: EngineSpecificSettings = EngineSpecificSettings(), parent: QObject|None = None):
+        super().__init__(parent)
+        self._last_error = ''
+        self._state = QTextToSpeech.State.Ready
+        self._voices = None  # filled lazily by available_voices
+        self._system_default_output_module = None  # resolved on the first successful _ensure_state()
+        self.ssip_client: SSIPClient | None = None  # created on demand by _ensure_state()
+        self.apply_settings(engine_name, settings)
+
+    @property
+    def available_voices(self) -> dict[str, tuple[Voice, ...]]:
+        if self._voices is None:  # not fetched yet; a failed fetch leaves it None so the next access retries
+            def v(x) -> Voice:
+                name, langcode, variant = x
+                return Voice(name, canonicalize_lang(langcode) or 'und', human_name=name, notes=variant)
+
+            if self._ensure_state():
+                ans = {}  # output module name -> tuple of voices listed while that module is current
+                try:
+                    om = self.ssip_client.get_output_module()  # remembered so it can be restored after the scan
+                    for omq in self.ssip_client.list_output_modules():
+                        self.ssip_client.set_output_module(omq)
+                        ans[omq] = tuple(map(v, self.ssip_client.list_synthesis_voices()))
+                    self.ssip_client.set_output_module(om)
+                    self._voices = ans
+                except Exception as e:
+                    self._set_error(str(e))
+        return self._voices or {}
+
+    def apply_settings(self, engine_name: str, settings: EngineSpecificSettings) -> None:
+        try:  # engine_name is not used here; only settings are applied
+            self._apply_settings(settings)
+        except Exception as err:
+            self._set_error(str(err))
+
+    def _set_error(self, msg: str) -> None:
+        self._last_error = msg
+        self._set_state(QTextToSpeech.State.Error)  # NOTE(review): _set_state() is not defined anywhere in this class yet
+
+    def _create_ssip_client(self) -> bool:
+        try:
+            self.ssip_client = SSIPClient('calibre')
+            self.ssip_client.set_priority(Priority.TEXT)
+            return True
+        except SSIPCommunicationError as err:
+            ex = err.additional_exception()  # a wrapped SpawnError gets the friendlier "please install" message
+            if isinstance(ex, SpawnError):
+                self._set_error(_('Could not find speech-dispatcher on your system. Please install it.'))
+            else:
+                self._set_error(str(err))
+        except SpawnError:
+            self._set_error(_('Could not find speech-dispatcher on your system. Please install it.'))
+        except Exception as err:
+            self._set_error(str(err))
+        return False  # reached only via one of the error branches above
+
+    def _ensure_state(self) -> bool:
+        if self.ssip_client is None and not self._create_ssip_client():
+            return False
+        if self._system_default_output_module is None:
+            om = self.ssip_client.get_output_module()
+            if om == '(null)':  # presumably no default module is configured; fall back to the first listed one
+                mods = self.ssip_client.list_output_modules()
+                if not mods:
+                    self._last_error = _('Speech dispatcher on this system is not configured with any available voices. Install some voices first.')
+                    return False
+                om = mods[0]
+            self._system_default_output_module = om  # cached only once a usable module name is known
+        return self._set_use_ssml(True)  # callers test this result, so success must be reported explicitly
+
+    def _set_use_ssml(self, on: bool) -> bool:
+        try:
+            self.ssip_client.set_data_mode(DataMode.SSML if on else DataMode.TEXT)
+            return True
+        except SSIPCommunicationError:
+            self.ssip_client.close()  # drop the client so _ensure_state() creates a fresh one next time
+            self.ssip_client = None
+            self._set_error(_('Failed to set support for SSML to: {}').format(on))
+            return False
+
+    def _apply_settings(self, settings: EngineSpecificSettings) -> bool:
+        if not self._ensure_state():
+            return False
+        self.ssip_client.set_pitch_range(int(max(-1, min(settings.pitch, 1)) * 100))  # clamp to [-1, 1], scale to [-100, 100]
+        self.ssip_client.set_rate(int(max(-1, min(settings.rate, 1)) * 100))
+        if settings.volume is not None:  # None leaves the volume untouched
+            self.ssip_client.set_volume(-100 + int(max(0, min(settings.volume, 1)) * 200))  # map [0, 1] onto [-100, 100]
+        om = settings.output_module or self._system_default_output_module
+        self.ssip_client.set_output_module(om)
+        if settings.voice_name:
+            self.ssip_client.set_synthesis_voice(settings.voice_name)
+        return True
diff --git a/src/calibre/gui2/tts2/types.py b/src/calibre/gui2/tts2/types.py
index 741eaeafe2..904a77b41a 100644
--- a/src/calibre/gui2/tts2/types.py
+++ b/src/calibre/gui2/tts2/types.py
@@ -5,8 +5,9 @@ from enum import Enum, auto
 from functools import lru_cache
 from typing import Literal, NamedTuple
 
-from qt.core import QLocale, QTextToSpeech, QVoice
+from qt.core import QLocale, QObject, QTextToSpeech, QVoice
 
+from calibre.constants import islinux
 from calibre.utils.localization import canonicalize_lang
 
 
@@ -21,6 +22,10 @@ class EngineMetadata(NamedTuple):
     tracking_capability: TrackingCapability = TrackingCapability.NoTracking
     allows_choosing_audio_device: bool = True
     can_synthesize_audio_data: bool = True
+    has_multiple_output_modules: bool = False
+    can_change_rate: bool = True
+    can_change_pitch: bool = True
+    can_change_volume: bool = True
 
 
 class Quality(Enum):
@@ -32,8 +37,8 @@ class Quality(Enum):
 class Voice(NamedTuple):
     name: str
     language_code: str
-    country_code: str
 
+    country_code: str = ''
     human_name: str = ''
     notes: str = ''
     gender: QVoice.Gender = QVoice.Gender.Unknown
@@ -58,6 +63,7 @@ class EngineSpecificSettings(NamedTuple):
     rate: float = 0  # -1 to 1 0 is normal speech
     pitch: float = 0  # -1 to 1 0 is normal speech
     volume: float | None = None  # 0 to 1, None is platform default volume
+    output_module: str = ''
 
 
 
@@ -86,6 +92,66 @@ def available_engines() -> dict[str, EngineMetadata]:
         elif x == 'flite':
             ans[x] = qt_engine_metadata(x, True)
         elif x == 'speechd':
-            # TODO: Replace this with our own speechd client that supports word tracking
-            ans[x] = qt_engine_metadata(x)
+            ans[x] = EngineMetadata(x, TrackingCapability.WordByWord, allows_choosing_audio_device=False, has_multiple_output_modules=True)
     return ans
+
+
+def create_tts_backend(engine_name: str = '', settings: EngineSpecificSettings = EngineSpecificSettings(), parent: QObject|None = None):
+    if engine_name == '' and islinux:  # no explicit choice: try speechd first on Linux
+        engine_name = 'speechd'
+    if engine_name not in available_engines():  # unknown or unavailable engine: fall back to '' and the Qt backend
+        engine_name = ''
+    if engine_name == 'speechd':
+        from calibre.gui2.tts2.speechd import SpeechdTTSBackend  # local import: speechd.py itself imports from this module
+        return SpeechdTTSBackend(engine_name, settings, parent)
+    from calibre.gui2.tts2.qt import QtTTSBackend  # local import: qt.py itself imports from this module
+    return QtTTSBackend(engine_name, settings, parent)
+
+
+def develop(engine_name=''):
+    # {{{
+    marked_text = [2, 'Demonstration', ' ', 16, 'of', ' ', 19, 'DOCX', ' ', 24, 'support', ' ', 32, 'in', ' ', 35, 'calibre', '\n\t', 44, 'This', ' ', 49, 'document', ' ', 58, 'demonstrates', ' ', 71, 'the', ' ', 75, 'ability', ' ', 83, 'of', ' ', 86, 'the', ' ', 90, 'calibre', ' ', 98, 'DOCX', ' ', 103, 'Input', ' ', 109, 'plugin', ' ', 116, 'to', ' ', 119, 'convert', ' ', 127, 'the', ' ', 131, 'various', ' ', 139, 'typographic', ' ', 151, 'features', ' ', 160, 'in', ' ', 163, 'a', ' ', 165, 'Microsoft', ' ', 175, 'Word', ' ', 180, '(2007', ' ', 186, 'and', ' ', 190, 'newer)', ' ', 197, 'document.', ' ', 207, 'Convert', ' ', 215, 'this', ' ', 220, 'document', ' ', 229, 'to', ' ', 232, 'a', ' ', 234, 'modern', ' ', 241, 'ebook', ' ', 247, 'format,', ' ', 255, 'such', ' ', 260, 'as', ' ', 263, 'AZW3', ' ', 268, 'for', ' ', 272, 'Kindles', ' ', 280, 'or', ' ', 283, 'EPUB', ' ', 288, 'for', ' ', 292, 'other', ' ', 298, 'ebook', ' ', 304, 'readers,', ' ', 313, 'to', ' ', 316, 'see', ' ', 320, 'it', ' ', 323, 'in', ' ', 326, 'action.', '\n\t', 335, 'There', ' ', 341, 'is', ' ', 344, 'support', ' ', 352, 'for', ' ', 356, 'images,', ' ', 364, 'tables,', ' ', 372, 'lists,', ' ', 379, 'footnotes,', ' ', 390, 'endnotes,', ' ', 400, 'links,', ' ', 407, 'dropcaps', ' ', 416, 'and', ' ', 420, 'various', ' ', 428, 'types', ' ', 434, 'of', ' ', 437, 'text', ' ', 442, 'and', ' ', 446, 'paragraph', ' ', 456, 'level', ' ', 462, 'formatting.', '\n\t', 475, 'To', ' ', 478, 'see', ' ', 482, 'the', ' ', 486, 'DOCX', ' ', 491, 'conversion', ' ', 502, 'in', ' ', 505, 'action,', ' ', 513, 'simply', ' ', 520, 'add', ' ', 524, 'this', ' ', 529, 'file', ' ', 534, 'to', ' ', 537, 'calibre', ' ', 545, 'using', ' ', 551, 'the', ' ', 555, '“Add', ' ', 560, 'Books”', ' ', 567, 'button', ' ', 574, 'and', ' ', 578, 'then', ' ', 583, 'click', ' ', 589, '“Convert”.', '  ', 601, 'Set', ' ', 605, 'the', ' ', 609, 'output', ' ', 616, 'format', ' ', 623, 'in', ' ', 626, 'the', ' ', 630, 'top', ' ', 634, 
'right', ' ', 640, 'corner', ' ', 647, 'of', ' ', 650, 'the', ' ', 654, 'conversion', ' ', 665, 'dialog', ' ', 672, 'to', ' ', 675, 'EPUB', ' ', 680, 'or', ' ', 683, 'AZW3', ' ', 688, 'and', ' ', 692, 'click', ' ', 698, '“OK”.', '\n\t\xa0\n\t']  # noqa }}}
+
+    from calibre.gui2 import Application
+    app = Application([])
+    app.shutdown_signal_received.connect(lambda: app.exit(1))
+    tts = create_tts_backend(engine_name=engine_name)
+    speech_started = False
+
+    def print_saying(s, e):
+        bits = []
+        in_region = False
+        for x in marked_text:
+            if isinstance(x, int):
+                if in_region:
+                    if x >= e:
+                        break
+                else:
+                    if x == s:
+                        in_region = True
+                    elif x > e:
+                        break
+            elif in_region:
+                bits.append(x)
+        print('Saying:', repr(''.join(bits)))
+
+    import sys
+
+    def state_changed(state):
+        nonlocal speech_started
+        if state == QTextToSpeech.State.Speaking:
+            speech_started = True
+        elif state == QTextToSpeech.State.Error:
+            print(tts.error_message(), file=sys.stderr)
+            app.exit(1)
+        elif state == QTextToSpeech.State.Ready:
+            if speech_started:
+                app.quit()
+    tts.saying.connect(print_saying)
+    tts.state_changed.connect(state_changed)
+    tts.speak_marked_text(marked_text)
+    app.exec()
+
+
+if __name__ == '__main__':
+    develop()