From 151f208dd4c0b0b032ff71384d55985f3939ea8c Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 24 Aug 2024 16:16:28 +0530
Subject: [PATCH] Start work on new Qt based TTS backend

---
 src/calibre/gui2/tts2/__init__.py |  2 +
 src/calibre/gui2/tts2/qt.py       | 19 +++++++
 src/calibre/gui2/tts2/types.py    | 91 +++++++++++++++++++++++++++++++
 3 files changed, 112 insertions(+)
 create mode 100644 src/calibre/gui2/tts2/__init__.py
 create mode 100644 src/calibre/gui2/tts2/qt.py
 create mode 100644 src/calibre/gui2/tts2/types.py
diff --git a/src/calibre/gui2/tts2/__init__.py b/src/calibre/gui2/tts2/__init__.py
new file mode 100644
index 0000000000..10baddb581
--- /dev/null
+++ b/src/calibre/gui2/tts2/__init__.py
@@ -0,0 +1,2 @@
+#!/usr/bin/env python
+# License: GPLv3 Copyright: 2024, Kovid Goyal <kovid at kovidgoyal.net>
diff --git a/src/calibre/gui2/tts2/qt.py b/src/calibre/gui2/tts2/qt.py
new file mode 100644
index 0000000000..723985422f
--- /dev/null
+++ b/src/calibre/gui2/tts2/qt.py
@@ -0,0 +1,19 @@
+#!/usr/bin/env python
+# License: GPLv3 Copyright: 2024, Kovid Goyal <kovid at kovidgoyal.net>
+
+from qt.core import QMediaDevices, QObject, QTextToSpeech
+
+from .types import EngineSpecificSettings
+
+
+class QtTTSBackend(QObject):
+    """TTS backend that owns a QTextToSpeech instance (``self.tts``) created for the given engine and settings."""
+    def __init__(self, engine_name: str = '', settings: EngineSpecificSettings = EngineSpecificSettings(), parent: QObject|None = None):
+        super().__init__(parent)
+        s = {}  # extra construction parameters handed to QTextToSpeech
+        if settings.audio_device_id:
+            for x in QMediaDevices.audioOutputs():  # the requested device is used only if it is currently present
+                if bytes(x.id()) == settings.audio_device_id.id:  # id() is a method returning a QByteArray
+                    s['audioDevice'] = x
+                    break
+        self.tts = QTextToSpeech(engine_name, s, self)
diff --git a/src/calibre/gui2/tts2/types.py b/src/calibre/gui2/tts2/types.py
new file mode 100644
index 0000000000..741eaeafe2
--- /dev/null
+++ b/src/calibre/gui2/tts2/types.py
@@ -0,0 +1,91 @@
+#!/usr/bin/env python
+# License: GPLv3 Copyright: 2024, Kovid Goyal <kovid at kovidgoyal.net>
+
+from enum import Enum, auto
+from functools import lru_cache
+from typing import Literal, NamedTuple
+
+from qt.core import QLocale, QTextToSpeech, QVoice
+
+from calibre.utils.localization import canonicalize_lang
+
+
+class TrackingCapability(Enum):  # kind of progress tracking an engine offers while speaking
+    NoTracking: int = 1
+    WordByWord: int = 2  # used when the engine advertises QTextToSpeech.Capability.WordByWordProgress
+    Sentence: int = 3
+
+
+class EngineMetadata(NamedTuple):  # description of one TTS engine, as built by available_engines()
+    name: Literal['winrt', 'darwin', 'sapi', 'flite', 'speechd']  # engine name as listed by QTextToSpeech.availableEngines()
+    tracking_capability: TrackingCapability = TrackingCapability.NoTracking  # WordByWord when the engine reports WordByWordProgress
+    allows_choosing_audio_device: bool = True  # presumably: whether output can be routed to a chosen audio device — TODO confirm
+    can_synthesize_audio_data: bool = True  # derived from QTextToSpeech.Capability.Synthesize in available_engines()
+
+
+class Quality(Enum):  # coarse quality rating stored in Voice.quality
+    High: int = 1
+    Medium: int = 2
+    Low: int = 3
+
+
+class Voice(NamedTuple):  # description of a single voice; qvoice_to_voice() builds one from a QVoice
+    name: str  # QVoice.name() for voices converted from Qt
+    language_code: str  # canonicalized language code; qvoice_to_voice() uses 'und' when canonicalization yields nothing
+    country_code: str  # qvoice_to_voice() fills this from the territory of the voice's locale
+
+    human_name: str = ''  # presumably a friendlier display name; not set by qvoice_to_voice()
+    notes: str = ''  # not set by qvoice_to_voice()
+    gender: QVoice.Gender = QVoice.Gender.Unknown
+    age: QVoice.Age = QVoice.Age.Other
+    quality: Quality = Quality.High  # defaults to High; not set by qvoice_to_voice()
+
+
+def qvoice_to_voice(v: QVoice) -> Voice:
+    """Convert a Qt ``QVoice`` into a ``Voice``; the language code is 'und' when it cannot be canonicalized."""
+    country = QLocale.territoryToString(v.locale().territory())  # NOTE(review): this is the territory's name, not a code — confirm intent
+    return Voice(v.name(), canonicalize_lang(QLocale.languageToCode(v.language())) or 'und', country, gender=v.gender(), age=v.age())
+
+
+class AudioDeviceId(NamedTuple):  # identifies an audio output device
+    id: bytes  # compared by QtTTSBackend against the ids of QMediaDevices.audioOutputs()
+    description: str  # presumably a human readable device name — TODO confirm
+
+
+class EngineSpecificSettings(NamedTuple):  # bundle of settings for one engine; every field has a default
+    audio_device_id: AudioDeviceId | None = None  # output device to look up; when unset no 'audioDevice' parameter is passed to Qt
+    voice_name: str = ''  # presumably '' means no explicit voice choice — TODO confirm
+    rate: float = 0  # -1 to 1, 0 is normal speech
+    pitch: float = 0  # -1 to 1, 0 is normal speech
+    volume: float | None = None  # 0 to 1, None is platform default volume
+
+
+
+@lru_cache(2)
+def available_engines() -> dict[str, EngineMetadata]:
+    """Return metadata for every supported Qt TTS engine, keyed by engine name.
+
+    Engines are probed in the order Qt lists them. Names missing from the
+    table below are left out, as is 'macos'. The result is cached, so all
+    callers share one dict and should treat it as read-only.
+    """
+    # engine name -> whether an output audio device may be chosen for it
+    known_engines = {
+        'winrt': True,
+        'darwin': False,
+        'sapi': False,
+        # 'macos' is slated for removal in Qt 6.8 so it is deliberately absent
+        'flite': True,
+        # TODO: Replace this with our own speechd client that supports word tracking
+        'speechd': False,
+    }
+    e = QTextToSpeech()
+
+    def qt_engine_metadata(name: str, allows_choosing_audio_device: bool = False) -> EngineMetadata:
+        e.setEngine(name)  # switch the shared probe instance to ``name`` before querying its capabilities
+        cap = e.engineCapabilities()
+        return EngineMetadata(
+            name, TrackingCapability.WordByWord if cap & QTextToSpeech.Capability.WordByWordProgress else TrackingCapability.NoTracking,
+            allows_choosing_audio_device, bool(cap & QTextToSpeech.Capability.Synthesize))
+
+    return {x: qt_engine_metadata(x, known_engines[x]) for x in QTextToSpeech.availableEngines() if x in known_engines}