Port config to winspeech

2025-08-30 23:00:21 -04:00 · 2023-02-02 11:22:46 +05:30 · 2023-02-02 11:22:46 +05:30 · c8e9f33736
commit c8e9f33736
parent f64b9e3e2c
4 changed files with 321 additions and 18 deletions
--- a/src/calibre/gui2/tts/windows.py
+++ b/src/calibre/gui2/tts/windows.py
@ -52,8 +52,16 @@ class Client:
        self.synthesizing = False
        self.settings = settings or {}
        self.clear_chunks()
+        self.default_system_audio_device = self.backend.get_audio_device().device
+        self.default_system_voice = self.backend.default_voice().voice
        self.apply_settings()

+    def get_all_voices(self):
+        '''Return the ``voices`` attribute of the backend's ``all_voices()`` reply.'''
+        reply = self.backend.all_voices()
+        return reply.voices
+
+    def get_all_audio_devices(self):
+        '''Return the ``devices`` attribute of the backend's ``all_audio_devices()`` reply.'''
+        reply = self.backend.all_audio_devices()
+        return reply.devices
+
    def __del__(self):
        if self.backend is not None:
            self.backend.shutdown()
@ -63,6 +71,9 @@ class Client:
    def dispatch_msg(self, msg):
        self.dispatch_on_main_thread(partial(self.handle_event, msg))

+    def speak_current_chunk(self):
+        '''Pass the chunk at ``current_chunk_idx`` to ``backend.speak()`` with ``is_cued=True``.'''
+        chunk = self.current_chunks[self.current_chunk_idx]
+        self.backend.speak(chunk, is_cued=True)
+
    def handle_event(self, x):
        if isinstance(x, MarkReached) and self.current_chunks:
            self.last_mark = x.id
@ -74,7 +85,7 @@ class Client:
                    self.callback_ignoring_errors(Event(EventType.end))
                else:
                    self.current_chunk_idx += 1
-                    self.backend.speak(self.current_chunks[self.current_chunk_idx], is_cued=True)
+                    self.speak_current_chunk()
            elif x.state is MediaState.failed:
                self.clear_chunks()
                self.callback_ignoring_errors(Event(EventType.cancel))
@ -82,7 +93,8 @@ class Client:
                e.display_to_user = True
                raise e
            elif x.state is MediaState.opened:
-                self.callback_ignoring_errors(Event(EventType.begin))
+                self.callback_ignoring_errors(Event(EventType.resume if self.next_start_is_resume else EventType.begin))
+                self.next_start_is_resume = False
        elif isinstance(x, Error):
            raise x.as_exception(check_for_no_audio_devices=True)
        else:
@ -98,12 +110,11 @@ class Client:
        self.clear_chunks()
        self.current_callback = callback
        self.current_chunks = tuple(split_into_chunks(text, self.chunk_size))
-        self.current_chunk_idx = 0
+        self.current_chunk_idx = -100
        if self.current_chunks:
-            self.backend.speak(self.current_chunks[self.current_chunk_idx], is_cued=True)
+            self.current_chunk_idx = 0
+            self.speak_current_chunk()
            self.synthesizing = True
-            if self.current_callback is not None:
-                self.current_callback(Event(EventType.begin))

    def callback_ignoring_errors(self, ev):
        if self.current_callback is not None:
@ -115,8 +126,9 @@ class Client:

    def clear_chunks(self):
+        # Reset the playback bookkeeping: nothing is being synthesized, no
+        # resume is pending and there are no chunks. -100 and -1 are the
+        # placeholder values this class assigns when there is no current chunk
+        # and no reached mark respectively.
        self.synthesizing = False
+        self.next_start_is_resume = False
        self.current_chunk_idx = -100
-        self.current_chunks = []
+        self.current_chunks = ()
        self.last_mark = -1

    def stop(self):
@ -138,12 +150,52 @@ class Client:
            self.current_callback(Event(EventType.resume))

    def apply_settings(self, new_settings=None):
-        pass
+        '''
+        Push the rate, voice and sound output from the settings to the backend.
+
+        Any synthesis in progress is stopped first. If ``new_settings`` is not
+        None it replaces ``self.settings``. A setting that the backend fails
+        to apply is removed from ``self.settings`` instead of propagating
+        the error.
+        '''
+        if self.synthesizing:
+            self.stop()
+        if new_settings is not None:
+            self.settings = new_settings
+        try:
+            self.backend.set_rate(self.settings.get('rate', self.default_system_rate))
+        except OSError:
+            self.settings.pop('rate', None)
+        # set_voice() and set_audio_device() raise KeyError when the requested
+        # voice or device is not found (for example a saved voice that is no
+        # longer installed), so that is handled the same way as OSError
+        try:
+            self.backend.set_voice(self.settings.get('voice'), self.default_system_voice)
+        except (OSError, KeyError):
+            self.settings.pop('voice', None)
+        try:
+            self.backend.set_audio_device(self.settings.get('sound_output'), self.default_system_audio_device)
+        except (OSError, KeyError):
+            self.settings.pop('sound_output', None)

    def config_widget(self, backend_settings, parent):
        from calibre.gui2.tts.windows_config import Widget
        return Widget(self, backend_settings, parent)

+    def chunks_from_last_mark(self):
+        '''
+        Return the chunks that follow the element equal to ``last_mark``.
+
+        The chunk containing the match is cut down to the elements after the
+        match (and dropped entirely if nothing is left); later chunks are kept
+        as they are. Returns an empty tuple when no element matches.
+        '''
+        mark = self.last_mark
+        for chunk_num, chunk in enumerate(self.current_chunks):
+            for pos, item in enumerate(chunk):
+                if item != mark:
+                    continue
+                following = self.current_chunks[chunk_num + 1:]
+                tail = chunk[pos + 1:]
+                return (tail,) + following if tail else following
+        return ()
+
+    def resume_after_configure(self):
+        '''
+        Restart speaking from just after the last reached mark.
+
+        Does nothing unless ``synthesizing`` is set. The next start is flagged
+        as a resume via ``next_start_is_resume``. If no chunks remain after
+        the last mark, ``synthesizing`` becomes False and nothing is spoken.
+        '''
+        if not self.synthesizing:
+            return
+        # The remaining chunks must be computed before last_mark is reset,
+        # because chunks_from_last_mark() searches for the current last_mark
+        self.current_chunks = self.chunks_from_last_mark()
+        self.current_chunk_idx = -100
+        self.last_mark = -1
+        self.next_start_is_resume = True
+        self.synthesizing = bool(self.current_chunks)
+        if self.current_chunks:
+            self.current_chunk_idx = 0
+            self.speak_current_chunk()
+
    def change_rate(self, steps=1):
        rate = current_rate = self.settings.get('rate', self.default_system_rate)
        if rate < 1:
--- a/src/calibre/gui2/tts/windows_config.py
+++ b/src/calibre/gui2/tts/windows_config.py
@ -0,0 +1,196 @@
+#!/usr/bin/env python
+# License: GPL v3 Copyright: 2020, Kovid Goyal <kovid at kovidgoyal.net>
+
+from contextlib import suppress
+from qt.core import (
+    QAbstractItemView, QAbstractTableModel, QByteArray, QComboBox, QFontMetrics,
+    QFormLayout, QItemSelectionModel, QSlider, QSortFilterProxyModel, Qt, QTableView,
+    QWidget
+)
+
+from calibre.gui2.widgets import BusyCursor
+
+
+class VoicesModel(QAbstractTableModel):
+    '''
+    Table model with one row per voice plus a leading row that stands for
+    the system default voice. The voice id is exposed via the UserRole.
+    '''
+
+    system_default_voice = '__default__'
+
+    def __init__(self, voice_data, parent=None):
+        super().__init__(parent)
+        self.voice_data = voice_data
+        # (name, language, gender, id) per voice; the id is not a visible column
+        self.current_voices = tuple((v.display_name, v.language, v.gender, v.id) for v in voice_data)
+        self.column_headers = _('Name'), _('Language'), _('Gender')
+
+    def rowCount(self, parent=None):
+        # The extra row is the "System default" entry at row 0
+        return 1 + len(self.current_voices)
+
+    def columnCount(self, parent=None):
+        return len(self.column_headers)
+
+    def headerData(self, section, orientation, role=Qt.ItemDataRole.DisplayRole):
+        wants_column_title = role == Qt.ItemDataRole.DisplayRole and orientation == Qt.Orientation.Horizontal
+        if not wants_column_title:
+            return super().headerData(section, orientation, role)
+        return self.column_headers[section]
+
+    def data(self, index, role=Qt.ItemDataRole.DisplayRole):
+        if role == Qt.ItemDataRole.DisplayRole:
+            row = index.row()
+            # An out of range row or column yields None rather than an error
+            with suppress(IndexError):
+                column = index.column()
+                if row == 0:
+                    return (_('System default'), '', '', '')[column]
+                return self.current_voices[row - 1][column] or ''
+        elif role == Qt.ItemDataRole.UserRole:
+            row = index.row()
+            with suppress(IndexError):
+                return self.system_default_voice if row == 0 else self.current_voices[row - 1][3]
+
+    def index_for_voice(self, v):
+        '''Return the model index of the row for voice id ``v``, or None if there is no such voice.'''
+        if v == self.system_default_voice:
+            return self.index(0, 0)
+        for row, voice in enumerate(self.current_voices, 1):
+            if voice[3] == v:
+                return self.index(row, 0)
+        return None
+
+
+class Widget(QWidget):
+    '''
+    Configuration widget with a speed slider, a sortable table of voices and
+    a combo box of sound outputs.
+
+    The current choices are read and written as a dict through the
+    ``backend_settings`` property, using the keys ``voice``, ``rate`` and
+    ``sound_output``. Choices equal to the system default are left out of
+    the dict.
+    '''
+
+    def __init__(self, tts_client, initial_backend_settings=None, parent=None):
+        QWidget.__init__(self, parent)
+        self.l = l = QFormLayout(self)
+        self.tts_client = tts_client
+
+        # Fetch everything needed from the client up front, under a busy cursor
+        with BusyCursor():
+            self.voice_data = self.tts_client.get_all_voices()
+            self.default_system_rate = self.tts_client.default_system_rate
+            self.all_sound_outputs = self.tts_client.get_all_audio_devices()
+            self.default_system_audio_device = self.tts_client.default_system_audio_device
+
+        # The slider works in integer steps of 1/100 of the rate
+        self.speed = s = QSlider(Qt.Orientation.Horizontal, self)
+        s.setMinimumWidth(200)
+        l.addRow(_('&Speed of speech:'), s)
+        s.setRange(int(self.tts_client.min_rate * 100), int(100 * self.tts_client.max_rate))
+        s.setSingleStep(10)
+        s.setPageStep(40)
+
+        # The view shows the voices through a proxy model so they can be sorted
+        self.voices = v = QTableView(self)
+        self.voices_model = VoicesModel(self.voice_data, parent=v)
+        self.proxy_model = p = QSortFilterProxyModel(self)
+        p.setFilterCaseSensitivity(Qt.CaseSensitivity.CaseInsensitive)
+        p.setSourceModel(self.voices_model)
+        v.setModel(p)
+        v.setSelectionBehavior(QAbstractItemView.SelectionBehavior.SelectRows)
+        v.setSortingEnabled(True)
+        v.horizontalHeader().resizeSection(0, QFontMetrics(self.font()).averageCharWidth() * 25)
+        v.horizontalHeader().resizeSection(1, QFontMetrics(self.font()).averageCharWidth() * 30)
+        v.verticalHeader().close()
+        v.setSelectionMode(QAbstractItemView.SelectionMode.SingleSelection)
+        v.sortByColumn(0, Qt.SortOrder.AscendingOrder)
+        l.addRow(v)
+
+        # The first entry carries an empty tuple as its data, which the
+        # backend_settings getter treats as "no explicit sound output"
+        self.sound_outputs = so = QComboBox(self)
+        so.addItem(_('System default'), ())
+        for x in self.all_sound_outputs:
+            so.addItem(x.name, x.spec())
+        l.addRow(_('Sound output:'), so)
+
+        # Goes through the property setter, which updates all three controls
+        self.backend_settings = initial_backend_settings or {}
+
+    def restore_state(self, prefs):
+        '''Restore the voice table header state stored in ``prefs``, if there is one.'''
+        data = prefs.get(f'{self.tts_client.name}-voice-table-state')
+        if data is not None:
+            self.voices.horizontalHeader().restoreState(QByteArray(data))
+
+    def save_state(self, prefs):
+        '''Store the voice table header state in ``prefs``.'''
+        data = bytearray(self.voices.horizontalHeader().saveState())
+        prefs.set(f'{self.tts_client.name}-voice-table-state', data)
+
+    def restore_to_defaults(self):
+        '''Reset all three controls to the system default choices.'''
+        self.backend_settings = {}
+
+    def sizeHint(self):
+        # Never suggest less than 500x600
+        ans = super().sizeHint()
+        ans.setHeight(max(ans.height(), 600))
+        ans.setWidth(max(ans.width(), 500))
+        return ans
+
+    @property
+    def selected_voice(self):
+        '''The UserRole data of the first selected index, or None if nothing is selected.'''
+        for x in self.voices.selectedIndexes():
+            return x.data(Qt.ItemDataRole.UserRole)
+
+    @selected_voice.setter
+    def selected_voice(self, val):
+        val = val or VoicesModel.system_default_voice
+        idx = self.voices_model.index_for_voice(val)
+        # An unknown voice leaves the current selection untouched
+        if idx is not None:
+            idx = self.proxy_model.mapFromSource(idx)
+            self.voices.selectionModel().select(idx, QItemSelectionModel.SelectionFlag.ClearAndSelect | QItemSelectionModel.SelectionFlag.Rows)
+            self.voices.scrollTo(idx)
+
+    @property
+    def rate(self):
+        '''The slider value divided by 100.'''
+        return self.speed.value() / 100
+
+    @rate.setter
+    def rate(self, val):
+        # A falsy value selects the system default rate
+        val = int((val or self.default_system_rate) * 100)
+        self.speed.setValue(val)
+
+    @property
+    def sound_output(self):
+        '''The data of the currently selected sound output entry.'''
+        return self.sound_outputs.currentData()
+
+    @sound_output.setter
+    def sound_output(self, val):
+        # Fall back to the first (system default) entry when val is empty or not found
+        idx = 0
+        if val:
+            q = self.sound_outputs.findData(val)
+            if q > -1:
+                idx = q
+        self.sound_outputs.setCurrentIndex(idx)
+
+    @property
+    def backend_settings(self):
+        '''A dict of the current choices, containing only those that differ from the system defaults.'''
+        ans = {}
+        voice = self.selected_voice
+        if voice and voice != VoicesModel.system_default_voice:
+            ans['voice'] = voice
+        rate = self.rate
+        if rate and rate != self.default_system_rate:
+            ans['rate'] = rate
+        so = self.sound_output
+        if so:
+            ans['sound_output'] = so
+        return ans
+
+    @backend_settings.setter
+    def backend_settings(self, val):
+        voice = val.get('voice') or VoicesModel.system_default_voice
+        self.selected_voice = voice
+        self.rate = val.get('rate', self.default_system_rate)
+        self.sound_output = val.get('sound_output') or ()
+
+
+def develop():
+    '''Show the config widget on its own and print the chosen settings once the event loop exits.'''
+    from calibre.gui2 import Application
+    from calibre.gui2.tts.implementation import Client
+    app = Application([])
+    widget = Widget(Client(), {})
+    widget.show()
+    app.exec()
+    print(widget.backend_settings)
+
+
+if __name__ == '__main__':
+    develop()
--- a/src/calibre/utils/windows/winspeech.cpp
+++ b/src/calibre/utils/windows/winspeech.cpp
@ -756,6 +756,9 @@ static const std::unordered_map<std::string, handler_function> handlers = {
        bool found = false;
        if (parts.size()) {
            auto voice_id = winrt::hstring(parts.at(0));
+            if (voice_id == L"__default__") {
+                voice_id = SpeechSynthesizer::DefaultVoice().Id();
+            }
            for (auto const &candidate : SpeechSynthesizer::AllVoices()) {
                if (candidate.Id() == voice_id) {
                    speech_synthesizer.Voice(candidate);
@ -765,8 +768,8 @@ static const std::unordered_map<std::string, handler_function> handlers = {
            }
        }
        auto x = speech_synthesizer.Voice();
-        if (x) output(cmd_id, "voice", {{"value", speech_synthesizer.Voice()}, {"found", found}});
-        else output(cmd_id, "voice", {{"value", ""}, {"found", found}});
+        if (x) output(cmd_id, "voice", {{"voice", speech_synthesizer.Voice()}, {"found", found}});
+        else output(cmd_id, "voice", {{"voice", ""}, {"found", found}});
    }},

    {"volume", [](id_type cmd_id, std::vector<std::wstring_view> parts, int64_t*) {
--- a/src/calibre/utils/windows/winspeech.py
+++ b/src/calibre/utils/windows/winspeech.py
@ -12,7 +12,7 @@ from itertools import count
 from queue import Empty, Queue
 from threading import Thread
 from time import monotonic
-from typing import NamedTuple, Tuple
+from typing import NamedTuple, Tuple, Optional

 from calibre.constants import DEBUG
 from calibre.utils.ipc.simple_worker import start_pipe_worker
@ -101,11 +101,12 @@ class SpeechError(OSError):
            val += f'{msg}. '
        val += err.msg + ': ' + err.error + f'\nFile: {err.file} Line: {err.line}'
        if err.hr:
+            # List of mediaserver errors is here: https://www.hresult.info/FACILITY_MEDIASERVER
            val += f' HRESULT: 0x{err.hr:x}'
        super().__init__(val)


-class NoAudioDevices(Exception):
+class NoAudioDevices(OSError):
    def __init__(self):
        super().__init__(_('No active audio output devices found.'
                           ' Connect headphones or speakers. If you are using Remote Desktop then enable Remote Audio for it.'))
@ -212,7 +213,7 @@ class DefaultVoice(NamedTuple):

 class Voice(NamedTuple):
+    # Parsed form of a 'voice' message. voice is None when the message
+    # carried an empty voice value.
    related_to: int
-    voice: VoiceInformation
+    voice: Optional[VoiceInformation]
    found: bool = True


@ -223,13 +224,21 @@ class DeviceInformation(NamedTuple):
    is_default: bool
    is_enabled: bool

+    def spec(self) -> Tuple[str, str]:
+        '''Return the ``(kind, id)`` pair of this device.'''
+        return (self.kind, self.id)
+

 class AudioDevice(NamedTuple):
+    # Parsed form of an 'audio_device' message. device is None when the
+    # message carried an empty device value.
    related_to: int
-    device: DeviceInformation
+    device: Optional[DeviceInformation]
    found: bool = True


+class AllAudioDevices(NamedTuple):
+    # Parsed form of an 'all_audio_devices' message
+    related_to: int
+    devices: Tuple[DeviceInformation, ...]
+
+
 class AllVoices(NamedTuple):
    related_to: int
    voices: Tuple[VoiceInformation, ...]
@ -301,11 +310,18 @@ def parse_message(line):
        return AllVoices(**ans)
    if msg_type == 'all_audio_devices':
        ans['devices'] = tuple(DeviceInformation(**x) for x in ans['devices'])
-        return AudioDevice(**ans)
+        return AllAudioDevices(**ans)
    if msg_type == 'audio_device':
+        # DeviceInformation has several required fields, so the decoded
+        # device has to be unpacked into keyword arguments, as is done for
+        # the entries of 'all_audio_devices'. An empty value means no device.
+        if ans['device']:
+            ans['device'] = DeviceInformation(**ans['device'])
+        else:
+            ans['device'] = None
        return AudioDevice(**ans)
    if msg_type == 'voice':
+        if ans['voice']:
            ans['voice'] = VoiceInformation(**ans['voice'])
+        else:
+            ans['voice'] = None
        return Voice(**ans)
    if msg_type == 'volume':
        return Volume(**ans)
@ -357,7 +373,7 @@ class WinSpeech:
                line = line.strip()
                if DEBUG:
                    with suppress(Exception):
-                        print('winspeech:', line.decode('utf-8', 'replace'), flush=True)
+                        print('winspeech:\x1b[32m<-\x1b[39m', line.decode('utf-8', 'replace'), flush=True)
                send_msg(parse_message(line))
        except OSError as e:
            send_msg(Error('Failed to read from worker', str(e)))
@ -367,7 +383,11 @@ class WinSpeech:
    def send_command(self, cmd):
+        # Prefix cmd with the next id from msg_id_counter, write it as a
+        # single UTF-8 encoded line to the worker's stdin and return the id.
+        # Callers pass the returned id to wait_for() as related_to.
        cmd_id = next(self.msg_id_counter)
        w = self.worker
-        w.stdin.write(f'{cmd_id} {cmd}\n'.encode('utf-8'))
+        cmd = f'{cmd_id} {cmd}'
+        if DEBUG:
+            # Failure to print the debug trace must never break sending
+            with suppress(Exception):
+                print('winspeech:\x1b[31m->\x1b[39m', cmd, flush=True)
+        w.stdin.write(f'{cmd}\n'.encode('utf-8'))
        w.stdin.flush()
        return cmd_id

@ -410,6 +430,38 @@ class WinSpeech:
    def play(self):
        self.wait_for('play', Play, related_to=self.send_command('play'))

+    def set_rate(self, val):
+        '''Send a ``rate`` command with ``val`` converted to float and wait for the ``Rate`` reply.'''
+        cmd_id = self.send_command(f'rate {float(val)}')
+        self.wait_for('Setting the rate', Rate, related_to=cmd_id)
+
+    def set_voice(self, spec, default_system_voice):
+        '''
+        Send a ``voice`` command and wait for the ``Voice`` reply.
+
+        A falsy ``spec`` is replaced by the ``id`` attribute of
+        ``default_system_voice``, or by ``'__default__'`` if it has none.
+        Raises KeyError if the reply reports that the voice was not found.
+        '''
+        voice_id = spec or getattr(default_system_voice, 'id', '__default__')
+        cmd_id = self.send_command(f'voice {voice_id}')
+        reply = self.wait_for('Setting the voice', Voice, related_to=cmd_id)
+        if reply.found:
+            return
+        raise KeyError(f'Failed to find the voice: {voice_id}')
+
+    def set_audio_device(self, spec, default_system_audio_device):
+        '''
+        Send an ``audio_device`` command and wait for the ``AudioDevice`` reply.
+
+        A falsy ``spec`` is replaced by ``default_system_audio_device.spec()``;
+        if that device is falsy as well, nothing is sent. Raises KeyError if
+        the reply reports that the device was not found.
+        '''
+        if not spec:
+            if not default_system_audio_device:
+                return
+            spec = default_system_audio_device.spec()
+        cmd_id = self.send_command(f'audio_device {spec[0]} {spec[1]}')
+        reply = self.wait_for('Setting the audio device', AudioDevice, related_to=cmd_id)
+        if reply.found:
+            return
+        raise KeyError(f'Failed to find the audio device: {spec}')
+
+    def get_audio_device(self):
+        '''Send a bare ``audio_device`` command and return the ``AudioDevice`` reply.'''
+        cmd_id = self.send_command('audio_device')
+        return self.wait_for('Audio device', AudioDevice, related_to=cmd_id)
+
+    def default_voice(self):
+        '''Send a ``default_voice`` command and return the ``DefaultVoice`` reply.'''
+        cmd_id = self.send_command('default_voice')
+        return self.wait_for('Default voice', DefaultVoice, related_to=cmd_id)
+
+    def all_voices(self):
+        '''Send an ``all_voices`` command and return the ``AllVoices`` reply.'''
+        cmd_id = self.send_command('all_voices')
+        return self.wait_for('All voices', AllVoices, related_to=cmd_id)
+
+    def all_audio_devices(self):
+        '''Send an ``all_audio_devices`` command and return the ``AllAudioDevices`` reply.'''
+        cmd_id = self.send_command('all_audio_devices')
+        return self.wait_for('All audio devices', AllAudioDevices, related_to=cmd_id)
+
+

 # develop {{{
 def develop_loop(*commands):