Port config to winspeech

This commit is contained in:
Kovid Goyal 2023-02-02 11:22:46 +05:30
parent f64b9e3e2c
commit c8e9f33736
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
4 changed files with 321 additions and 18 deletions

View File

@ -52,8 +52,16 @@ class Client:
self.synthesizing = False self.synthesizing = False
self.settings = settings or {} self.settings = settings or {}
self.clear_chunks() self.clear_chunks()
self.default_system_audio_device = self.backend.get_audio_device().device
self.default_system_voice = self.backend.default_voice().voice
self.apply_settings() self.apply_settings()
def get_all_voices(self):
    """Ask the backend for every available voice and return them."""
    reply = self.backend.all_voices()
    return reply.voices
def get_all_audio_devices(self):
    """Ask the backend for every audio output device and return them."""
    reply = self.backend.all_audio_devices()
    return reply.devices
def __del__(self): def __del__(self):
if self.backend is not None: if self.backend is not None:
self.backend.shutdown() self.backend.shutdown()
@ -63,6 +71,9 @@ class Client:
def dispatch_msg(self, msg): def dispatch_msg(self, msg):
self.dispatch_on_main_thread(partial(self.handle_event, msg)) self.dispatch_on_main_thread(partial(self.handle_event, msg))
def speak_current_chunk(self):
    """Send the chunk at current_chunk_idx to the backend, cued so that
    MarkReached events are generated for it."""
    chunk = self.current_chunks[self.current_chunk_idx]
    self.backend.speak(chunk, is_cued=True)
def handle_event(self, x): def handle_event(self, x):
if isinstance(x, MarkReached) and self.current_chunks: if isinstance(x, MarkReached) and self.current_chunks:
self.last_mark = x.id self.last_mark = x.id
@ -74,7 +85,7 @@ class Client:
self.callback_ignoring_errors(Event(EventType.end)) self.callback_ignoring_errors(Event(EventType.end))
else: else:
self.current_chunk_idx += 1 self.current_chunk_idx += 1
self.backend.speak(self.current_chunks[self.current_chunk_idx], is_cued=True) self.speak_current_chunk()
elif x.state is MediaState.failed: elif x.state is MediaState.failed:
self.clear_chunks() self.clear_chunks()
self.callback_ignoring_errors(Event(EventType.cancel)) self.callback_ignoring_errors(Event(EventType.cancel))
@ -82,7 +93,8 @@ class Client:
e.display_to_user = True e.display_to_user = True
raise e raise e
elif x.state is MediaState.opened: elif x.state is MediaState.opened:
self.callback_ignoring_errors(Event(EventType.begin)) self.callback_ignoring_errors(Event(EventType.resume if self.next_start_is_resume else EventType.begin))
self.next_start_is_resume = False
elif isinstance(x, Error): elif isinstance(x, Error):
raise x.as_exception(check_for_no_audio_devices=True) raise x.as_exception(check_for_no_audio_devices=True)
else: else:
@ -98,12 +110,11 @@ class Client:
self.clear_chunks() self.clear_chunks()
self.current_callback = callback self.current_callback = callback
self.current_chunks = tuple(split_into_chunks(text, self.chunk_size)) self.current_chunks = tuple(split_into_chunks(text, self.chunk_size))
self.current_chunk_idx = 0 self.current_chunk_idx = -100
if self.current_chunks: if self.current_chunks:
self.backend.speak(self.current_chunks[self.current_chunk_idx], is_cued=True) self.current_chunk_idx = 0
self.speak_current_chunk()
self.synthesizing = True self.synthesizing = True
if self.current_callback is not None:
self.current_callback(Event(EventType.begin))
def callback_ignoring_errors(self, ev): def callback_ignoring_errors(self, ev):
if self.current_callback is not None: if self.current_callback is not None:
@ -115,8 +126,9 @@ class Client:
def clear_chunks(self): def clear_chunks(self):
self.synthesizing = False self.synthesizing = False
self.next_start_is_resume = False
self.current_chunk_idx = -100 self.current_chunk_idx = -100
self.current_chunks = [] self.current_chunks = ()
self.last_mark = -1 self.last_mark = -1
def stop(self): def stop(self):
@ -138,12 +150,52 @@ class Client:
self.current_callback(Event(EventType.resume)) self.current_callback(Event(EventType.resume))
def apply_settings(self, new_settings=None): def apply_settings(self, new_settings=None):
pass if self.synthesizing:
self.stop()
if new_settings is not None:
self.settings = new_settings
try:
self.backend.set_rate(self.settings.get('rate', self.default_system_rate))
except OSError:
self.settings.pop('rate', None)
try:
self.backend.set_voice(self.settings.get('voice'), self.default_system_voice)
except OSError:
self.settings.pop('voice', None)
try:
self.backend.set_audio_device(self.settings.get('sound_output'), self.default_system_audio_device)
except OSError:
self.settings.pop('sound_output', None)
def config_widget(self, backend_settings, parent): def config_widget(self, backend_settings, parent):
from calibre.gui2.tts.windows_config import Widget from calibre.gui2.tts.windows_config import Widget
return Widget(self, backend_settings, parent) return Widget(self, backend_settings, parent)
def chunks_from_last_mark(self):
    """Return the tuple of chunks still to be spoken, starting just after
    the position of last_mark. Returns () when last_mark is not found in
    any chunk."""
    for idx, chunk in enumerate(self.current_chunks):
        for pos, mark in enumerate(chunk):
            if mark != self.last_mark:
                continue
            remainder = chunk[pos + 1:]
            tail = self.current_chunks[idx + 1:]
            # Drop the containing chunk entirely if the mark was its last item
            if remainder:
                return (remainder,) + tail
            return tail
    return ()
def resume_after_configure(self):
    """Restart speech after settings were changed, resuming from just after
    the last mark that was reached. No-op when nothing was being spoken."""
    if not self.synthesizing:
        return
    # BUGFIX: compute the remaining chunks BEFORE resetting last_mark.
    # The original reset last_mark to the -1 sentinel first, which made
    # chunks_from_last_mark() search for a mark id of -1 and always return
    # (), so resuming after a configuration change silently stopped speech.
    self.current_chunks = self.chunks_from_last_mark()
    self.current_chunk_idx = -100
    self.last_mark = -1
    self.next_start_is_resume = True
    self.synthesizing = bool(self.current_chunks)
    if self.current_chunks:
        self.current_chunk_idx = 0
        self.speak_current_chunk()
def change_rate(self, steps=1): def change_rate(self, steps=1):
rate = current_rate = self.settings.get('rate', self.default_system_rate) rate = current_rate = self.settings.get('rate', self.default_system_rate)
if rate < 1: if rate < 1:

View File

@ -0,0 +1,196 @@
#!/usr/bin/env python
# License: GPL v3 Copyright: 2020, Kovid Goyal <kovid at kovidgoyal.net>
from contextlib import suppress
from qt.core import (
QAbstractItemView, QAbstractTableModel, QByteArray, QComboBox, QFontMetrics,
QFormLayout, QItemSelectionModel, QSlider, QSortFilterProxyModel, Qt, QTableView,
QWidget
)
from calibre.gui2.widgets import BusyCursor
class VoicesModel(QAbstractTableModel):
    """Table model over the available TTS voices, with a synthetic
    'System default' entry occupying row 0."""

    system_default_voice = '__default__'

    def __init__(self, voice_data, parent=None):
        super().__init__(parent)
        self.voice_data = voice_data
        self.current_voices = tuple((x.display_name, x.language, x.gender, x.id) for x in voice_data)
        self.column_headers = _('Name'), _('Language'), _('Gender')

    def rowCount(self, parent=None):
        # +1 accounts for the synthetic 'System default' row
        return 1 + len(self.current_voices)

    def columnCount(self, parent=None):
        return len(self.column_headers)

    def headerData(self, section, orientation, role=Qt.ItemDataRole.DisplayRole):
        if orientation == Qt.Orientation.Horizontal and role == Qt.ItemDataRole.DisplayRole:
            return self.column_headers[section]
        return super().headerData(section, orientation, role)

    def data(self, index, role=Qt.ItemDataRole.DisplayRole):
        row, col = index.row(), index.column()
        if role == Qt.ItemDataRole.DisplayRole:
            with suppress(IndexError):
                if row == 0:
                    return (_('System default'), '', '', '')[col]
                return self.current_voices[row - 1][col] or ''
        elif role == Qt.ItemDataRole.UserRole:
            # UserRole carries the voice id used by the backend
            with suppress(IndexError):
                if row == 0:
                    return self.system_default_voice
                return self.current_voices[row - 1][3]

    def index_for_voice(self, v):
        """Return the model index for the voice with id v, or None if absent."""
        if v == self.system_default_voice:
            return self.index(0, 0)
        for i, entry in enumerate(self.current_voices):
            if entry[3] == v:
                return self.index(i + 1, 0)
        return None
class Widget(QWidget):
    """Configuration UI for the winspeech TTS backend: speech rate slider,
    voice table and sound-output selector. The chosen configuration is
    exposed through the backend_settings property, which only contains
    values that differ from the system defaults."""

    def __init__(self, tts_client, initial_backend_settings=None, parent=None):
        QWidget.__init__(self, parent)
        self.l = l = QFormLayout(self)
        self.tts_client = tts_client

        # Querying the backend worker can take a moment
        with BusyCursor():
            self.voice_data = self.tts_client.get_all_voices()
            self.default_system_rate = self.tts_client.default_system_rate
            self.all_sound_outputs = self.tts_client.get_all_audio_devices()
            self.default_system_audio_device = self.tts_client.default_system_audio_device

        self.speed = s = QSlider(Qt.Orientation.Horizontal, self)
        s.setMinimumWidth(200)
        l.addRow(_('&Speed of speech:'), s)
        # Slider operates in integer hundredths of the client's rate range
        s.setRange(int(self.tts_client.min_rate * 100), int(100 * self.tts_client.max_rate))
        s.setSingleStep(10)
        s.setPageStep(40)

        self.voices = v = QTableView(self)
        self.voices_model = VoicesModel(self.voice_data, parent=v)
        self.proxy_model = p = QSortFilterProxyModel(self)
        p.setFilterCaseSensitivity(Qt.CaseSensitivity.CaseInsensitive)
        p.setSourceModel(self.voices_model)
        v.setModel(p)
        v.setSelectionBehavior(QAbstractItemView.SelectionBehavior.SelectRows)
        v.setSortingEnabled(True)
        v.horizontalHeader().resizeSection(0, QFontMetrics(self.font()).averageCharWidth() * 25)
        v.horizontalHeader().resizeSection(1, QFontMetrics(self.font()).averageCharWidth() * 30)
        # BUGFIX: the original called verticalHeader().close() twice in a
        # row; the duplicate was redundant and has been removed.
        v.verticalHeader().close()
        v.setSelectionMode(QAbstractItemView.SelectionMode.SingleSelection)
        v.sortByColumn(0, Qt.SortOrder.AscendingOrder)
        l.addRow(v)

        self.sound_outputs = so = QComboBox(self)
        # Empty tuple as item data denotes the system default device
        so.addItem(_('System default'), ())
        for x in self.all_sound_outputs:
            so.addItem(x.name, x.spec())
        l.addRow(_('Sound output:'), so)

        self.backend_settings = initial_backend_settings or {}

    def restore_state(self, prefs):
        """Restore the voice table header layout previously saved in prefs."""
        data = prefs.get(f'{self.tts_client.name}-voice-table-state')
        if data is not None:
            self.voices.horizontalHeader().restoreState(QByteArray(data))

    def save_state(self, prefs):
        """Persist the voice table header layout into prefs."""
        data = bytearray(self.voices.horizontalHeader().saveState())
        prefs.set(f'{self.tts_client.name}-voice-table-state', data)

    def restore_to_defaults(self):
        """Reset every control back to the system default values."""
        self.backend_settings = {}

    def sizeHint(self):
        ans = super().sizeHint()
        ans.setHeight(max(ans.height(), 600))
        ans.setWidth(max(ans.width(), 500))
        return ans

    @property
    def selected_voice(self):
        """Voice id of the current table selection, or None if none selected."""
        for x in self.voices.selectedIndexes():
            return x.data(Qt.ItemDataRole.UserRole)

    @selected_voice.setter
    def selected_voice(self, val):
        val = val or VoicesModel.system_default_voice
        idx = self.voices_model.index_for_voice(val)
        if idx is not None:
            idx = self.proxy_model.mapFromSource(idx)
            self.voices.selectionModel().select(idx, QItemSelectionModel.SelectionFlag.ClearAndSelect | QItemSelectionModel.SelectionFlag.Rows)
            self.voices.scrollTo(idx)

    @property
    def rate(self):
        """Speech rate as a float: slider value / 100."""
        return self.speed.value() / 100

    @rate.setter
    def rate(self, val):
        val = int((val or self.default_system_rate) * 100)
        self.speed.setValue(val)

    @property
    def sound_output(self):
        """Selected device spec, or the empty tuple for the system default."""
        return self.sound_outputs.currentData()

    @sound_output.setter
    def sound_output(self, val):
        idx = 0
        if val:
            q = self.sound_outputs.findData(val)
            if q > -1:
                idx = q
        self.sound_outputs.setCurrentIndex(idx)

    @property
    def backend_settings(self):
        """Dict of only the settings that differ from the system defaults."""
        ans = {}
        voice = self.selected_voice
        if voice and voice != VoicesModel.system_default_voice:
            ans['voice'] = voice
        rate = self.rate
        if rate and rate != self.default_system_rate:
            ans['rate'] = rate
        so = self.sound_output
        if so:
            ans['sound_output'] = so
        return ans

    @backend_settings.setter
    def backend_settings(self, val):
        voice = val.get('voice') or VoicesModel.system_default_voice
        self.selected_voice = voice
        self.rate = val.get('rate', self.default_system_rate)
        self.sound_output = val.get('sound_output') or ()
def develop():
    """Manual test harness: run the config widget standalone and print the
    settings it produces on exit."""
    from calibre.gui2 import Application
    from calibre.gui2.tts.implementation import Client
    application = Application([])
    client = Client()
    widget = Widget(client, {})
    widget.show()
    application.exec()
    print(widget.backend_settings)


if __name__ == '__main__':
    develop()

View File

@ -756,6 +756,9 @@ static const std::unordered_map<std::string, handler_function> handlers = {
bool found = false; bool found = false;
if (parts.size()) { if (parts.size()) {
auto voice_id = winrt::hstring(parts.at(0)); auto voice_id = winrt::hstring(parts.at(0));
if (voice_id == L"__default__") {
voice_id = SpeechSynthesizer::DefaultVoice().Id();
}
for (auto const &candidate : SpeechSynthesizer::AllVoices()) { for (auto const &candidate : SpeechSynthesizer::AllVoices()) {
if (candidate.Id() == voice_id) { if (candidate.Id() == voice_id) {
speech_synthesizer.Voice(candidate); speech_synthesizer.Voice(candidate);
@ -765,8 +768,8 @@ static const std::unordered_map<std::string, handler_function> handlers = {
} }
} }
auto x = speech_synthesizer.Voice(); auto x = speech_synthesizer.Voice();
if (x) output(cmd_id, "voice", {{"value", speech_synthesizer.Voice()}, {"found", found}}); if (x) output(cmd_id, "voice", {{"voice", speech_synthesizer.Voice()}, {"found", found}});
else output(cmd_id, "voice", {{"value", ""}, {"found", found}}); else output(cmd_id, "voice", {{"voice", ""}, {"found", found}});
}}, }},
{"volume", [](id_type cmd_id, std::vector<std::wstring_view> parts, int64_t*) { {"volume", [](id_type cmd_id, std::vector<std::wstring_view> parts, int64_t*) {

View File

@ -12,7 +12,7 @@ from itertools import count
from queue import Empty, Queue from queue import Empty, Queue
from threading import Thread from threading import Thread
from time import monotonic from time import monotonic
from typing import NamedTuple, Tuple from typing import NamedTuple, Tuple, Optional
from calibre.constants import DEBUG from calibre.constants import DEBUG
from calibre.utils.ipc.simple_worker import start_pipe_worker from calibre.utils.ipc.simple_worker import start_pipe_worker
@ -101,11 +101,12 @@ class SpeechError(OSError):
val += f'{msg}. ' val += f'{msg}. '
val += err.msg + ': ' + err.error + f'\nFile: {err.file} Line: {err.line}' val += err.msg + ': ' + err.error + f'\nFile: {err.file} Line: {err.line}'
if err.hr: if err.hr:
# List of mediaserver errors is here: https://www.hresult.info/FACILITY_MEDIASERVER
val += f' HRESULT: 0x{err.hr:x}' val += f' HRESULT: 0x{err.hr:x}'
super().__init__(val) super().__init__(val)
class NoAudioDevices(Exception): class NoAudioDevices(OSError):
def __init__(self): def __init__(self):
super().__init__(_('No active audio output devices found.' super().__init__(_('No active audio output devices found.'
' Connect headphones or speakers. If you are using Remote Desktop then enable Remote Audio for it.')) ' Connect headphones or speakers. If you are using Remote Desktop then enable Remote Audio for it.'))
@ -212,7 +213,7 @@ class DefaultVoice(NamedTuple):
class Voice(NamedTuple): class Voice(NamedTuple):
related_to: int related_to: int
voice: VoiceInformation voice: Optional[VoiceInformation]
found: bool = True found: bool = True
@ -223,13 +224,21 @@ class DeviceInformation(NamedTuple):
is_default: bool is_default: bool
is_enabled: bool is_enabled: bool
def spec(self) -> Tuple[str, str]:
    """Return the (kind, id) pair that identifies this audio device."""
    return (self.kind, self.id)
class AudioDevice(NamedTuple): class AudioDevice(NamedTuple):
related_to: int related_to: int
device: DeviceInformation device: Optional[DeviceInformation]
found: bool = True found: bool = True
class AllAudioDevices(NamedTuple):
    """Reply message listing every audio output device reported by the worker."""
    related_to: int  # id of the command this message is a reply to
    devices: Tuple[DeviceInformation, ...]
class AllVoices(NamedTuple): class AllVoices(NamedTuple):
related_to: int related_to: int
voices: Tuple[VoiceInformation, ...] voices: Tuple[VoiceInformation, ...]
@ -301,11 +310,18 @@ def parse_message(line):
return AllVoices(**ans) return AllVoices(**ans)
if msg_type == 'all_audio_devices': if msg_type == 'all_audio_devices':
ans['devices'] = tuple(DeviceInformation(**x) for x in ans['devices']) ans['devices'] = tuple(DeviceInformation(**x) for x in ans['devices'])
return AudioDevice(**ans) return AllAudioDevices(**ans)
if msg_type == 'audio_device': if msg_type == 'audio_device':
if ans['device']:
ans['device'] = DeviceInformation(ans['device'])
else:
ans['device'] = None
return AudioDevice(**ans) return AudioDevice(**ans)
if msg_type == 'voice': if msg_type == 'voice':
ans['voice'] = VoiceInformation(**ans['voice']) if ans['voice']:
ans['voice'] = VoiceInformation(**ans['voice'])
else:
ans['voice'] = None
return Voice(**ans) return Voice(**ans)
if msg_type == 'volume': if msg_type == 'volume':
return Volume(**ans) return Volume(**ans)
@ -357,7 +373,7 @@ class WinSpeech:
line = line.strip() line = line.strip()
if DEBUG: if DEBUG:
with suppress(Exception): with suppress(Exception):
print('winspeech:', line.decode('utf-8', 'replace'), flush=True) print('winspeech:\x1b[32m<-\x1b[39m', line.decode('utf-8', 'replace'), flush=True)
send_msg(parse_message(line)) send_msg(parse_message(line))
except OSError as e: except OSError as e:
send_msg(Error('Failed to read from worker', str(e))) send_msg(Error('Failed to read from worker', str(e)))
@ -367,7 +383,11 @@ class WinSpeech:
def send_command(self, cmd): def send_command(self, cmd):
cmd_id = next(self.msg_id_counter) cmd_id = next(self.msg_id_counter)
w = self.worker w = self.worker
w.stdin.write(f'{cmd_id} {cmd}\n'.encode('utf-8')) cmd = f'{cmd_id} {cmd}'
if DEBUG:
with suppress(Exception):
print('winspeech:\x1b[31m->\x1b[39m', cmd, flush=True)
w.stdin.write(f'{cmd}\n'.encode('utf-8'))
w.stdin.flush() w.stdin.flush()
return cmd_id return cmd_id
@ -410,6 +430,38 @@ class WinSpeech:
def play(self): def play(self):
self.wait_for('play', Play, related_to=self.send_command('play')) self.wait_for('play', Play, related_to=self.send_command('play'))
def set_rate(self, val):
    """Set the speech rate on the worker and wait for acknowledgement."""
    rate = float(val)
    cmd_id = self.send_command(f'rate {rate}')
    self.wait_for('Setting the rate', Rate, related_to=cmd_id)
def set_voice(self, spec, default_system_voice):
    """Select the voice identified by spec, falling back to the system
    default voice. Raises KeyError if the worker cannot find it."""
    voice_id = spec if spec else getattr(default_system_voice, 'id', '__default__')
    cmd_id = self.send_command(f'voice {voice_id}')
    response = self.wait_for('Setting the voice', Voice, related_to=cmd_id)
    if not response.found:
        raise KeyError(f'Failed to find the voice: {voice_id}')
def set_audio_device(self, spec, default_system_audio_device):
    """Route speech output to the device given by spec (a (kind, id) pair),
    falling back to the default system device. No-op when neither is
    available. Raises KeyError if the worker cannot find the device."""
    device_spec = spec or (default_system_audio_device.spec() if default_system_audio_device else None)
    if device_spec is None:
        return
    cmd_id = self.send_command(f'audio_device {device_spec[0]} {device_spec[1]}')
    response = self.wait_for('Setting the audio device', AudioDevice, related_to=cmd_id)
    if not response.found:
        raise KeyError(f'Failed to find the audio device: {device_spec}')
def get_audio_device(self):
    """Return the worker's reply describing the current audio device."""
    cmd_id = self.send_command('audio_device')
    return self.wait_for('Audio device', AudioDevice, related_to=cmd_id)
def default_voice(self):
    """Return the worker's reply describing the system default voice."""
    cmd_id = self.send_command('default_voice')
    return self.wait_for('Default voice', DefaultVoice, related_to=cmd_id)
def all_voices(self):
    """Return the worker's reply listing every available voice."""
    cmd_id = self.send_command('all_voices')
    return self.wait_for('All voices', AllVoices, related_to=cmd_id)
def all_audio_devices(self):
    """Return the worker's reply listing every audio output device."""
    cmd_id = self.send_command('all_audio_devices')
    return self.wait_for('All audio devices', AllAudioDevices, related_to=cmd_id)
# develop {{{ # develop {{{
def develop_loop(*commands): def develop_loop(*commands):