diff --git a/src/calibre/gui2/tts/windows.py b/src/calibre/gui2/tts/windows.py
index 2709a9d8e2..d430cbe3db 100644
--- a/src/calibre/gui2/tts/windows.py
+++ b/src/calibre/gui2/tts/windows.py
@@ -52,8 +52,16 @@ class Client:
         self.synthesizing = False
         self.settings = settings or {}
         self.clear_chunks()
+        self.default_system_audio_device = self.backend.get_audio_device().device
+        self.default_system_voice = self.backend.default_voice().voice
         self.apply_settings()
 
+    def get_all_voices(self):
+        return self.backend.all_voices().voices
+
+    def get_all_audio_devices(self):
+        return self.backend.all_audio_devices().devices
+
     def __del__(self):
         if self.backend is not None:
             self.backend.shutdown()
@@ -63,6 +71,9 @@ class Client:
     def dispatch_msg(self, msg):
         self.dispatch_on_main_thread(partial(self.handle_event, msg))
 
+    def speak_current_chunk(self):
+        self.backend.speak(self.current_chunks[self.current_chunk_idx], is_cued=True)
+
     def handle_event(self, x):
         if isinstance(x, MarkReached) and self.current_chunks:
             self.last_mark = x.id
@@ -74,7 +85,7 @@ class Client:
                     self.callback_ignoring_errors(Event(EventType.end))
                 else:
                     self.current_chunk_idx += 1
-                    self.backend.speak(self.current_chunks[self.current_chunk_idx], is_cued=True)
+                    self.speak_current_chunk()
             elif x.state is MediaState.failed:
                 self.clear_chunks()
                 self.callback_ignoring_errors(Event(EventType.cancel))
@@ -82,7 +93,8 @@ class Client:
                 e.display_to_user = True
                 raise e
             elif x.state is MediaState.opened:
-                self.callback_ignoring_errors(Event(EventType.begin))
+                self.callback_ignoring_errors(Event(EventType.resume if self.next_start_is_resume else EventType.begin))
+                self.next_start_is_resume = False
         elif isinstance(x, Error):
             raise x.as_exception(check_for_no_audio_devices=True)
         else:
@@ -98,12 +110,11 @@ class Client:
         self.clear_chunks()
         self.current_callback = callback
         self.current_chunks = tuple(split_into_chunks(text, self.chunk_size))
-        self.current_chunk_idx = 0
+        self.current_chunk_idx = -100
         if self.current_chunks:
-            self.backend.speak(self.current_chunks[self.current_chunk_idx], is_cued=True)
+            self.current_chunk_idx = 0
+            self.speak_current_chunk()
             self.synthesizing = True
-        if self.current_callback is not None:
-            self.current_callback(Event(EventType.begin))
 
     def callback_ignoring_errors(self, ev):
         if self.current_callback is not None:
@@ -115,8 +126,9 @@ class Client:
 
     def clear_chunks(self):
         self.synthesizing = False
+        self.next_start_is_resume = False
         self.current_chunk_idx = -100
-        self.current_chunks = []
+        self.current_chunks = ()
         self.last_mark = -1
 
     def stop(self):
@@ -138,12 +150,52 @@ class Client:
             self.current_callback(Event(EventType.resume))
 
     def apply_settings(self, new_settings=None):
-        pass
+        if self.synthesizing:
+            self.stop()
+        if new_settings is not None:
+            self.settings = new_settings
+        try:
+            self.backend.set_rate(self.settings.get('rate', self.default_system_rate))
+        except OSError:
+            self.settings.pop('rate', None)
+        try:
+            self.backend.set_voice(self.settings.get('voice'), self.default_system_voice)
+        except (OSError, KeyError):  # set_voice() raises KeyError when the voice is not found
+            self.settings.pop('voice', None)
+        try:
+            self.backend.set_audio_device(self.settings.get('sound_output'), self.default_system_audio_device)
+        except (OSError, KeyError):  # set_audio_device() raises KeyError when the device is not found
+            self.settings.pop('sound_output', None)
 
     def config_widget(self, backend_settings, parent):
         from calibre.gui2.tts.windows_config import Widget
         return Widget(self, backend_settings, parent)
 
+    def chunks_from_last_mark(self):
+        for i, chunk in enumerate(self.current_chunks):
+            for ci, x in enumerate(chunk):
+                if x == self.last_mark:
+                    chunks = self.current_chunks[i:]
+                    chunk = chunk[ci + 1:]
+                    if chunk:
+                        chunks = (chunk,) + chunks[1:]
+                    else:
+                        chunks = chunks[1:]
+                    return chunks
+        return ()
+
+    def resume_after_configure(self):
+        if not self.synthesizing:
+            return
+        self.current_chunks = self.chunks_from_last_mark()  # reads self.last_mark, so it must run before the reset below
+        self.current_chunk_idx = -100
+        self.last_mark = -1
+        self.next_start_is_resume = True
+        self.synthesizing = bool(self.current_chunks)
+        if self.current_chunks:
+            self.current_chunk_idx = 0
+            self.speak_current_chunk()
+
     def change_rate(self, steps=1):
         rate = current_rate = self.settings.get('rate', self.default_system_rate)
         if rate < 1:
diff --git a/src/calibre/gui2/tts/windows_config.py b/src/calibre/gui2/tts/windows_config.py
new file mode 100644
index 0000000000..727f17e18f
--- /dev/null
+++ b/src/calibre/gui2/tts/windows_config.py
@@ -0,0 +1,195 @@
+#!/usr/bin/env python
+# License: GPL v3 Copyright: 2020, Kovid Goyal
+
+from contextlib import suppress
+from qt.core import (
+    QAbstractItemView, QAbstractTableModel, QByteArray, QComboBox, QFontMetrics,
+    QFormLayout, QItemSelectionModel, QSlider, QSortFilterProxyModel, Qt, QTableView,
+    QWidget
+)
+
+from calibre.gui2.widgets import BusyCursor
+
+
+class VoicesModel(QAbstractTableModel):
+
+    system_default_voice = '__default__'
+
+    def __init__(self, voice_data, parent=None):
+        super().__init__(parent)
+        self.voice_data = voice_data
+        self.current_voices = tuple((x.display_name, x.language, x.gender, x.id) for x in voice_data)
+        self.column_headers = _('Name'), _('Language'), _('Gender')
+
+    def rowCount(self, parent=None):
+        return len(self.current_voices) + 1
+
+    def columnCount(self, parent=None):
+        return len(self.column_headers)
+
+    def headerData(self, section, orientation, role=Qt.ItemDataRole.DisplayRole):
+        if role == Qt.ItemDataRole.DisplayRole and orientation == Qt.Orientation.Horizontal:
+            return self.column_headers[section]
+        return super().headerData(section, orientation, role)
+
+    def data(self, index, role=Qt.ItemDataRole.DisplayRole):
+        if role == Qt.ItemDataRole.DisplayRole:
+            row = index.row()
+            with suppress(IndexError):
+                if row == 0:
+                    return (_('System default'), '', '', '')[index.column()]
+                data = self.current_voices[row - 1]
+                col = index.column()
+                ans = data[col] or ''
+                return ans
+        if role == Qt.ItemDataRole.UserRole:
+            row = index.row()
+            with suppress(IndexError):
+                if row == 0:
+                    return self.system_default_voice
+                return self.current_voices[row - 1][3]
+
+    def index_for_voice(self, v):
+        r = 0
+        if v != self.system_default_voice:
+            for i, x in enumerate(self.current_voices):
+                if x[3] == v:
+                    r = i + 1
+                    break
+            else:
+                return
+        return self.index(r, 0)
+
+
+class Widget(QWidget):
+
+    def __init__(self, tts_client, initial_backend_settings=None, parent=None):
+        QWidget.__init__(self, parent)
+        self.l = l = QFormLayout(self)
+        self.tts_client = tts_client
+
+        with BusyCursor():
+            self.voice_data = self.tts_client.get_all_voices()
+            self.default_system_rate = self.tts_client.default_system_rate
+            self.all_sound_outputs = self.tts_client.get_all_audio_devices()
+            self.default_system_audio_device = self.tts_client.default_system_audio_device
+
+        self.speed = s = QSlider(Qt.Orientation.Horizontal, self)
+        s.setMinimumWidth(200)
+        l.addRow(_('&Speed of speech:'), s)
+        s.setRange(int(self.tts_client.min_rate * 100), int(100 * self.tts_client.max_rate))
+        s.setSingleStep(10)
+        s.setPageStep(40)
+
+        self.voices = v = QTableView(self)
+        self.voices_model = VoicesModel(self.voice_data, parent=v)
+        self.proxy_model = p = QSortFilterProxyModel(self)
+        p.setFilterCaseSensitivity(Qt.CaseSensitivity.CaseInsensitive)
+        p.setSourceModel(self.voices_model)
+        v.setModel(p)
+        v.setSelectionBehavior(QAbstractItemView.SelectionBehavior.SelectRows)
+        v.setSortingEnabled(True)
+        v.horizontalHeader().resizeSection(0, QFontMetrics(self.font()).averageCharWidth() * 25)
+        v.horizontalHeader().resizeSection(1, QFontMetrics(self.font()).averageCharWidth() * 30)
+        v.verticalHeader().close()
+        v.setSelectionMode(QAbstractItemView.SelectionMode.SingleSelection)
+        v.sortByColumn(0, Qt.SortOrder.AscendingOrder)
+        l.addRow(v)
+
+        self.sound_outputs = so = QComboBox(self)
+        so.addItem(_('System default'), ())
+        for x in self.all_sound_outputs:
+            so.addItem(x.name, x.spec())
+        l.addRow(_('Sound output:'), so)
+
+        self.backend_settings = initial_backend_settings or {}
+
+    def restore_state(self, prefs):
+        data = prefs.get(f'{self.tts_client.name}-voice-table-state')
+        if data is not None:
+            self.voices.horizontalHeader().restoreState(QByteArray(data))
+
+    def save_state(self, prefs):
+        data = bytearray(self.voices.horizontalHeader().saveState())
+        prefs.set(f'{self.tts_client.name}-voice-table-state', data)
+
+    def restore_to_defaults(self):
+        self.backend_settings = {}
+
+    def sizeHint(self):
+        ans = super().sizeHint()
+        ans.setHeight(max(ans.height(), 600))
+        ans.setWidth(max(ans.width(), 500))
+        return ans
+
+    @property
+    def selected_voice(self):
+        for x in self.voices.selectedIndexes():
+            return x.data(Qt.ItemDataRole.UserRole)
+
+    @selected_voice.setter
+    def selected_voice(self, val):
+        val = val or VoicesModel.system_default_voice
+        idx = self.voices_model.index_for_voice(val)
+        if idx is not None:
+            idx = self.proxy_model.mapFromSource(idx)
+            self.voices.selectionModel().select(idx, QItemSelectionModel.SelectionFlag.ClearAndSelect | QItemSelectionModel.SelectionFlag.Rows)
+            self.voices.scrollTo(idx)
+
+    @property
+    def rate(self):
+        return self.speed.value() / 100
+
+    @rate.setter
+    def rate(self, val):
+        val = int((val or self.default_system_rate) * 100)
+        self.speed.setValue(val)
+
+    @property
+    def sound_output(self):
+        return self.sound_outputs.currentData()
+
+    @sound_output.setter
+    def sound_output(self, val):
+        idx = 0
+        if val:
+            q = self.sound_outputs.findData(val)
+            if q > -1:
+                idx = q
+        self.sound_outputs.setCurrentIndex(idx)
+
+    @property
+    def backend_settings(self):
+        ans = {}
+        voice = self.selected_voice
+        if voice and voice != VoicesModel.system_default_voice:
+            ans['voice'] = voice
+        rate = self.rate
+        if rate and rate != self.default_system_rate:
+            ans['rate'] = rate
+        so = self.sound_output
+        if so:
+            ans['sound_output'] = so
+        return ans
+
+    @backend_settings.setter
+    def backend_settings(self, val):
+        voice = val.get('voice') or VoicesModel.system_default_voice
+        self.selected_voice = voice
+        self.rate = val.get('rate', self.default_system_rate)
+        self.sound_output = val.get('sound_output') or ()
+
+
+def develop():
+    from calibre.gui2 import Application
+    from calibre.gui2.tts.implementation import Client
+    app = Application([])
+    c = Client()
+    w = Widget(c, {})
+    w.show()
+    app.exec()
+    print(w.backend_settings)
+
+
+if __name__ == '__main__':
+    develop()
diff --git a/src/calibre/utils/windows/winspeech.cpp b/src/calibre/utils/windows/winspeech.cpp
index 97d23d2534..b86020eb34 100644
--- a/src/calibre/utils/windows/winspeech.cpp
+++ b/src/calibre/utils/windows/winspeech.cpp
@@ -756,6 +756,9 @@ static const std::unordered_map handlers = {
         bool found = false;
         if (parts.size()) {
             auto voice_id = winrt::hstring(parts.at(0));
+            if (voice_id == L"__default__") {
+                voice_id = SpeechSynthesizer::DefaultVoice().Id();
+            }
             for (auto const &candidate : SpeechSynthesizer::AllVoices()) {
                 if (candidate.Id() == voice_id) {
                     speech_synthesizer.Voice(candidate);
@@ -765,8 +768,8 @@ static const std::unordered_map handlers = {
             }
         }
         auto x = speech_synthesizer.Voice();
-        if (x) output(cmd_id, "voice", {{"value", speech_synthesizer.Voice()}, {"found", found}});
-        else output(cmd_id, "voice", {{"value", ""}, {"found", found}});
+        if (x) output(cmd_id, "voice", {{"voice", speech_synthesizer.Voice()}, {"found", found}});
+        else output(cmd_id, "voice", {{"voice", ""}, {"found", found}});
     }},
 
     {"volume", [](id_type cmd_id, std::vector parts, int64_t*) {
diff --git a/src/calibre/utils/windows/winspeech.py b/src/calibre/utils/windows/winspeech.py
index c7b65c4495..f9cfef41fe 100644
--- a/src/calibre/utils/windows/winspeech.py
+++ b/src/calibre/utils/windows/winspeech.py
@@ -12,7 +12,7 @@ from itertools import count
 from queue import Empty, Queue
 from threading import Thread
 from time import monotonic
-from typing import NamedTuple, Tuple
+from typing import NamedTuple, Tuple, Optional
 
 from calibre.constants import DEBUG
 from calibre.utils.ipc.simple_worker import start_pipe_worker
@@ -101,11 +101,12 @@ class SpeechError(OSError):
             val += f'{msg}. '
         val += err.msg + ': ' + err.error + f'\nFile: {err.file} Line: {err.line}'
         if err.hr:
+            # List of mediaserver errors is here: https://www.hresult.info/FACILITY_MEDIASERVER
             val += f' HRESULT: 0x{err.hr:x}'
         super().__init__(val)
 
 
-class NoAudioDevices(Exception):
+class NoAudioDevices(OSError):
     def __init__(self):
         super().__init__(_('No active audio output devices found.'
                            ' Connect headphones or speakers. If you are using Remote Desktop then enable Remote Audio for it.'))
@@ -212,7 +213,7 @@ class DefaultVoice(NamedTuple):
 
 class Voice(NamedTuple):
     related_to: int
-    voice: VoiceInformation
+    voice: Optional[VoiceInformation]
     found: bool = True
 
 
@@ -223,13 +224,21 @@ class DeviceInformation(NamedTuple):
     is_default: bool
     is_enabled: bool
 
+    def spec(self) -> Tuple[str, str]:
+        return self.kind, self.id
+
 
 class AudioDevice(NamedTuple):
     related_to: int
-    device: DeviceInformation
+    device: Optional[DeviceInformation]
     found: bool = True
 
 
+class AllAudioDevices(NamedTuple):
+    related_to: int
+    devices: Tuple[DeviceInformation, ...]
+
+
 class AllVoices(NamedTuple):
     related_to: int
     voices: Tuple[VoiceInformation, ...]
@@ -301,11 +310,18 @@ def parse_message(line):
         return AllVoices(**ans)
     if msg_type == 'all_audio_devices':
         ans['devices'] = tuple(DeviceInformation(**x) for x in ans['devices'])
-        return AudioDevice(**ans)
+        return AllAudioDevices(**ans)
     if msg_type == 'audio_device':
+        if ans['device']:
+            ans['device'] = DeviceInformation(**ans['device'])  # unpack the dict into fields, as done for VoiceInformation below
+        else:
+            ans['device'] = None
         return AudioDevice(**ans)
     if msg_type == 'voice':
-        ans['voice'] = VoiceInformation(**ans['voice'])
+        if ans['voice']:
+            ans['voice'] = VoiceInformation(**ans['voice'])
+        else:
+            ans['voice'] = None
         return Voice(**ans)
     if msg_type == 'volume':
         return Volume(**ans)
@@ -357,7 +373,7 @@ class WinSpeech:
                     line = line.strip()
                     if DEBUG:
                         with suppress(Exception):
-                            print('winspeech:', line.decode('utf-8', 'replace'), flush=True)
+                            print('winspeech:\x1b[32m<-\x1b[39m', line.decode('utf-8', 'replace'), flush=True)
                     send_msg(parse_message(line))
             except OSError as e:
                 send_msg(Error('Failed to read from worker', str(e)))
@@ -367,7 +383,11 @@ class WinSpeech:
     def send_command(self, cmd):
         cmd_id = next(self.msg_id_counter)
         w = self.worker
-        w.stdin.write(f'{cmd_id} {cmd}\n'.encode('utf-8'))
+        cmd = f'{cmd_id} {cmd}'
+        if DEBUG:
+            with suppress(Exception):
+                print('winspeech:\x1b[31m->\x1b[39m', cmd, flush=True)
+        w.stdin.write(f'{cmd}\n'.encode('utf-8'))
         w.stdin.flush()
         return cmd_id
 
@@ -410,6 +430,38 @@ class WinSpeech:
     def play(self):
         self.wait_for('play', Play, related_to=self.send_command('play'))
 
+    def set_rate(self, val):
+        val = float(val)
+        self.wait_for('Setting the rate', Rate, related_to=self.send_command(f'rate {val}'))
+
+    def set_voice(self, spec, default_system_voice):
+        val = spec or getattr(default_system_voice, 'id', '__default__')
+        x = self.wait_for('Setting the voice', Voice, related_to=self.send_command(f'voice {val}'))
+        if not x.found:
+            raise KeyError(f'Failed to find the voice: {val}')
+
+    def set_audio_device(self, spec, default_system_audio_device):
+        if not spec and not default_system_audio_device:
+            return
+        if not spec:
+            spec = default_system_audio_device.spec()
+        x = self.wait_for('Setting the audio device', AudioDevice, related_to=self.send_command(f'audio_device {spec[0]} {spec[1]}'))
+        if not x.found:
+            raise KeyError(f'Failed to find the audio device: {spec}')
+
+    def get_audio_device(self):
+        return self.wait_for('Audio device', AudioDevice, related_to=self.send_command('audio_device'))
+
+    def default_voice(self):
+        return self.wait_for('Default voice', DefaultVoice, related_to=self.send_command('default_voice'))
+
+    def all_voices(self):
+        return self.wait_for('All voices', AllVoices, related_to=self.send_command('all_voices'))
+
+    def all_audio_devices(self):
+        return self.wait_for('All audio devices', AllAudioDevices, related_to=self.send_command('all_audio_devices'))
+
+
 
 # develop {{{
 def develop_loop(*commands):