Port config to winspeech

This commit is contained in:
Kovid Goyal 2023-02-02 11:22:46 +05:30
parent f64b9e3e2c
commit c8e9f33736
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
4 changed files with 321 additions and 18 deletions

View File

@ -52,8 +52,16 @@ class Client:
self.synthesizing = False self.synthesizing = False
self.settings = settings or {} self.settings = settings or {}
self.clear_chunks() self.clear_chunks()
self.default_system_audio_device = self.backend.get_audio_device().device
self.default_system_voice = self.backend.default_voice().voice
self.apply_settings() self.apply_settings()
def get_all_voices(self):
    """Ask the backend for every available voice and return them."""
    reply = self.backend.all_voices()
    return reply.voices
def get_all_audio_devices(self):
    """Ask the backend for every audio output device and return them."""
    reply = self.backend.all_audio_devices()
    return reply.devices
def __del__(self): def __del__(self):
if self.backend is not None: if self.backend is not None:
self.backend.shutdown() self.backend.shutdown()
@ -63,6 +71,9 @@ class Client:
def dispatch_msg(self, msg): def dispatch_msg(self, msg):
self.dispatch_on_main_thread(partial(self.handle_event, msg)) self.dispatch_on_main_thread(partial(self.handle_event, msg))
def speak_current_chunk(self):
    """Send the chunk at current_chunk_idx to the backend, cued so that
    MarkReached events are generated for it."""
    chunk = self.current_chunks[self.current_chunk_idx]
    self.backend.speak(chunk, is_cued=True)
def handle_event(self, x): def handle_event(self, x):
if isinstance(x, MarkReached) and self.current_chunks: if isinstance(x, MarkReached) and self.current_chunks:
self.last_mark = x.id self.last_mark = x.id
@ -74,7 +85,7 @@ class Client:
self.callback_ignoring_errors(Event(EventType.end)) self.callback_ignoring_errors(Event(EventType.end))
else: else:
self.current_chunk_idx += 1 self.current_chunk_idx += 1
self.backend.speak(self.current_chunks[self.current_chunk_idx], is_cued=True) self.speak_current_chunk()
elif x.state is MediaState.failed: elif x.state is MediaState.failed:
self.clear_chunks() self.clear_chunks()
self.callback_ignoring_errors(Event(EventType.cancel)) self.callback_ignoring_errors(Event(EventType.cancel))
@ -82,7 +93,8 @@ class Client:
e.display_to_user = True e.display_to_user = True
raise e raise e
elif x.state is MediaState.opened: elif x.state is MediaState.opened:
self.callback_ignoring_errors(Event(EventType.begin)) self.callback_ignoring_errors(Event(EventType.resume if self.next_start_is_resume else EventType.begin))
self.next_start_is_resume = False
elif isinstance(x, Error): elif isinstance(x, Error):
raise x.as_exception(check_for_no_audio_devices=True) raise x.as_exception(check_for_no_audio_devices=True)
else: else:
@ -98,12 +110,11 @@ class Client:
self.clear_chunks() self.clear_chunks()
self.current_callback = callback self.current_callback = callback
self.current_chunks = tuple(split_into_chunks(text, self.chunk_size)) self.current_chunks = tuple(split_into_chunks(text, self.chunk_size))
self.current_chunk_idx = 0 self.current_chunk_idx = -100
if self.current_chunks: if self.current_chunks:
self.backend.speak(self.current_chunks[self.current_chunk_idx], is_cued=True) self.current_chunk_idx = 0
self.speak_current_chunk()
self.synthesizing = True self.synthesizing = True
if self.current_callback is not None:
self.current_callback(Event(EventType.begin))
def callback_ignoring_errors(self, ev): def callback_ignoring_errors(self, ev):
if self.current_callback is not None: if self.current_callback is not None:
@ -115,8 +126,9 @@ class Client:
def clear_chunks(self): def clear_chunks(self):
self.synthesizing = False self.synthesizing = False
self.next_start_is_resume = False
self.current_chunk_idx = -100 self.current_chunk_idx = -100
self.current_chunks = [] self.current_chunks = ()
self.last_mark = -1 self.last_mark = -1
def stop(self): def stop(self):
@ -138,12 +150,52 @@ class Client:
self.current_callback(Event(EventType.resume)) self.current_callback(Event(EventType.resume))
def apply_settings(self, new_settings=None): def apply_settings(self, new_settings=None):
pass if self.synthesizing:
self.stop()
if new_settings is not None:
self.settings = new_settings
try:
self.backend.set_rate(self.settings.get('rate', self.default_system_rate))
except OSError:
self.settings.pop('rate', None)
try:
self.backend.set_voice(self.settings.get('voice'), self.default_system_voice)
except OSError:
self.settings.pop('voice', None)
try:
self.backend.set_audio_device(self.settings.get('sound_output'), self.default_system_audio_device)
except OSError:
self.settings.pop('sound_output', None)
def config_widget(self, backend_settings, parent): def config_widget(self, backend_settings, parent):
from calibre.gui2.tts.windows_config import Widget from calibre.gui2.tts.windows_config import Widget
return Widget(self, backend_settings, parent) return Widget(self, backend_settings, parent)
def chunks_from_last_mark(self):
    """Return the tuple of chunks still to be spoken, starting just after
    the position of last_mark. Returns () when last_mark is not found in
    any chunk."""
    for idx, chunk in enumerate(self.current_chunks):
        for pos, mark in enumerate(chunk):
            if mark != self.last_mark:
                continue
            remainder = chunk[pos + 1:]
            tail = self.current_chunks[idx + 1:]
            # Drop the containing chunk entirely if the mark was its last item
            if remainder:
                return (remainder,) + tail
            return tail
    return ()
def resume_after_configure(self):
    """Restart speech after settings were changed, resuming from just after
    the last mark that was reached. No-op when nothing was being spoken."""
    if not self.synthesizing:
        return
    # BUGFIX: compute the remaining chunks BEFORE resetting last_mark.
    # The original reset last_mark to the -1 sentinel first, which made
    # chunks_from_last_mark() search for a mark id of -1 and always return
    # (), so resuming after a configuration change silently stopped speech.
    self.current_chunks = self.chunks_from_last_mark()
    self.current_chunk_idx = -100
    self.last_mark = -1
    self.next_start_is_resume = True
    self.synthesizing = bool(self.current_chunks)
    if self.current_chunks:
        self.current_chunk_idx = 0
        self.speak_current_chunk()
def change_rate(self, steps=1): def change_rate(self, steps=1):
rate = current_rate = self.settings.get('rate', self.default_system_rate) rate = current_rate = self.settings.get('rate', self.default_system_rate)
if rate < 1: if rate < 1:

View File

@ -0,0 +1,196 @@
#!/usr/bin/env python
# License: GPL v3 Copyright: 2020, Kovid Goyal <kovid at kovidgoyal.net>
from contextlib import suppress
from qt.core import (
QAbstractItemView, QAbstractTableModel, QByteArray, QComboBox, QFontMetrics,
QFormLayout, QItemSelectionModel, QSlider, QSortFilterProxyModel, Qt, QTableView,
QWidget
)
from calibre.gui2.widgets import BusyCursor
class VoicesModel(QAbstractTableModel):
    """Table model over the available TTS voices, with a synthetic
    'System default' entry occupying row 0."""

    system_default_voice = '__default__'

    def __init__(self, voice_data, parent=None):
        super().__init__(parent)
        self.voice_data = voice_data
        self.current_voices = tuple((x.display_name, x.language, x.gender, x.id) for x in voice_data)
        self.column_headers = _('Name'), _('Language'), _('Gender')

    def rowCount(self, parent=None):
        # +1 accounts for the synthetic 'System default' row
        return 1 + len(self.current_voices)

    def columnCount(self, parent=None):
        return len(self.column_headers)

    def headerData(self, section, orientation, role=Qt.ItemDataRole.DisplayRole):
        if orientation == Qt.Orientation.Horizontal and role == Qt.ItemDataRole.DisplayRole:
            return self.column_headers[section]
        return super().headerData(section, orientation, role)

    def data(self, index, role=Qt.ItemDataRole.DisplayRole):
        row, col = index.row(), index.column()
        if role == Qt.ItemDataRole.DisplayRole:
            with suppress(IndexError):
                if row == 0:
                    return (_('System default'), '', '', '')[col]
                return self.current_voices[row - 1][col] or ''
        elif role == Qt.ItemDataRole.UserRole:
            # UserRole carries the voice id used by the backend
            with suppress(IndexError):
                if row == 0:
                    return self.system_default_voice
                return self.current_voices[row - 1][3]

    def index_for_voice(self, v):
        """Return the model index for the voice with id v, or None if absent."""
        if v == self.system_default_voice:
            return self.index(0, 0)
        for i, entry in enumerate(self.current_voices):
            if entry[3] == v:
                return self.index(i + 1, 0)
        return None
class Widget(QWidget):
    """Configuration UI for the winspeech TTS backend: speech rate slider,
    voice table and sound-output selector. The chosen configuration is
    exposed through the backend_settings property, which only contains
    values that differ from the system defaults."""

    def __init__(self, tts_client, initial_backend_settings=None, parent=None):
        QWidget.__init__(self, parent)
        self.l = l = QFormLayout(self)
        self.tts_client = tts_client

        # Querying the backend worker can take a moment
        with BusyCursor():
            self.voice_data = self.tts_client.get_all_voices()
            self.default_system_rate = self.tts_client.default_system_rate
            self.all_sound_outputs = self.tts_client.get_all_audio_devices()
            self.default_system_audio_device = self.tts_client.default_system_audio_device

        self.speed = s = QSlider(Qt.Orientation.Horizontal, self)
        s.setMinimumWidth(200)
        l.addRow(_('&Speed of speech:'), s)
        # Slider operates in integer hundredths of the client's rate range
        s.setRange(int(self.tts_client.min_rate * 100), int(100 * self.tts_client.max_rate))
        s.setSingleStep(10)
        s.setPageStep(40)

        self.voices = v = QTableView(self)
        self.voices_model = VoicesModel(self.voice_data, parent=v)
        self.proxy_model = p = QSortFilterProxyModel(self)
        p.setFilterCaseSensitivity(Qt.CaseSensitivity.CaseInsensitive)
        p.setSourceModel(self.voices_model)
        v.setModel(p)
        v.setSelectionBehavior(QAbstractItemView.SelectionBehavior.SelectRows)
        v.setSortingEnabled(True)
        v.horizontalHeader().resizeSection(0, QFontMetrics(self.font()).averageCharWidth() * 25)
        v.horizontalHeader().resizeSection(1, QFontMetrics(self.font()).averageCharWidth() * 30)
        # BUGFIX: the original called verticalHeader().close() twice in a
        # row; the duplicate was redundant and has been removed.
        v.verticalHeader().close()
        v.setSelectionMode(QAbstractItemView.SelectionMode.SingleSelection)
        v.sortByColumn(0, Qt.SortOrder.AscendingOrder)
        l.addRow(v)

        self.sound_outputs = so = QComboBox(self)
        # Empty tuple as item data denotes the system default device
        so.addItem(_('System default'), ())
        for x in self.all_sound_outputs:
            so.addItem(x.name, x.spec())
        l.addRow(_('Sound output:'), so)

        self.backend_settings = initial_backend_settings or {}

    def restore_state(self, prefs):
        """Restore the voice table header layout previously saved in prefs."""
        data = prefs.get(f'{self.tts_client.name}-voice-table-state')
        if data is not None:
            self.voices.horizontalHeader().restoreState(QByteArray(data))

    def save_state(self, prefs):
        """Persist the voice table header layout into prefs."""
        data = bytearray(self.voices.horizontalHeader().saveState())
        prefs.set(f'{self.tts_client.name}-voice-table-state', data)

    def restore_to_defaults(self):
        """Reset every control back to the system default values."""
        self.backend_settings = {}

    def sizeHint(self):
        ans = super().sizeHint()
        ans.setHeight(max(ans.height(), 600))
        ans.setWidth(max(ans.width(), 500))
        return ans

    @property
    def selected_voice(self):
        """Voice id of the current table selection, or None if none selected."""
        for x in self.voices.selectedIndexes():
            return x.data(Qt.ItemDataRole.UserRole)

    @selected_voice.setter
    def selected_voice(self, val):
        val = val or VoicesModel.system_default_voice
        idx = self.voices_model.index_for_voice(val)
        if idx is not None:
            idx = self.proxy_model.mapFromSource(idx)
            self.voices.selectionModel().select(idx, QItemSelectionModel.SelectionFlag.ClearAndSelect | QItemSelectionModel.SelectionFlag.Rows)
            self.voices.scrollTo(idx)

    @property
    def rate(self):
        """Speech rate as a float: slider value / 100."""
        return self.speed.value() / 100

    @rate.setter
    def rate(self, val):
        val = int((val or self.default_system_rate) * 100)
        self.speed.setValue(val)

    @property
    def sound_output(self):
        """Selected device spec, or the empty tuple for the system default."""
        return self.sound_outputs.currentData()

    @sound_output.setter
    def sound_output(self, val):
        idx = 0
        if val:
            q = self.sound_outputs.findData(val)
            if q > -1:
                idx = q
        self.sound_outputs.setCurrentIndex(idx)

    @property
    def backend_settings(self):
        """Dict of only the settings that differ from the system defaults."""
        ans = {}
        voice = self.selected_voice
        if voice and voice != VoicesModel.system_default_voice:
            ans['voice'] = voice
        rate = self.rate
        if rate and rate != self.default_system_rate:
            ans['rate'] = rate
        so = self.sound_output
        if so:
            ans['sound_output'] = so
        return ans

    @backend_settings.setter
    def backend_settings(self, val):
        voice = val.get('voice') or VoicesModel.system_default_voice
        self.selected_voice = voice
        self.rate = val.get('rate', self.default_system_rate)
        self.sound_output = val.get('sound_output') or ()
def develop():
    """Manual test harness: run the config widget standalone and print the
    settings it produces on exit."""
    from calibre.gui2 import Application
    from calibre.gui2.tts.implementation import Client
    application = Application([])
    client = Client()
    widget = Widget(client, {})
    widget.show()
    application.exec()
    print(widget.backend_settings)


if __name__ == '__main__':
    develop()

View File

@ -756,6 +756,9 @@ static const std::unordered_map<std::string, handler_function> handlers = {
bool found = false; bool found = false;
if (parts.size()) { if (parts.size()) {
auto voice_id = winrt::hstring(parts.at(0)); auto voice_id = winrt::hstring(parts.at(0));
if (voice_id == L"__default__") {
voice_id = SpeechSynthesizer::DefaultVoice().Id();
}
for (auto const &candidate : SpeechSynthesizer::AllVoices()) { for (auto const &candidate : SpeechSynthesizer::AllVoices()) {
if (candidate.Id() == voice_id) { if (candidate.Id() == voice_id) {
speech_synthesizer.Voice(candidate); speech_synthesizer.Voice(candidate);
@ -765,8 +768,8 @@ static const std::unordered_map<std::string, handler_function> handlers = {
} }
} }
auto x = speech_synthesizer.Voice(); auto x = speech_synthesizer.Voice();
if (x) output(cmd_id, "voice", {{"value", speech_synthesizer.Voice()}, {"found", found}}); if (x) output(cmd_id, "voice", {{"voice", speech_synthesizer.Voice()}, {"found", found}});
else output(cmd_id, "voice", {{"value", ""}, {"found", found}}); else output(cmd_id, "voice", {{"voice", ""}, {"found", found}});
}}, }},
{"volume", [](id_type cmd_id, std::vector<std::wstring_view> parts, int64_t*) { {"volume", [](id_type cmd_id, std::vector<std::wstring_view> parts, int64_t*) {

View File

@ -12,7 +12,7 @@ from itertools import count
from queue import Empty, Queue from queue import Empty, Queue
from threading import Thread from threading import Thread
from time import monotonic from time import monotonic
from typing import NamedTuple, Tuple from typing import NamedTuple, Tuple, Optional
from calibre.constants import DEBUG from calibre.constants import DEBUG
from calibre.utils.ipc.simple_worker import start_pipe_worker from calibre.utils.ipc.simple_worker import start_pipe_worker
@ -101,11 +101,12 @@ class SpeechError(OSError):
val += f'{msg}. ' val += f'{msg}. '
val += err.msg + ': ' + err.error + f'\nFile: {err.file} Line: {err.line}' val += err.msg + ': ' + err.error + f'\nFile: {err.file} Line: {err.line}'
if err.hr: if err.hr:
# List of mediaserver errors is here: https://www.hresult.info/FACILITY_MEDIASERVER
val += f' HRESULT: 0x{err.hr:x}' val += f' HRESULT: 0x{err.hr:x}'
super().__init__(val) super().__init__(val)
class NoAudioDevices(Exception): class NoAudioDevices(OSError):
def __init__(self): def __init__(self):
super().__init__(_('No active audio output devices found.' super().__init__(_('No active audio output devices found.'
' Connect headphones or speakers. If you are using Remote Desktop then enable Remote Audio for it.')) ' Connect headphones or speakers. If you are using Remote Desktop then enable Remote Audio for it.'))
@ -212,7 +213,7 @@ class DefaultVoice(NamedTuple):
class Voice(NamedTuple): class Voice(NamedTuple):
related_to: int related_to: int
voice: VoiceInformation voice: Optional[VoiceInformation]
found: bool = True found: bool = True
@ -223,13 +224,21 @@ class DeviceInformation(NamedTuple):
is_default: bool is_default: bool
is_enabled: bool is_enabled: bool
def spec(self) -> Tuple[str, str]:
    """Return the (kind, id) pair that identifies this audio device."""
    return (self.kind, self.id)
class AudioDevice(NamedTuple): class AudioDevice(NamedTuple):
related_to: int related_to: int
device: DeviceInformation device: Optional[DeviceInformation]
found: bool = True found: bool = True
class AllAudioDevices(NamedTuple):
    """Reply message listing every audio output device reported by the worker."""
    related_to: int  # id of the command this message is a reply to
    devices: Tuple[DeviceInformation, ...]
class AllVoices(NamedTuple): class AllVoices(NamedTuple):
related_to: int related_to: int
voices: Tuple[VoiceInformation, ...] voices: Tuple[VoiceInformation, ...]
@ -301,11 +310,18 @@ def parse_message(line):
return AllVoices(**ans) return AllVoices(**ans)
if msg_type == 'all_audio_devices': if msg_type == 'all_audio_devices':
ans['devices'] = tuple(DeviceInformation(**x) for x in ans['devices']) ans['devices'] = tuple(DeviceInformation(**x) for x in ans['devices'])
return AudioDevice(**ans) return AllAudioDevices(**ans)
if msg_type == 'audio_device': if msg_type == 'audio_device':
if ans['device']:
ans['device'] = DeviceInformation(ans['device'])
else:
ans['device'] = None
return AudioDevice(**ans) return AudioDevice(**ans)
if msg_type == 'voice': if msg_type == 'voice':
ans['voice'] = VoiceInformation(**ans['voice']) if ans['voice']:
ans['voice'] = VoiceInformation(**ans['voice'])
else:
ans['voice'] = None
return Voice(**ans) return Voice(**ans)
if msg_type == 'volume': if msg_type == 'volume':
return Volume(**ans) return Volume(**ans)
@ -357,7 +373,7 @@ class WinSpeech:
line = line.strip() line = line.strip()
if DEBUG: if DEBUG:
with suppress(Exception): with suppress(Exception):
print('winspeech:', line.decode('utf-8', 'replace'), flush=True) print('winspeech:\x1b[32m<-\x1b[39m', line.decode('utf-8', 'replace'), flush=True)
send_msg(parse_message(line)) send_msg(parse_message(line))
except OSError as e: except OSError as e:
send_msg(Error('Failed to read from worker', str(e))) send_msg(Error('Failed to read from worker', str(e)))
@ -367,7 +383,11 @@ class WinSpeech:
def send_command(self, cmd): def send_command(self, cmd):
cmd_id = next(self.msg_id_counter) cmd_id = next(self.msg_id_counter)
w = self.worker w = self.worker
w.stdin.write(f'{cmd_id} {cmd}\n'.encode('utf-8')) cmd = f'{cmd_id} {cmd}'
if DEBUG:
with suppress(Exception):
print('winspeech:\x1b[31m->\x1b[39m', cmd, flush=True)
w.stdin.write(f'{cmd}\n'.encode('utf-8'))
w.stdin.flush() w.stdin.flush()
return cmd_id return cmd_id
@ -410,6 +430,38 @@ class WinSpeech:
def play(self): def play(self):
self.wait_for('play', Play, related_to=self.send_command('play')) self.wait_for('play', Play, related_to=self.send_command('play'))
def set_rate(self, val):
    """Set the speech rate on the worker and wait for acknowledgement."""
    rate = float(val)
    cmd_id = self.send_command(f'rate {rate}')
    self.wait_for('Setting the rate', Rate, related_to=cmd_id)
def set_voice(self, spec, default_system_voice):
    """Select the voice identified by spec, falling back to the system
    default voice. Raises KeyError if the worker cannot find it."""
    voice_id = spec if spec else getattr(default_system_voice, 'id', '__default__')
    cmd_id = self.send_command(f'voice {voice_id}')
    response = self.wait_for('Setting the voice', Voice, related_to=cmd_id)
    if not response.found:
        raise KeyError(f'Failed to find the voice: {voice_id}')
def set_audio_device(self, spec, default_system_audio_device):
    """Route speech output to the device given by spec (a (kind, id) pair),
    falling back to the default system device. No-op when neither is
    available. Raises KeyError if the worker cannot find the device."""
    device_spec = spec or (default_system_audio_device.spec() if default_system_audio_device else None)
    if device_spec is None:
        return
    cmd_id = self.send_command(f'audio_device {device_spec[0]} {device_spec[1]}')
    response = self.wait_for('Setting the audio device', AudioDevice, related_to=cmd_id)
    if not response.found:
        raise KeyError(f'Failed to find the audio device: {device_spec}')
def get_audio_device(self):
    """Return the worker's reply describing the current audio device."""
    cmd_id = self.send_command('audio_device')
    return self.wait_for('Audio device', AudioDevice, related_to=cmd_id)
def default_voice(self):
    """Return the worker's reply describing the system default voice."""
    cmd_id = self.send_command('default_voice')
    return self.wait_for('Default voice', DefaultVoice, related_to=cmd_id)
def all_voices(self):
    """Return the worker's reply listing every available voice."""
    cmd_id = self.send_command('all_voices')
    return self.wait_for('All voices', AllVoices, related_to=cmd_id)
def all_audio_devices(self):
    """Return the worker's reply listing every audio output device."""
    cmd_id = self.send_command('all_audio_devices')
    return self.wait_for('All audio devices', AllAudioDevices, related_to=cmd_id)
# develop {{{ # develop {{{
def develop_loop(*commands): def develop_loop(*commands):