From b51055a00ff18530658129068f3f039863d0ff4f Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sun, 25 Aug 2024 15:31:24 +0530
Subject: [PATCH] Content server: Viewer: Read Aloud: Implement word-by-word tracking when reading aloud if the browser and voice used support it.

The utterance queue in the browser TTS client is replaced by a Tracker.
It splits the marked text into chunks (a chunk is closed once it grows
past 4096 characters) and records, for every mark, its offset in the
full text and, for every chunk, the offset at which the chunk starts.

"boundary" events of type "word" fired by the utterance are mapped back
to the marks they cover and reported as {'first': mark, 'last': mark}.
select_tts_mark() now takes the first and last index and selects the
whole range. The Qt viewer backend emits the same shape, and
mark_word_being_spoken() continues to accept a plain number.

Every chunk is wrapped in a SpeechSynthesisUtterance before it is handed
to speechSynthesis.speak(). Errors of type "interrupted" no longer open
the error dialog.
---
 src/calibre/gui2/viewer/tts.py |   2 +-
 src/pyj/read_book/find.pyj     |  19 ++--
 src/pyj/read_book/iframe.pyj   |   6 +-
 src/pyj/read_book/tts.pyj      | 169 +++++++++++++++++----------------
 4 files changed, 109 insertions(+), 87 deletions(-)

diff --git a/src/calibre/gui2/viewer/tts.py b/src/calibre/gui2/viewer/tts.py
index 31e7174be8..bba47ce8ac 100644
--- a/src/calibre/gui2/viewer/tts.py
+++ b/src/calibre/gui2/viewer/tts.py
@@ -115,7 +115,7 @@ class TTS(QObject):
     def callback(self, event):
         data = event.data
         if event.type is event.type.mark:
-            data = int(data)
+            data = {'first': int(data), 'last': int(data)}
         self.event_received.emit(event.type.name, data)
 
     def stop(self, data):
diff --git a/src/pyj/read_book/find.pyj b/src/pyj/read_book/find.pyj
index b25c9b83a1..72571a4318 100644
--- a/src/pyj/read_book/find.pyj
+++ b/src/pyj/read_book/find.pyj
@@ -162,17 +162,24 @@ def select_search_result(sr):
     return select_find_result(match)
 
 
-def select_tts_mark(idx_in_flat_text):
-    window.getSelection().removeAllRanges()
-    if not cache.text_map:
-        cache.text_map = build_text_map()
+def find_word_length(idx):
     r = tts_word_regex()
-    r.lastIndex = idx_in_flat_text
+    r.lastIndex = idx
     match = v'r.exec(cache.text_map.flat_text)'
     word_length = 5
     if match:
         word_length = match[0]?.length or 5
-    match = get_occurrence_data(cache.text_map.node_list, idx_in_flat_text, idx_in_flat_text + word_length)
+    return word_length
+
+
+def select_tts_mark(idx_in_flat_text, last_idx_in_flat_text):
+    window.getSelection().removeAllRanges()
+    if not cache.text_map:
+        cache.text_map = build_text_map()
+    if idx_in_flat_text is last_idx_in_flat_text:
+        match = get_occurrence_data(cache.text_map.node_list, idx_in_flat_text, idx_in_flat_text + find_word_length(idx_in_flat_text))
+    else:
+        match = get_occurrence_data(cache.text_map.node_list, idx_in_flat_text, last_idx_in_flat_text + find_word_length(last_idx_in_flat_text))
     if not match:
         return False
     return select_find_result(match)
diff --git a/src/pyj/read_book/iframe.pyj b/src/pyj/read_book/iframe.pyj
index aa67d4a430..01ed1e776d 100644
--- a/src/pyj/read_book/iframe.pyj
+++ b/src/pyj/read_book/iframe.pyj
@@ -1041,9 +1041,11 @@ class IframeBoss:
             sel.removeAllRanges()
             self.send_message('tts', type='text-extracted', marked_text=marked_text, pos=data.pos)
 
-    def mark_word_being_spoken(self, occurrence_number):
+    def mark_word_being_spoken(self, x):
+        if jstype(x) is 'number':
+            x = {'first': x, 'last': x}
         self.last_search_at = window.performance.now()
-        if select_tts_mark(occurrence_number):
+        if select_tts_mark(x.first, x.last):
             self.ensure_selection_boundary_visible()
 
     def audio_ebook_msg_received(self, data):
diff --git a/src/pyj/read_book/tts.pyj b/src/pyj/read_book/tts.pyj
index eaaa4741ac..0813a78142 100644
--- a/src/pyj/read_book/tts.pyj
+++ b/src/pyj/read_book/tts.pyj
@@ -11,15 +11,69 @@ from modals import create_custom_dialog, error_dialog
 from widgets import create_button
 
 
-def escaper():
-    doc = document.implementation.createDocument("", "", None)
-    el = doc.createElement("temp")
-    el.textContent = "temp"
-    el = el.firstChild
-    ser = new XMLSerializer()  # noqa
-    return def(text):
-        el.nodeValue = text
-        return ser.serializeToString(el)
+class Tracker:
+
+    def __init__(self):
+        self.clear()
+
+    def clear(self):
+        self.positions = v'[]'
+        self.last_pos = 0
+        self.queue = v'[]'
+
+    def parse_marked_text(self, marked_text):
+        self.clear()
+        text = v'[]'
+        text_len = chunk_len = index_in_positions = offset_in_text = 0
+        limit = 4096
+        for x in marked_text:
+            if jstype(x) is 'number':
+                self.positions.push({'mark': x, 'offset_in_text': text_len})
+            else:
+                text_len += x.length
+                chunk_len += x.length
+                text.push(x)
+                if chunk_len > limit:
+                    self.queue.push({'text': ''.join(text), 'index_in_positions': index_in_positions, 'offset_in_text': offset_in_text})
+                    chunk_len = 0
+                    text = v'[]'
+                    index_in_positions = max(0, self.positions.length - 1)
+                    offset_in_text = text_len
+        if text.length:
+            self.queue.push({'text': ''.join(text), 'index_in_positions': index_in_positions, 'offset_in_text': offset_in_text})
+        self.marked_text = marked_text
+        return self.current_text()
+
+    def pop_first(self):
+        self.queue.splice(0, 1)
+
+    def current_text(self):
+        if self.queue.length:
+            return self.queue[0].text
+        return ''
+
+    def resume(self):
+        self.last_pos = 0
+        if self.queue.length:
+            self.last_pos = self.queue[0].index_in_positions
+        return self.current_text()
+
+    def mark_word(self, start, length):
+        # start is relative to the chunk being spoken, mark offsets are relative to the whole text
+        if self.queue.length:
+            start += self.queue[0].offset_in_text
+        end = start + length
+        matches = v'[]'
+        while self.last_pos < self.positions.length:
+            pos = self.positions[self.last_pos]
+            if start <= pos.offset_in_text < end:
+                matches.push(pos)
+            elif pos.offset_in_text >= end:
+                break
+            self.last_pos += 1
+        if matches.length:
+            return matches[0].mark, matches[-1].mark
+        return None
 
 
 class Client:
@@ -30,7 +84,7 @@ class Client:
     def __init__(self):
         self.stop_requested_at = None
         self.status = {'synthesizing': False, 'paused': False}
-        self.queue = v'[]'
+        self.tracker = Tracker()
         self.last_reached_mark = None
         self.onevent = def():
             pass
@@ -38,19 +92,14 @@ class Client:
         self.current_voice_uri = data.voice or ''
         self.current_rate = data.rate or None
 
-    def create_utterance(self, text_or_ssml, wrap_in_ssml):
-        if wrap_in_ssml:
-            text_or_ssml = (
-                '\n' +
-                text_or_ssml +
-                '')
-        ut = new window.SpeechSynthesisUtterance(text_or_ssml)
+    def create_utterance(self, text):
+        ut = new window.SpeechSynthesisUtterance(text)
         ut.onstart = self.utterance_started
         ut.onpause = self.utterance_paused
         ut.onend = self.utterance_ended
         ut.onerror = self.utterance_failed
-        ut.onmark = self.utterance_mark_reached
         ut.onresume = self.utterance_resumed
+        ut.addEventListener('boundary', self.utterance_boundary_reached)
         if self.current_voice_uri:
             for voice in window.speechSynthesis.getVoices():
                 if voice.voiceURI is self.current_voice_uri:
@@ -58,7 +107,6 @@ class Client:
                     break
         if self.current_rate:
             ut.rate = self.current_rate
-        self.queue.push(ut)
         return ut
 
     def utterance_started(self, event):
@@ -74,22 +122,27 @@ class Client:
         if self.stop_requested_at? and window.performance.now() - self.stop_requested_at < 1000:
             self.stop_requested_at = None
             return
-        self.queue.splice(0, 1)
-        if self.queue.length:
-            window.speechSynthesis.speak(self.queue[0])
+        self.tracker.pop_first()
+        text = self.tracker.current_text()
+        if text and text.length:
+            window.speechSynthesis.speak(self.create_utterance(text))
         else:
             self.onevent('end')
 
     def utterance_failed(self, event):
         self.status = {'synthesizing': False, 'paused': False}
-        self.queue = v'[]'
-        error_dialog(_('Speaking failed'), _(
-            'An error has occurred with speech synthesis: ' + event.error))
+        self.tracker.clear()
+        if event.error is not 'interrupted':
+            error_dialog(_('Speaking failed'), _(
+                'An error has occurred with speech synthesis: ' + event.error))
         self.onevent('cancel')
 
-    def utterance_mark_reached(self, event):
-        self.last_reached_mark = event.name
-        self.onevent('mark', int(event.name))
+    def utterance_boundary_reached(self, event):
+        if event.name is 'word':
+            x = self.tracker.mark_word(event.charIndex, event.charLength)
+            if x:
+                first, last = x[0], x[1]
+                self.onevent('mark', {'first': first, 'last': last})
 
     def utterance_resumed(self, event):
         self.status = {'synthesizing': True, 'paused': False}
@@ -102,61 +155,28 @@ class Client:
             window.speechSynthesis.resume()
 
     def resume_after_configure(self):
-        if self.queue.length:
-            window.speechSynthesis.speak(self.queue[0])
+        text = self.tracker.resume()
+        if text and text.length:
+            window.speechSynthesis.speak(self.create_utterance(text))
 
     def stop(self):
-        self.queue = v'[]'
+        self.tracker.clear()
         self.stop_requested_at = window.performance.now()
         window.speechSynthesis.cancel()
         self.status = {'synthesizing': False, 'paused': False}
 
     def speak_simple_text(self, text):
         self.stop()
-        while text.length > 32766:
-            self.create_utterance(text[:32766])
-            text = text[32766:]
-        if text:
-            self.create_utterance(text)
-        if self.queue.length:
-            window.speechSynthesis.speak(self.queue[0])
+        text = self.tracker.parse_marked_text(v'[text]')
+        if text and text.length:
+            window.speechSynthesis.speak(self.create_utterance(text))
 
     def speak_marked_text(self, text_segments, onevent):
         self.stop()
         self.onevent = onevent
-        buf = v'[]'
-        size = 0
-        limit = 2048
-
-        def commit():
-            nonlocal buf, size
-            text = buf.join('')
-            if text.length:
-                self.create_utterance(text)
-            buf = v'[]'
-            size = 0
-
-        for x in text_segments:
-            if jstype(x) is 'number':
-                # Currently the sad sack browsers dont support SSML
-                # https://github.com/WICG/speech-api/issues/37
-                # buf.push()
-                # markup = ''
-                continue
-            else:
-                if x.length > limit:
-                    commit()
-                    while x.length:
-                        self.create_utterance(x[:limit])
-                        x = x[limit:]
-                    continue
-                if size + x.length > limit:
-                    commit()
-                buf.push(x)
-                size += x.length
-        commit()
-        if self.queue.length:
-            window.speechSynthesis.speak(self.queue[0])
+        text = self.tracker.parse_marked_text(text_segments)
+        if text and text.length:
+            window.speechSynthesis.speak(self.create_utterance(text))
 
     def faster(self):
         self.change_rate(steps=1)
@@ -167,13 +187,6 @@ class Client:
     def apply_settings(self):
         sd = get_session_data()
         sd.set('tts_backend', {'voice': self.current_voice_uri, 'rate': self.current_rate})
-        existing = self.queue
-        if self.queue and self.queue.length:
-            if self.status.paused:
-                window.speechSynthesis.resume()
-            self.stop()
-            for ut in existing:
-                self.create_utterance(ut.text)
 
     def change_rate(self, steps=1):
         rate = current_rate = (self.current_rate or 1) * 10