Content server: Viewer: Read Aloud: Implement word-by-word tracking when reading aloud if the browser and voice used support it.

This commit is contained in:
Kovid Goyal 2024-08-25 15:31:24 +05:30
parent f3299f5b8f
commit b51055a00f
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
4 changed files with 106 additions and 87 deletions

View File

@ -115,7 +115,7 @@ class TTS(QObject):
def callback(self, event):
data = event.data
if event.type is event.type.mark:
data = int(data)
data = {'first': int(data), 'last': int(data)}
self.event_received.emit(event.type.name, data)
def stop(self, data):

View File

@ -162,17 +162,24 @@ def select_search_result(sr):
return select_find_result(match)
def select_tts_mark(idx_in_flat_text):
window.getSelection().removeAllRanges()
if not cache.text_map:
cache.text_map = build_text_map()
def find_word_length(idx):
r = tts_word_regex()
r.lastIndex = idx_in_flat_text
r.lastIndex = idx
match = v'r.exec(cache.text_map.flat_text)'
word_length = 5
if match:
word_length = match[0]?.length or 5
match = get_occurrence_data(cache.text_map.node_list, idx_in_flat_text, idx_in_flat_text + word_length)
return word_length
def select_tts_mark(idx_in_flat_text, last_idx_in_flat_text):
    # Replace the current selection with the text that runs from the mark at
    # idx_in_flat_text to the end of the word found at last_idx_in_flat_text.
    # Both arguments are offsets into cache.text_map.flat_text.
    # Returns False when the range cannot be located in the text map,
    # otherwise whatever select_find_result() returns.
    window.getSelection().removeAllRanges()
    if not cache.text_map:
        cache.text_map = build_text_map()
    # When both marks are the same offset this is simply the single word
    # starting at idx_in_flat_text, so no separate case is needed for it.
    range_end = last_idx_in_flat_text + find_word_length(last_idx_in_flat_text)
    found = get_occurrence_data(cache.text_map.node_list, idx_in_flat_text, range_end)
    if not found:
        return False
    return select_find_result(found)

View File

@ -1041,9 +1041,11 @@ class IframeBoss:
sel.removeAllRanges()
self.send_message('tts', type='text-extracted', marked_text=marked_text, pos=data.pos)
def mark_word_being_spoken(self, occurrence_number):
def mark_word_being_spoken(self, x):
if jstype(x) is 'number':
x = {'first': x, 'last': x}
self.last_search_at = window.performance.now()
if select_tts_mark(occurrence_number):
if select_tts_mark(x.first, x.last):
self.ensure_selection_boundary_visible()
def audio_ebook_msg_received(self, data):

View File

@ -11,15 +11,66 @@ from modals import create_custom_dialog, error_dialog
from widgets import create_button
def escaper():
doc = document.implementation.createDocument("", "", None)
el = doc.createElement("temp")
el.textContent = "temp"
el = el.firstChild
ser = new XMLSerializer() # noqa
return def(text):
el.nodeValue = text
return ser.serializeToString(el)
class Tracker:
    # Splits marked text (a list mixing strings and numeric marks) into
    # chunks of text to be spoken one at a time, and maps the character
    # ranges reported while speaking a chunk back to the marks they cover.

    def __init__(self):
        self.clear()

    def clear(self):
        # One {'mark', 'offset_in_text'} entry per mark, in text order.
        # offset_in_text is measured over the whole text, not per chunk.
        self.positions = v'[]'
        # Index into self.positions from which mark_word() continues scanning
        self.last_pos = 0
        # Chunks still to be spoken, self.queue[0] is the current one
        self.queue = v'[]'

    def parse_marked_text(self, marked_text):
        # Rebuild the queue and the mark positions from marked_text and
        # return the text of the first chunk ('' if there is no text).
        self.clear()
        text = v'[]'
        text_len = chunk_len = index_in_positions = offset_in_text = 0
        limit = 4096
        for x in marked_text:
            if jstype(x) is 'number':
                self.positions.push({'mark': x, 'offset_in_text': text_len})
            else:
                text_len += x.length
                chunk_len += x.length
                text.push(x)
                if chunk_len > limit:
                    self.queue.push({
                        'text': ''.join(text), 'index_in_positions': index_in_positions,
                        'offset_in_text': offset_in_text,
                    })
                    chunk_len = 0
                    text = v'[]'
                    # Start the next chunk at the last mark seen so far.
                    # Clamp to zero, as there may be no marks yet and a
                    # negative index would break resume()/mark_word().
                    index_in_positions = Math.max(0, self.positions.length - 1)
                    # Where the next chunk starts in the whole text
                    offset_in_text = text_len
        if text.length:
            self.queue.push({
                'text': ''.join(text), 'index_in_positions': index_in_positions,
                'offset_in_text': offset_in_text,
            })
        self.marked_text = marked_text
        return self.current_text()

    def pop_first(self):
        # Discard the current chunk, making the next one (if any) current
        self.queue.splice(0, 1)

    def current_text(self):
        # Text of the current chunk or '' when nothing is queued
        if self.queue.length:
            return self.queue[0].text
        return ''

    def resume(self):
        # Rewind mark scanning to the start of the current chunk and return
        # its text, so that the chunk can be spoken again from its beginning.
        self.last_pos = 0
        if self.queue.length:
            self.last_pos = self.queue[0].index_in_positions
        return self.current_text()

    def mark_word(self, start, length):
        # start and length describe a range in the text of the current
        # chunk. Returns (first_mark, last_mark) for the marks that fall
        # inside that range, or None if there are none.
        if not self.queue.length:
            return None
        # The range is relative to the chunk being spoken while the stored
        # positions are relative to the whole text, so translate it.
        start += self.queue[0].offset_in_text
        # A missing length would make end NaN, every comparison below would
        # then fail and the loop would swallow all remaining positions.
        end = start + (length or 0)
        matches = v'[]'
        while self.last_pos < self.positions.length:
            pos = self.positions[self.last_pos]
            if start <= pos.offset_in_text < end:
                matches.push(pos)
            elif pos.offset_in_text >= end:
                # Leave this position for the next call
                break
            self.last_pos += 1
        if matches.length:
            return matches[0].mark, matches[-1].mark
        return None
class Client:
@ -30,7 +81,7 @@ class Client:
def __init__(self):
self.stop_requested_at = None
self.status = {'synthesizing': False, 'paused': False}
self.queue = v'[]'
self.tracker = Tracker(v'[]')
self.last_reached_mark = None
self.onevent = def():
pass
@ -38,19 +89,14 @@ class Client:
self.current_voice_uri = data.voice or ''
self.current_rate = data.rate or None
def create_utterance(self, text_or_ssml, wrap_in_ssml):
if wrap_in_ssml:
text_or_ssml = (
'<?xml version="1.0"?>\n<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis"><s>' +
text_or_ssml +
'</s></speak>')
ut = new window.SpeechSynthesisUtterance(text_or_ssml)
def create_utterance(self, text):
ut = new window.SpeechSynthesisUtterance(text)
ut.onstart = self.utterance_started
ut.onpause = self.utterance_paused
ut.onend = self.utterance_ended
ut.onerror = self.utterance_failed
ut.onmark = self.utterance_mark_reached
ut.onresume = self.utterance_resumed
ut.addEventListener('boundary', self.utterance_boundary_reached)
if self.current_voice_uri:
for voice in window.speechSynthesis.getVoices():
if voice.voiceURI is self.current_voice_uri:
@ -58,7 +104,6 @@ class Client:
break
if self.current_rate:
ut.rate = self.current_rate
self.queue.push(ut)
return ut
def utterance_started(self, event):
@ -74,22 +119,27 @@ class Client:
if self.stop_requested_at? and window.performance.now() - self.stop_requested_at < 1000:
self.stop_requested_at = None
return
self.queue.splice(0, 1)
if self.queue.length:
window.speechSynthesis.speak(self.queue[0])
self.tracker.pop_first()
text = self.tracker.current_text()
if text and text.length:
window.speechSynthesis.speak(text)
else:
self.onevent('end')
def utterance_failed(self, event):
self.status = {'synthesizing': False, 'paused': False}
self.queue = v'[]'
self.tracker.clear()
if event.error is not 'interrupted':
error_dialog(_('Speaking failed'), _(
'An error has occurred with speech synthesis: ' + event.error))
self.onevent('cancel')
def utterance_mark_reached(self, event):
self.last_reached_mark = event.name
self.onevent('mark', int(event.name))
def utterance_boundary_reached(self, event):
    # Listener for the 'boundary' event of an utterance. For word boundaries,
    # ask the tracker which marks lie in the reported character range and,
    # if there are any, report them via self.onevent as a 'mark' event.
    if event.name is not 'word':
        return
    marks = self.tracker.mark_word(event.charIndex, event.charLength)
    if marks:
        self.onevent('mark', {'first': marks[0], 'last': marks[1]})
def utterance_resumed(self, event):
self.status = {'synthesizing': True, 'paused': False}
@ -102,61 +152,28 @@ class Client:
window.speechSynthesis.resume()
def resume_after_configure(self):
    # Start speaking the tracker's current chunk again from its beginning.
    # Does nothing when the tracker has no text queued.
    text = self.tracker.resume()
    if text and text.length:
        # speak() accepts only a SpeechSynthesisUtterance, so the text has
        # to be wrapped in an utterance rather than passed as a string.
        window.speechSynthesis.speak(self.create_utterance(text))
def stop(self):
self.queue = v'[]'
self.tracker.clear()
self.stop_requested_at = window.performance.now()
window.speechSynthesis.cancel()
self.status = {'synthesizing': False, 'paused': False}
def speak_simple_text(self, text):
self.stop()
while text.length > 32766:
self.create_utterance(text[:32766])
text = text[32766:]
if text:
self.create_utterance(text)
if self.queue.length:
window.speechSynthesis.speak(self.queue[0])
text = self.tracker.parse_marked_text(v'[text]')
if text and text.length:
window.speechSynthesis.speak(self.create_utterance(text))
def speak_marked_text(self, text_segments, onevent):
self.stop()
self.onevent = onevent
buf = v'[]'
size = 0
limit = 2048
def commit():
nonlocal buf, size
text = buf.join('')
if text.length:
self.create_utterance(text)
buf = v'[]'
size = 0
for x in text_segments:
if jstype(x) is 'number':
# Currently the sad sack browsers dont support SSML
# https://github.com/WICG/speech-api/issues/37
# buf.push()
# markup = '<mark name="' + x + '"/>'
continue
else:
if x.length > limit:
commit()
while x.length:
self.create_utterance(x[:limit])
x = x[limit:]
continue
if size + x.length > limit:
commit()
buf.push(x)
size += x.length
commit()
if self.queue.length:
window.speechSynthesis.speak(self.queue[0])
text = self.tracker.parse_marked_text(text_segments)
if text and text.length:
window.speechSynthesis.speak(self.create_utterance(text))
def faster(self):
self.change_rate(steps=1)
@ -167,13 +184,6 @@ class Client:
def apply_settings(self):
sd = get_session_data()
sd.set('tts_backend', {'voice': self.current_voice_uri, 'rate': self.current_rate})
existing = self.queue
if self.queue and self.queue.length:
if self.status.paused:
window.speechSynthesis.resume()
self.stop()
for ut in existing:
self.create_utterance(ut.text)
def change_rate(self, steps=1):
rate = current_rate = (self.current_rate or 1) * 10