Get basic TTS playback working in browser viewer

Sadly, there is no support for marking words, so highlighting the currently spoken word will require gymnastics.
2025-07-07 10:14:46 -04:00 · 2020-12-03 19:56:08 +05:30 · 2020-12-03 19:56:08 +05:30 · ce0a57b69e
commit ce0a57b69e
parent 5c9e597fb3
3 changed files with 148 additions and 8 deletions
--- a/src/pyj/read_book/read_aloud.pyj
+++ b/src/pyj/read_book/read_aloud.pyj
@ -173,6 +173,8 @@ class ReadAloud:
            self.send_message('mark', num=data)
        elif which is 'begin':
            self.state = PLAYING
+        elif which is 'end':
+            pass

    def send_message(self, type, **kw):
        self.view.iframe_wrapper.send_message('tts', type=type, **kw)
--- a/src/pyj/read_book/tts.pyj
+++ b/src/pyj/read_book/tts.pyj
@ -0,0 +1,120 @@
+# vim:fileencoding=utf-8
+# License: GPL v3 Copyright: 2020, Kovid Goyal <kovid at kovidgoyal.net>
+from __python__ import bound_methods, hash_literals
+
+from gettext import gettext as _
+from modals import error_dialog
+
+
+def escaper():
+    # Build, once, a detached XML document holding a single text node and a
+    # serializer. The returned function stores its argument in that text node
+    # and returns the serializer's output for the node.
+    xml_doc = document.implementation.createDocument("", "", None)
+    holder = xml_doc.createElement("temp")
+    # Assigning textContent creates the child text node that is reused below
+    holder.textContent = "temp"
+    text_node = holder.firstChild
+    serializer = new XMLSerializer()  # noqa
+
+    def escape(text):
+        text_node.nodeValue = text
+        return serializer.serializeToString(text_node)
+    return escape
+escape_for_xml = escaper()
+
+
+class Client:
+    # Wrapper around the browser's window.speechSynthesis API. Utterances are
+    # kept in self.queue; the head of the queue is handed to the synthesizer
+    # and, when it ends, the next one is started. Playback events are reported
+    # by calling self.onevent(event_name, optional_data).
+
+    def __init__(self):
+        self.status = {'synthesizing': False, 'paused': False}
+        self.queue = v'[]'
+        self.last_reached_mark = None
+        # No-op until speak_marked_text() installs the caller's callback
+        self.onevent = def():
+            pass
+
+    def create_utterance(self, text_or_ssml, wrap_in_ssml):
+        # Create an utterance for the given text, hook up all event handlers
+        # and append it to the queue. When wrap_in_ssml is truthy the text is
+        # first wrapped in a <speak><s>...</s></speak> SSML document.
+        if wrap_in_ssml:
+            text_or_ssml = (
+                '<?xml version="1.0"?>\n<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis"><s>' +
+                text_or_ssml +
+                '</s></speak>')
+        ut = new window.SpeechSynthesisUtterance(text_or_ssml)
+        ut.onstart = self.utterance_started
+        ut.onpause = self.utterance_paused
+        ut.onend = self.utterance_ended
+        ut.onerror = self.utterance_failed
+        ut.onmark = self.utterance_mark_reached
+        ut.onresume = self.utterance_resumed
+        self.queue.push(ut)
+        return ut
+
+    def utterance_started(self, event):
+        self.status = {'synthesizing': True, 'paused': False}
+        self.onevent('begin')
+
+    def utterance_paused(self, event):
+        self.status = {'synthesizing': True, 'paused': True}
+        self.onevent('pause')
+
+    def utterance_ended(self, event):
+        # Drop the finished utterance and start the next one; 'end' is only
+        # reported once the whole queue has been spoken.
+        self.status = {'synthesizing': False, 'paused': False}
+        self.queue.splice(0, 1)
+        if self.queue.length:
+            window.speechSynthesis.speak(self.queue[0])
+        else:
+            self.onevent('end')
+
+    def utterance_failed(self, event):
+        # Error handler for utterances: reset the state, discard anything
+        # still queued and report 'cancel'.
+        self.status = {'synthesizing': False, 'paused': False}
+        self.queue = v'[]'
+        # The Web Speech API reports utterances removed by cancel() (which
+        # stop() calls) through the error event, using the codes 'canceled'
+        # and 'interrupted'. Those are not failures, so no dialog is shown.
+        if event.error is not 'canceled' and event.error is not 'interrupted':
+            # Only the constant part goes through gettext: a message with the
+            # error code already appended can never match a catalog entry.
+            error_dialog(_('Speaking failed'), _(
+                'An error has occurred with speech synthesis: ') + event.error)
+        self.onevent('cancel')
+
+    def utterance_mark_reached(self, event):
+        self.last_reached_mark = event.name
+        self.onevent('mark', int(event.name))
+
+    def utterance_resumed(self, event):
+        self.status = {'synthesizing': True, 'paused': False}
+        self.onevent('resume')
+
+    def pause(self):
+        window.speechSynthesis.pause()
+
+    def resume(self):
+        window.speechSynthesis.resume()
+
+    def stop(self):
+        # Cancel everything handed to the synthesizer and forget the queue
+        window.speechSynthesis.cancel()
+        self.queue = v'[]'
+        self.status = {'synthesizing': False, 'paused': False}
+
+    def speak_simple_text(self, text):
+        # Speak plain text, replacing whatever is currently being spoken. The
+        # text is split into utterances of at most 32766 characters.
+        self.stop()
+        while text.length > 32766:
+            self.create_utterance(text[:32766])
+            text = text[32766:]
+        if text:
+            self.create_utterance(text)
+        if self.queue.length:
+            window.speechSynthesis.speak(self.queue[0])
+
+    def speak_marked_text(self, text_segments, onevent):
+        # Speak a list whose items are either text strings or numbers (mark
+        # positions), replacing whatever is currently being spoken. Events
+        # are delivered to the onevent callback.
+        self.stop()
+        self.onevent = onevent
+        buf = v'[]'
+        size = 0
+        for x in text_segments:
+            if jstype(x) is 'number':
+                # Currently the sad sack browsers don't support SSML
+                # https://github.com/WICG/speech-api/issues/37
+                # buf.push('<mark name="' + x + '"/>')
+                buf.push('')
+            else:
+                buf.push(escape_for_xml(x))
+            size += buf[-1].length
+            if size > 24000:
+                # Queue the accumulated text BEFORE resetting the buffer,
+                # otherwise the utterance is created from an empty buffer and
+                # the text collected so far is never spoken.
+                self.create_utterance(buf.join(''), True)
+                buf = v'[]'
+                size = 0
+        text = buf.join('')
+        if text.length:
+            # NOTE(review): this final chunk is queued without the SSML
+            # wrapper, unlike the intermediate chunks above, although its
+            # text is XML escaped as well - confirm which form is intended.
+            self.create_utterance(text)
+        if self.queue.length:
+            window.speechSynthesis.speak(self.queue[0])
--- a/src/pyj/read_book/ui.pyj
+++ b/src/pyj/read_book/ui.pyj
@ -3,19 +3,20 @@
 # globals: __RENDER_VERSION__
 from __python__ import hash_literals

-import traceback
 from elementmaker import E
-from gettext import gettext as _

+import traceback
 from ajax import ajax, ajax_send
 from book_list.constants import read_book_container_id
 from book_list.library_data import current_library_id, library_data
 from book_list.router import home, push_state, read_book_mode, update_window_title
 from book_list.ui import show_panel
 from dom import clear
+from gettext import gettext as _
 from modals import create_simple_dialog_markup, error_dialog
 from read_book.db import get_db
 from read_book.globals import ui_operations
+from read_book.tts import Client
 from read_book.view import View
 from utils import debounce, full_screen_element, human_readable, request_full_screen
 from widgets import create_button
@ -53,6 +54,7 @@ class ReadUI:
            id=self.display_id, style='display:none',
        ))
        self.view = View(container.lastChild)
+        self.tts_client = Client()
        self.windows_to_listen_for_messages_from = []
        window.addEventListener('resize', debounce(self.on_resize.bind(self), 250))
        window.addEventListener('message', self.message_from_other_window.bind(self))
@ -83,6 +85,8 @@ class ReadUI:
        ui_operations.close_book = self.close_book.bind(self)
        ui_operations.copy_image = self.copy_image.bind(self)
        ui_operations.view_image = self.view_image.bind(self)
+        ui_operations.speak_simple_text = self.speak_simple_text.bind(self)
+        ui_operations.tts = self.tts.bind(self)
        ui_operations.open_url = def(url):
            window.open(url, '_blank')
        ui_operations.copy_selection = def(text, html):
@ -115,12 +119,6 @@ class ReadUI:
            window.navigator.clipboard.writeText(text or '').then(def (): pass;, def():
                error_dialog(_('Could not copy to clipboard'), _('No permission to write to clipboard'))
            )
-        ui_operations.speak_simple_text = def (text):
-            if not window.speechSynthesis:
-                return error_dialog(_('No speech support'), _(
-                    'Your browser does not have support for Text-to-Speech'))
-            ut = new SpeechSynthesisUtterance(text)  # noqa
-            window.speechSynthesis.speak(ut)

    def on_resize(self):
        self.view.on_resize()
@ -610,3 +608,23 @@ class ReadUI:
            w, callback = x
            if w is msg.source:
                callback(msg)
+
+    def check_for_speech_capability(self):
+        # Return True when the browser exposes window.speechSynthesis.
+        # Otherwise show an error dialog to the user and return False.
+        if window.speechSynthesis:
+            return True
+        error_dialog(_('No speech support'), _(
+            'Your browser does not have support for Text-to-Speech'))
+        return False
+
+    def speak_simple_text(self, text):
+        # Speak plain text through the TTS client, provided the browser
+        # supports speech synthesis (an error dialog is shown otherwise).
+        if self.check_for_speech_capability():
+            self.tts_client.speak_simple_text(text)
+
+    def tts(self, event, data):
+        # Handle a TTS request. 'play' starts speaking data.marked_text, with
+        # playback events delivered to the view's read aloud handler. Any
+        # other event name is called as a no-argument method of the TTS
+        # client.
+        if not self.check_for_speech_capability():
+            return
+        client = self.tts_client
+        if event is not 'play':
+            getattr(client, event)()
+            return
+        client.speak_marked_text(data.marked_text, self.view.read_aloud.handle_tts_event)