TTS: Fix word segmentation not working with non-ASCII characters, because \w in JavaScript's regex engine is not Unicode-aware

2025-07-09 03:04:10 -04:00 · 2020-12-10 22:43:56 +05:30 · 2020-12-10 22:43:56 +05:30 · eedd6c7751
commit eedd6c7751
parent 08d7840d21
2 changed files with 6 additions and 4 deletions
--- a/src/calibre/gui2/tts/windows.py
+++ b/src/calibre/gui2/tts/windows.py
@ -99,8 +99,6 @@ class Client:
        from calibre_extensions.winsapi import (
            SPF_ASYNC, SPF_IS_NOT_XML, SPF_PURGEBEFORESPEAK, SPF_IS_XML
        )
-        import unicodedata
-        text = unicodedata.normalize('NFC', text)
        flags = SPF_IS_XML if is_xml else SPF_IS_NOT_XML
        self.current_stream_number = self.sp_voice.speak(text, flags | SPF_PURGEBEFORESPEAK | SPF_ASYNC, want_events)
        return self.current_stream_number
--- a/src/pyj/read_book/find.pyj
+++ b/src/pyj/read_book/find.pyj
@ -44,6 +44,10 @@ def index_for_node(node, node_list):
            return entry.offset


+def tts_word_regex():
+    return /[\p{L}\p{M}]+/gu  # JS \w is not Unicode-aware, so match runs of Unicode letters (L) and marks (M) instead
+
+
 def tts_data(text_node, offset):
    offset_in_flat_text = offset or 0
    if not cache.text_map:
@ -55,7 +59,7 @@ def tts_data(text_node, offset):
    last = None
    marked_text = v'[]'
    text = cache.text_map.flat_text[offset_in_flat_text:]
-    for v'match of text.matchAll(/\w+/g)':
+    for v'match of text.matchAll(tts_word_regex())':
        start = match.index
        if first:
            first = False
@ -162,7 +166,7 @@ def select_tts_mark(idx_in_flat_text):
    window.getSelection().removeAllRanges()
    if not cache.text_map:
        cache.text_map = build_text_map()
-    r = /\w+/g
+    r = tts_word_regex()
    r.lastIndex = idx_in_flat_text
    match = v'r.exec(cache.text_map.flat_text)'
    word_length = 5