TTS: Fix word segmentation not working with non-ASCII characters because JavaScript's regex engine has a non-Unicode-aware \w character class

This commit is contained in:
Kovid Goyal 2020-12-10 22:43:56 +05:30
parent 08d7840d21
commit eedd6c7751
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 6 additions and 4 deletions

View File

@ -99,8 +99,6 @@ class Client:
from calibre_extensions.winsapi import (
SPF_ASYNC, SPF_IS_NOT_XML, SPF_PURGEBEFORESPEAK, SPF_IS_XML
)
import unicodedata
text = unicodedata.normalize('NFC', text)
flags = SPF_IS_XML if is_xml else SPF_IS_NOT_XML
self.current_stream_number = self.sp_voice.speak(text, flags | SPF_PURGEBEFORESPEAK | SPF_ASYNC, want_events)
return self.current_stream_number

View File

@ -44,6 +44,10 @@ def index_for_node(node, node_list):
return entry.offset
def tts_word_regex():
    # Build the regular expression used to split flat text into "words" for
    # text-to-speech marks.
    #
    # JavaScript's \w only matches [A-Za-z0-9_], even with the `u` flag, so
    # Unicode property escapes are used instead: letters (\p{L}) and combining
    # marks (\p{M}), plus numbers (\p{N}) so that digit runs, which \w used to
    # match, are still treated as words.
    #
    # A new RegExp object is returned on every call because the `g` flag makes
    # the object stateful: callers assign to and advance its lastIndex.
    return /[\p{L}\p{M}\p{N}]+/gu
def tts_data(text_node, offset):
offset_in_flat_text = offset or 0
if not cache.text_map:
@ -55,7 +59,7 @@ def tts_data(text_node, offset):
last = None
marked_text = v'[]'
text = cache.text_map.flat_text[offset_in_flat_text:]
for v'match of text.matchAll(/\w+/g)':
for v'match of text.matchAll(tts_word_regex())':
start = match.index
if first:
first = False
@ -162,7 +166,7 @@ def select_tts_mark(idx_in_flat_text):
window.getSelection().removeAllRanges()
if not cache.text_map:
cache.text_map = build_text_map()
r = /\w+/g
r = tts_word_regex()
r.lastIndex = idx_in_flat_text
match = v'r.exec(cache.text_map.flat_text)'
word_length = 5