mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
TTS: Fix word segmentation not working with non-ASCII characters, because the \w character class in JavaScript's regex engine is not Unicode-aware
This commit is contained in:
parent
08d7840d21
commit
eedd6c7751
@ -99,8 +99,6 @@ class Client:
|
||||
from calibre_extensions.winsapi import (
|
||||
SPF_ASYNC, SPF_IS_NOT_XML, SPF_PURGEBEFORESPEAK, SPF_IS_XML
|
||||
)
|
||||
import unicodedata
|
||||
text = unicodedata.normalize('NFC', text)
|
||||
flags = SPF_IS_XML if is_xml else SPF_IS_NOT_XML
|
||||
self.current_stream_number = self.sp_voice.speak(text, flags | SPF_PURGEBEFORESPEAK | SPF_ASYNC, want_events)
|
||||
return self.current_stream_number
|
||||
|
@ -44,6 +44,10 @@ def index_for_node(node, node_list):
|
||||
return entry.offset
|
||||
|
||||
|
||||
# Return the regular expression used to split flat text into words for TTS.
# The pattern matches runs of Unicode letters (\p{L}) and combining marks
# (\p{M}); the `u` flag enables the \p{...} property escapes and the `g` flag
# makes the regex usable with matchAll() and with a caller-set lastIndex.
# Note that, unlike \w, this class does not match digits or underscores.
def tts_word_regex():
|
||||
return /[\p{L}\p{M}]+/gu
|
||||
|
||||
|
||||
def tts_data(text_node, offset):
|
||||
offset_in_flat_text = offset or 0
|
||||
if not cache.text_map:
|
||||
@ -55,7 +59,7 @@ def tts_data(text_node, offset):
|
||||
last = None
|
||||
marked_text = v'[]'
|
||||
text = cache.text_map.flat_text[offset_in_flat_text:]
|
||||
for v'match of text.matchAll(/\w+/g)':
|
||||
for v'match of text.matchAll(tts_word_regex())':
|
||||
start = match.index
|
||||
if first:
|
||||
first = False
|
||||
@ -162,7 +166,7 @@ def select_tts_mark(idx_in_flat_text):
|
||||
window.getSelection().removeAllRanges()
|
||||
if not cache.text_map:
|
||||
cache.text_map = build_text_map()
|
||||
r = /\w+/g
|
||||
r = tts_word_regex()
|
||||
r.lastIndex = idx_in_flat_text
|
||||
match = v'r.exec(cache.text_map.flat_text)'
|
||||
word_length = 5
|
||||
|
Loading…
x
Reference in New Issue
Block a user