...

2025-12-14 17:15:06 -05:00 · 2024-10-15 16:19:14 +05:30 · 2024-10-15 16:19:14 +05:30 · 84aa726181
commit 84aa726181
parent 690988033e
1 changed files with 37 additions and 35 deletions
--- a/src/calibre/ebooks/oeb/polish/tts.py
+++ b/src/calibre/ebooks/oeb/polish/tts.py
@@ -1,11 +1,17 @@
 #!/usr/bin/env python
 # License: GPLv3 Copyright: 2024, Kovid Goyal <kovid at kovidgoyal.net>

+import json
 from collections import defaultdict
 from contextlib import suppress
 from typing import NamedTuple

+from lxml.etree import ElementBase as Element
+from lxml.etree import tostring as _tostring
+
+from calibre.ebooks.oeb.base import barename
 from calibre.spell.break_iterator import sentence_positions
+from calibre.utils.localization import canonicalize_lang, get_lang


 class Sentence(NamedTuple):
@@ -15,36 +21,39 @@ class Sentence(NamedTuple):
    voice : str


+def tostring(x) -> str:  # Serialize x with lxml's tostring, returning text (str) rather than bytes
+    return _tostring(x, encoding='unicode')  # encoding='unicode' is what makes lxml hand back a str
+
+
+def lang_for_elem(elem, parent_lang):  # Language for elem: first non-empty of its lang / xml_lang / xml:lang attributes, canonicalized; parent_lang when that yields nothing truthy
+    return canonicalize_lang(elem.get('lang') or elem.get('xml_lang') or elem.get('{http://www.w3.org/XML/1998/namespace}lang')) or parent_lang  # NOTE(review): 'xml_lang' (underscore) presumably matches a parser-sanitized attribute name — confirm
+
+
+def has_text(elem):  # True if elem directly holds non-whitespace text; the text inside child elements is not examined
+    if elem.text and elem.text.strip():  # text before the first child
+        return True
+    for child in elem:  # direct children only
+        if child.tail and child.tail.strip():  # text that follows a direct child, still inside elem
+            return True
+    return False
+
+
+class Chunk(NamedTuple):  # Immutable record carrying a piece of text together with where it came from
+    child: Element | None  # presumably the child element the text is associated with, None otherwise — TODO confirm against users
+    text: str
+    start_at: int  # presumably an offset at which this text starts — TODO confirm against users
+    is_tail: bool = False  # presumably True when the text was taken from child.tail — TODO confirm
+
+
+continued_tag_names = frozenset({  # inline-level HTML tag names; presumably tags that do not interrupt a sentence — TODO confirm usage
+    'a', 'span', 'em', 'strong', 'b', 'i', 'u', 'code', 'sub', 'sup', 'cite', 'q', 'kbd'
+})
+ignored_tag_names = frozenset({  # presumably tags whose content is skipped when marking sentences — TODO confirm usage
+    'img', 'object', 'script', 'style', 'head', 'title', 'form', 'input', 'br', 'hr', 'map', 'textarea', 'svg', 'math', 'rp', 'rt', 'rtc',
+})
+

 def mark_sentences_in_html(root, lang: str = '', voice: str = '') -> list[Sentence]:
-    import json
-
-    from lxml.etree import ElementBase as Element
-    from lxml.etree import tostring as _tostring
-
-    from calibre.ebooks.oeb.base import barename
-    from calibre.utils.localization import canonicalize_lang, get_lang
-    continued_tag_names = frozenset({
-        'a', 'span', 'em', 'strong', 'b', 'i', 'u', 'code', 'sub', 'sup', 'cite', 'q', 'kbd'
-    })
-    ignored_tag_names = frozenset({
-        'img', 'object', 'script', 'style', 'head', 'title', 'form', 'input', 'br', 'hr', 'map', 'textarea', 'svg', 'math', 'rp', 'rt', 'rtc',
-    })
-
-    def tostring(x) -> str:
-        return _tostring(x, encoding='unicode')
-
-    def lang_for_elem(elem, parent_lang):
-        return canonicalize_lang(elem.get('lang') or elem.get('xml_lang') or elem.get('{http://www.w3.org/XML/1998/namespace}lang')) or parent_lang
-
-    def has_text(elem):
-        if elem.text and elem.text.strip():
-            return True
-        for child in elem:
-            if child.tail and child.tail.strip():
-                return True
-        return False
-
    root_lang = canonicalize_lang(lang_for_elem(root, canonicalize_lang(lang or get_lang())) or 'en')
    root_voice = voice
    seen_ids = set(root.xpath('//*/@id'))
@@ -52,13 +61,6 @@ def mark_sentences_in_html(root, lang: str = '', voice: str = '') -> list[Senten
    ans = []
    clones_map = defaultdict(list)

-    class Chunk(NamedTuple):
-        child: Element | None
-        text: str
-        start_at: int
-        is_tail: bool = False
-
-
    class Parent:

        def __init__(self, elem, tag_name, parent_lang, parent_voice, child_lang=''):