Function to unwrap marked sentences

2025-11-30 02:05:04 -05:00 · 2024-10-15 16:53:58 +05:30 · 2024-10-15 16:53:58 +05:30 · 7e709b4fc7
commit 7e709b4fc7
parent 6a5799edd9
3 changed files with 24 additions and 8 deletions
--- a/src/calibre/ebooks/html_transform_rules.py
+++ b/src/calibre/ebooks/html_transform_rules.py
@@ -5,13 +5,13 @@
 import re
 from functools import partial

-from css_selectors.select import Select, get_parsed_selector
 from html5_parser import parse
 from lxml import etree

 from calibre.ebooks.metadata.tag_mapper import uniq
 from calibre.ebooks.oeb.base import OEB_DOCS, XPath
 from calibre.ebooks.oeb.parse_utils import XHTML
+from css_selectors.select import Select, get_parsed_selector


 def non_empty_validator(label, val):
--- a/src/calibre/ebooks/oeb/polish/tests/structure.py
+++ b/src/calibre/ebooks/oeb/polish/tests/structure.py
@@ -197,7 +197,14 @@ class Structure(BaseTest):
        from html5_parser import parse
        from lxml import html

-        from calibre.ebooks.oeb.polish.tts import mark_sentences_in_html
+        from calibre.ebooks.oeb.polish.tts import id_prefix, mark_sentences_in_html, unmark_sentences_in_html
+
+        def normalize_markup(root):
+            actual = html.tostring(root, encoding='unicode')
+            actual = actual[actual.find('<body'):]
+            actual = actual[:actual.rfind('</body>')]
+            return actual.replace(id_prefix, '')
+
        for text, expected in reversed({
            '<p id=1>hello cruel world': '<body><p id="1"><span id="1">hello cruel world</span></p>',

@@ -234,12 +241,12 @@ class Structure(BaseTest):
            '<body><p><span id="1">Hello, </span><span data-calibre-tts="moose"><span id="2">world!</span></span></p>',
        }.items()):
            root = parse(text, namespace_elements=True)
+            orig = normalize_markup(root)
            mark_sentences_in_html(root)
-            actual = html.tostring(root, encoding='unicode')
-            actual = actual[actual.find('<body'):]
-            actual = actual[:actual.rfind('</body>')]
-            actual = actual.replace('cttsw-', '')
-            self.assertEqual(expected, actual)
+            marked = normalize_markup(root)
+            self.assertEqual(expected, marked)
+            unmark_sentences_in_html(root)
+            self.assertEqual(orig, normalize_markup(root), f'Unmarking failed for {marked}')
        sentences = mark_sentences_in_html(parse('<p lang="en">Hello, <span lang="fr">world!'))
        self.assertEqual(tuple(s.lang for s in sentences), ('eng', 'fra'))

--- a/src/calibre/ebooks/oeb/polish/tts.py
+++ b/src/calibre/ebooks/oeb/polish/tts.py
@@ -9,6 +9,7 @@ from typing import NamedTuple
 from lxml.etree import ElementBase as Element
 from lxml.etree import tostring as _tostring

+from calibre.ebooks.html_transform_rules import unwrap_tag
 from calibre.ebooks.oeb.base import barename
 from calibre.spell.break_iterator import sentence_positions
 from calibre.utils.localization import canonicalize_lang, get_lang
@@ -51,6 +52,14 @@ continued_tag_names = frozenset({
 ignored_tag_names = frozenset({
    'img', 'object', 'script', 'style', 'head', 'title', 'form', 'input', 'br', 'hr', 'map', 'textarea', 'svg', 'math', 'rp', 'rt', 'rtc',
 })
+id_prefix = 'cttsw-'
+
+
+def unmark_sentences_in_html(root):
+    for x in root.xpath(f'//*[starts-with(@id, "{id_prefix}")]'):
+        x.attrib.pop('id')
+        if not x.attrib and x.tag and x.tag.endswith('span'):
+            unwrap_tag(x)


 def mark_sentences_in_html(root, lang: str = '', voice: str = '') -> list[Sentence]:
@@ -106,7 +115,7 @@ def mark_sentences_in_html(root, lang: str = '', voice: str = '') -> list[Senten
        def make_into_wrapper(self, elem: Element) -> str:
            nonlocal id_counter
            while True:
-                q = f'cttsw-{id_counter}'
+                q = f'{id_prefix}{id_counter}'
                if q not in seen_ids:
                    elem.set('id', q)
                    seen_ids.add(q)