diff --git a/src/calibre/ebooks/html_transform_rules.py b/src/calibre/ebooks/html_transform_rules.py index f5e2634c88..0d3187f1c1 100644 --- a/src/calibre/ebooks/html_transform_rules.py +++ b/src/calibre/ebooks/html_transform_rules.py @@ -5,13 +5,13 @@ import re from functools import partial -from css_selectors.select import Select, get_parsed_selector from html5_parser import parse from lxml import etree from calibre.ebooks.metadata.tag_mapper import uniq from calibre.ebooks.oeb.base import OEB_DOCS, XPath from calibre.ebooks.oeb.parse_utils import XHTML +from css_selectors.select import Select, get_parsed_selector def non_empty_validator(label, val): diff --git a/src/calibre/ebooks/oeb/polish/tests/structure.py b/src/calibre/ebooks/oeb/polish/tests/structure.py index 491abe781b..01cbc566a6 100644 --- a/src/calibre/ebooks/oeb/polish/tests/structure.py +++ b/src/calibre/ebooks/oeb/polish/tests/structure.py @@ -197,7 +197,14 @@ class Structure(BaseTest): from html5_parser import parse from lxml import html - from calibre.ebooks.oeb.polish.tts import mark_sentences_in_html + from calibre.ebooks.oeb.polish.tts import id_prefix, mark_sentences_in_html, unmark_sentences_in_html + + def normalize_markup(root): + actual = html.tostring(root, encoding='unicode') + actual = actual[actual.find('
')] + return actual.replace(id_prefix, '') + for text, expected in reversed({ 'hello cruel world': '
hello cruel world
', @@ -234,12 +241,12 @@ class Structure(BaseTest): 'Hello, world!
', }.items()): root = parse(text, namespace_elements=True) + orig = normalize_markup(root) mark_sentences_in_html(root) - actual = html.tostring(root, encoding='unicode') - actual = actual[actual.find('')] - actual = actual.replace('cttsw-', '') - self.assertEqual(expected, actual) + marked = normalize_markup(root) + self.assertEqual(expected, marked) + unmark_sentences_in_html(root) + self.assertEqual(orig, normalize_markup(root), f'Unmarking failed for {marked}') sentences = mark_sentences_in_html(parse('Hello, world!')) self.assertEqual(tuple(s.lang for s in sentences), ('eng', 'fra')) diff --git a/src/calibre/ebooks/oeb/polish/tts.py b/src/calibre/ebooks/oeb/polish/tts.py index c76143eca3..6a14d306de 100644 --- a/src/calibre/ebooks/oeb/polish/tts.py +++ b/src/calibre/ebooks/oeb/polish/tts.py @@ -9,6 +9,7 @@ from typing import NamedTuple from lxml.etree import ElementBase as Element from lxml.etree import tostring as _tostring +from calibre.ebooks.html_transform_rules import unwrap_tag from calibre.ebooks.oeb.base import barename from calibre.spell.break_iterator import sentence_positions from calibre.utils.localization import canonicalize_lang, get_lang @@ -51,6 +52,14 @@ continued_tag_names = frozenset({ ignored_tag_names = frozenset({ 'img', 'object', 'script', 'style', 'head', 'title', 'form', 'input', 'br', 'hr', 'map', 'textarea', 'svg', 'math', 'rp', 'rt', 'rtc', }) +id_prefix = 'cttsw-' + + +def unmark_sentences_in_html(root): + for x in root.xpath(f'//*[starts-with(@id, "{id_prefix}")]'): + x.attrib.pop('id') + if not x.attrib and x.tag and x.tag.endswith('span'): + unwrap_tag(x) def mark_sentences_in_html(root, lang: str = '', voice: str = '') -> list[Sentence]: @@ -106,7 +115,7 @@ def mark_sentences_in_html(root, lang: str = '', voice: str = '') -> list[Senten def make_into_wrapper(self, elem: Element) -> str: nonlocal id_counter while True: - q = f'cttsw-{id_counter}' + q = f'{id_prefix}{id_counter}' if q not in seen_ids: elem.set('id', q) seen_ids.add(q)