From 5e9f0cc563dc35a0e95eb5976ae021431f8b5bbe Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 20 Feb 2025 14:02:32 +0530 Subject: [PATCH] Code to unwrap kobo spans --- src/calibre/ebooks/oeb/polish/kepubify.py | 27 ++++++++++++++++++- .../ebooks/oeb/polish/tests/kepubify.py | 16 ++++++----- 2 files changed, 36 insertions(+), 7 deletions(-) diff --git a/src/calibre/ebooks/oeb/polish/kepubify.py b/src/calibre/ebooks/oeb/polish/kepubify.py index 20193d1657..164846c8ca 100644 --- a/src/calibre/ebooks/oeb/polish/kepubify.py +++ b/src/calibre/ebooks/oeb/polish/kepubify.py @@ -27,6 +27,7 @@ from calibre.utils.localization import canonicalize_lang, get_lang KOBO_STYLE_HACKS = 'kobostylehacks' OUTER_DIV_ID = 'book-columns' INNER_DIV_ID = 'book-inner' +KOBO_SPAN_CLASS = 'koboSpan' SKIPPED_TAGS = frozenset(( '', 'script', 'style', 'atom', 'pre', 'audio', 'video', 'svg', 'math' )) @@ -103,7 +104,7 @@ def add_kobo_spans(inner, root_lang): def kobo_span(parent): nonlocal paranum, segnum segnum += 1 - return parent.makeelement(span_tag_name, attrib={'class': 'koboSpan', 'id': f'kobo.{paranum}.{segnum}'}) + return parent.makeelement(span_tag_name, attrib={'class': KOBO_SPAN_CLASS, 'id': f'kobo.{paranum}.{segnum}'}) def wrap_text_in_spans(text: str, parent: etree.Element, after_child: etree.ElementBase, lang: str) -> str | None: nonlocal increment_next_para, paranum, segnum @@ -163,6 +164,29 @@ def add_kobo_spans(inner, root_lang): wrap_text_in_spans(node.text, node, None, node_lang) +def unwrap(span: etree.Element) -> None: + p = span.getparent() + idx = p.index(span) + del p[idx] + if len(span): + p.insert(idx, span[0]) + else: + text = span.text + (span.tail or '') + if idx > 0: + prev = p[idx-1] + prev.tail = (prev.tail or '') + text + else: + p.text = (p.text or '') + text + + +def remove_kobo_spans(body: etree.Element) -> bool: + found = False + for span in XPath(f'//h:span[@class="{KOBO_SPAN_CLASS}" and starts-with(@id, "kobo.")]')(body): + unwrap(span) + found = True + return found + + def add_kobo_markup_to_html(root, metadata_lang): root_lang = canonicalize_lang(lang_for_elem(root, canonicalize_lang(metadata_lang or get_lang())) or 'en') add_style(root) @@ -175,6 +199,7 @@ def remove_kobo_markup_from_html(root): remove_kobo_styles(root) for body in XPath('./h:body')(root): unwrap_body_contents(body) + remove_kobo_spans(body) def serialize_html(root) -> bytes: diff --git a/src/calibre/ebooks/oeb/polish/tests/kepubify.py b/src/calibre/ebooks/oeb/polish/tests/kepubify.py index b7dbbe037c..623bf2a0c8 100644 --- a/src/calibre/ebooks/oeb/polish/tests/kepubify.py +++ b/src/calibre/ebooks/oeb/polish/tests/kepubify.py @@ -2,7 +2,8 @@ # License: GPLv3 Copyright: 2025, Kovid Goyal -from calibre.ebooks.oeb.polish.kepubify import kepubify_html_data, serialize_html +from calibre.ebooks.oeb.polish.kepubify import kepubify_html_data, remove_kobo_markup_from_html, serialize_html +from calibre.ebooks.oeb.polish.parsing import parse from calibre.ebooks.oeb.polish.tests.base import BaseTest @@ -56,8 +57,11 @@ div#book-inner { margin-top: 0; margin-bottom: 0; }
', }.items(): - with self.subTest(src=src): - root = kepubify_html_data(src) - actual = serialize_html(root).decode('utf-8') - actual = actual[len(prefix):-len(suffix)] - self.assertEqual(expected, actual) + root = kepubify_html_data(src) + actual = serialize_html(root).decode('utf-8') + actual = actual[len(prefix):-len(suffix)] + self.assertEqual(expected, actual) + expected = serialize_html(parse(src)).decode('utf-8') + remove_kobo_markup_from_html(root) + actual = serialize_html(root).decode('utf-8') + self.assertEqual(expected, actual)