Code to unwrap kobo spans

2025-07-09 03:04:10 -04:00 · 2025-02-20 14:02:32 +05:30 · 2025-02-20 14:02:32 +05:30 · 5e9f0cc563
commit 5e9f0cc563
parent c7557b23b2
2 changed files with 36 additions and 7 deletions
--- a/src/calibre/ebooks/oeb/polish/kepubify.py
+++ b/src/calibre/ebooks/oeb/polish/kepubify.py
@ -27,6 +27,7 @@ from calibre.utils.localization import canonicalize_lang, get_lang
 KOBO_STYLE_HACKS = 'kobostylehacks'
 OUTER_DIV_ID = 'book-columns'
 INNER_DIV_ID = 'book-inner'
+KOBO_SPAN_CLASS = 'koboSpan'
 SKIPPED_TAGS = frozenset((
    '', 'script', 'style', 'atom', 'pre', 'audio', 'video', 'svg', 'math'
 ))
@ -103,7 +104,7 @@ def add_kobo_spans(inner, root_lang):
    def kobo_span(parent):
        nonlocal paranum, segnum
        segnum += 1
-        return parent.makeelement(span_tag_name, attrib={'class': 'koboSpan', 'id': f'kobo.{paranum}.{segnum}'})
+        return parent.makeelement(span_tag_name, attrib={'class': KOBO_SPAN_CLASS, 'id': f'kobo.{paranum}.{segnum}'})

    def wrap_text_in_spans(text: str, parent: etree.Element, after_child: etree.ElementBase, lang: str) -> str | None:
        nonlocal increment_next_para, paranum, segnum
@ -163,6 +164,29 @@ def add_kobo_spans(inner, root_lang):
            wrap_text_in_spans(node.text, node, None, node_lang)


+def unwrap(span: etree.Element) -> None:
+    '''
+    Remove ``span`` from the tree, leaving its contents in its place.
+
+    The text of the span, all of its child elements and its tail end up at
+    the position the span occupied in its parent, so no content is lost.
+    ``span`` must have a parent element.
+    '''
+    p = span.getparent()
+    idx = p.index(span)
+    children = tuple(span)
+    # The tail stays attached to the detached span, so it can still be read below
+    del p[idx]
+    if children:
+        # Re-insert every child, in order, where the span used to be
+        for offset, child in enumerate(children):
+            p.insert(idx + offset, child)
+        if span.tail:
+            last = children[-1]
+            last.tail = (last.tail or '') + span.tail
+        text = span.text
+        if not text:
+            return
+    else:
+        # text can be None for an empty span, so guard it like the tail
+        text = (span.text or '') + (span.tail or '')
+    # Leading text belongs to whatever now precedes the old span position
+    if idx > 0:
+        prev = p[idx-1]
+        prev.tail = (prev.tail or '') + text
+    else:
+        p.text = (p.text or '') + text
+
+
+def remove_kobo_spans(body: etree.Element) -> bool:
+    '''
+    Unwrap every kobo span found below ``body``.
+
+    A kobo span is a ``span`` element whose class is exactly KOBO_SPAN_CLASS
+    and whose id starts with ``kobo.``. Returns True if at least one such
+    span was found and unwrapped, False otherwise.
+    '''
+    found = False
+    # Use .// rather than //: a leading // is evaluated from the document
+    # root, which would ignore the body that was passed in
+    for span in XPath(f'.//h:span[@class="{KOBO_SPAN_CLASS}" and starts-with(@id, "kobo.")]')(body):
+        unwrap(span)
+        found = True
+    return found
+
+
 def add_kobo_markup_to_html(root, metadata_lang):
    root_lang = canonicalize_lang(lang_for_elem(root, canonicalize_lang(metadata_lang or get_lang())) or 'en')
    add_style(root)
@ -175,6 +199,7 @@ def remove_kobo_markup_from_html(root):
    remove_kobo_styles(root)
    for body in XPath('./h:body')(root):
        unwrap_body_contents(body)
+        remove_kobo_spans(body)


 def serialize_html(root) -> bytes:
--- a/src/calibre/ebooks/oeb/polish/tests/kepubify.py
+++ b/src/calibre/ebooks/oeb/polish/tests/kepubify.py
@ -2,7 +2,8 @@
 # License: GPLv3 Copyright: 2025, Kovid Goyal <kovid at kovidgoyal.net>


-from calibre.ebooks.oeb.polish.kepubify import kepubify_html_data, serialize_html
+from calibre.ebooks.oeb.polish.kepubify import kepubify_html_data, remove_kobo_markup_from_html, serialize_html
+from calibre.ebooks.oeb.polish.parsing import parse
 from calibre.ebooks.oeb.polish.tests.base import BaseTest


@ -56,8 +57,11 @@ div#book-inner { margin-top: 0; margin-bottom: 0; }</style></head><body><div id=
            '<div><script>1 < 2 & 3</script>':  # escaping with cdata note that kepubify doesnt do this
            '<div><script><![CDATA[1 < 2 & 3]]></script></div>',
        }.items():
-            with self.subTest(src=src):
            root = kepubify_html_data(src)
            actual = serialize_html(root).decode('utf-8')
            actual = actual[len(prefix):-len(suffix)]
            self.assertEqual(expected, actual)
+            expected = serialize_html(parse(src)).decode('utf-8')
+            remove_kobo_markup_from_html(root)
+            actual = serialize_html(root).decode('utf-8')
+            self.assertEqual(expected, actual)