From 5e9f0cc563dc35a0e95eb5976ae021431f8b5bbe Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Thu, 20 Feb 2025 14:02:32 +0530
Subject: [PATCH] Code to unwrap kobo spans

---
 src/calibre/ebooks/oeb/polish/kepubify.py     | 27 ++++++++++++++++++-
 .../ebooks/oeb/polish/tests/kepubify.py       | 16 ++++++-----
 2 files changed, 36 insertions(+), 7 deletions(-)
diff --git a/src/calibre/ebooks/oeb/polish/kepubify.py b/src/calibre/ebooks/oeb/polish/kepubify.py
index 20193d1657..164846c8ca 100644
--- a/src/calibre/ebooks/oeb/polish/kepubify.py
+++ b/src/calibre/ebooks/oeb/polish/kepubify.py
@@ -27,6 +27,7 @@ from calibre.utils.localization import canonicalize_lang, get_lang
 KOBO_STYLE_HACKS = 'kobostylehacks'
 OUTER_DIV_ID = 'book-columns'
 INNER_DIV_ID = 'book-inner'
+KOBO_SPAN_CLASS = 'koboSpan'
 SKIPPED_TAGS = frozenset((
     '', 'script', 'style', 'atom', 'pre', 'audio', 'video', 'svg', 'math'
 ))
@@ -103,7 +104,7 @@ def add_kobo_spans(inner, root_lang):
     def kobo_span(parent):
         nonlocal paranum, segnum
         segnum += 1
-        return parent.makeelement(span_tag_name, attrib={'class': 'koboSpan', 'id': f'kobo.{paranum}.{segnum}'})
+        return parent.makeelement(span_tag_name, attrib={'class': KOBO_SPAN_CLASS, 'id': f'kobo.{paranum}.{segnum}'})
 
     def wrap_text_in_spans(text: str, parent: etree.Element, after_child: etree.ElementBase, lang: str) -> str | None:
         nonlocal increment_next_para, paranum, segnum
@@ -163,6 +164,29 @@ def add_kobo_spans(inner, root_lang):
             wrap_text_in_spans(node.text, node, None, node_lang)
 
 
+def unwrap(span: etree.Element) -> None:
+    """Replace span in its parent with its first child, or with its own text if it has no children."""
+    p = span.getparent()
+    idx = p.index(span)
+    del p[idx]  # lxml removes span.tail together with span, so it is re-attached below
+    text = (span.text or '') + (span.tail or '')  # .text is None for an empty span
+    if len(span):
+        span[0].tail = (span[0].tail or '') + (span.tail or '')  # keep text that followed the span
+        p.insert(idx, span[0])
+    elif idx > 0:
+        p[idx-1].tail = (p[idx-1].tail or '') + text
+    else:
+        p.text = (p.text or '') + text
+
+
+def remove_kobo_spans(body: etree.Element) -> bool:
+    """Unwrap every Kobo span below body. Return True if at least one span was found."""
+    spans = XPath(f'.//h:span[@class="{KOBO_SPAN_CLASS}" and starts-with(@id, "kobo.")]')(body)
+    for span in spans:  # assumes XPath() returns a list (as lxml does), so the tree can be mutated here
+        unwrap(span)
+    return bool(spans)
+
+
 def add_kobo_markup_to_html(root, metadata_lang):
     root_lang = canonicalize_lang(lang_for_elem(root, canonicalize_lang(metadata_lang or get_lang())) or 'en')
     add_style(root)
@@ -175,6 +199,7 @@ def remove_kobo_markup_from_html(root):
     remove_kobo_styles(root)
     for body in XPath('./h:body')(root):
         unwrap_body_contents(body)
+        remove_kobo_spans(body)
 
 
 def serialize_html(root) -> bytes:
diff --git a/src/calibre/ebooks/oeb/polish/tests/kepubify.py b/src/calibre/ebooks/oeb/polish/tests/kepubify.py
index b7dbbe037c..623bf2a0c8 100644
--- a/src/calibre/ebooks/oeb/polish/tests/kepubify.py
+++ b/src/calibre/ebooks/oeb/polish/tests/kepubify.py
@@ -2,7 +2,8 @@
 # License: GPLv3 Copyright: 2025, Kovid Goyal <kovid at kovidgoyal.net>
 
 
-from calibre.ebooks.oeb.polish.kepubify import kepubify_html_data, serialize_html
+from calibre.ebooks.oeb.polish.kepubify import kepubify_html_data, remove_kobo_markup_from_html, serialize_html
+from calibre.ebooks.oeb.polish.parsing import parse
 from calibre.ebooks.oeb.polish.tests.base import BaseTest
 
 
@@ -56,8 +57,11 @@ div#book-inner { margin-top: 0; margin-bottom: 0; }</style></head><body><div id=
             '<div><script>1 < 2 & 3</script>':  # escaping with cdata note that kepubify doesnt do this
             '<div><script><![CDATA[1 < 2 & 3]]></script></div>',
         }.items():
-            with self.subTest(src=src):
-                root = kepubify_html_data(src)
-                actual = serialize_html(root).decode('utf-8')
-                actual = actual[len(prefix):-len(suffix)]
-                self.assertEqual(expected, actual)
+            root = kepubify_html_data(src)
+            actual = serialize_html(root).decode('utf-8')
+            actual = actual[len(prefix):-len(suffix)]
+            self.assertEqual(expected, actual)
+            expected = serialize_html(parse(src)).decode('utf-8')
+            remove_kobo_markup_from_html(root)
+            actual = serialize_html(root).decode('utf-8')
+            self.assertEqual(expected, actual)