Code to unwrap kobo spans

This commit is contained in:
Kovid Goyal 2025-02-20 14:02:32 +05:30
parent c7557b23b2
commit 5e9f0cc563
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 36 additions and 7 deletions

View File

@ -27,6 +27,7 @@ from calibre.utils.localization import canonicalize_lang, get_lang
# Identifier for the kobo style hacks (its usage is outside this hunk —
# presumably the id of the injected <style> element; TODO confirm).
KOBO_STYLE_HACKS = 'kobostylehacks'
# Element ids of the outer and inner wrapper <div>s; the test expectations
# reference ``div#book-inner`` in the generated stylesheet.
OUTER_DIV_ID = 'book-columns'
INNER_DIV_ID = 'book-inner'
# Value of the ``class`` attribute set on generated kobo spans and matched
# again when those spans are removed.
KOBO_SPAN_CLASS = 'koboSpan'
# Immutable set of tag names (including the empty name).
# NOTE(review): presumably elements whose text is not wrapped in kobo spans —
# confirm against add_kobo_spans.
SKIPPED_TAGS = frozenset((
'', 'script', 'style', 'atom', 'pre', 'audio', 'video', 'svg', 'math'
))
@ -103,7 +104,7 @@ def add_kobo_spans(inner, root_lang):
def kobo_span(parent):
nonlocal paranum, segnum
segnum += 1
return parent.makeelement(span_tag_name, attrib={'class': 'koboSpan', 'id': f'kobo.{paranum}.{segnum}'})
return parent.makeelement(span_tag_name, attrib={'class': KOBO_SPAN_CLASS, 'id': f'kobo.{paranum}.{segnum}'})
def wrap_text_in_spans(text: str, parent: etree.Element, after_child: etree.ElementBase, lang: str) -> str | None:
nonlocal increment_next_para, paranum, segnum
@ -163,6 +164,29 @@ def add_kobo_spans(inner, root_lang):
wrap_text_in_spans(node.text, node, None, node_lang)
def unwrap(span: etree.Element) -> None:
    """Remove ``span`` from the tree, leaving its contents in its place.

    All children of the span are moved into the parent at the span's former
    position, in order. The span's own text and tail are merged into the
    neighbouring text so that no character data is lost.

    :param span: The element to unwrap; it must have a parent.
    """
    p = span.getparent()
    idx = p.index(span)
    children = list(span)
    # Either of these may be None on an lxml element.
    lead = span.text or ''
    trail = span.tail or ''
    # The tail is attached to the span itself, so deleting the span takes the
    # tail with it; it was captured above so it can be re-attached below.
    del p[idx]

    def append_before(text: str) -> None:
        # Text located at position idx belongs to the tail of the preceding
        # sibling, or to the parent's text when there is no such sibling.
        if idx > 0:
            prev = p[idx-1]
            prev.tail = (prev.tail or '') + text
        else:
            p.text = (p.text or '') + text

    if not children:
        append_before(lead + trail)
        return

    # Only touch neighbouring text when there is something to add, so that
    # otherwise empty elements are left exactly as they were.
    if lead:
        append_before(lead)
    for offset, child in enumerate(children):
        # insert() moves the child (together with its own tail) out of span.
        p.insert(idx + offset, child)
    if trail:
        last = children[-1]
        last.tail = (last.tail or '') + trail
def remove_kobo_spans(body: etree.Element) -> bool:
    """Unwrap every kobo span found beneath ``body``.

    A kobo span is an ``h:span`` element whose ``class`` attribute equals
    KOBO_SPAN_CLASS and whose ``id`` starts with ``kobo.``.

    :param body: The element whose descendants are searched.
    :return: True if at least one span was found and unwrapped.
    """
    # Use a relative path (``.//``): a leading ``//`` is evaluated from the
    # document root, so it would ignore the ``body`` context node and match
    # spans anywhere in the document.
    expr = f'.//h:span[@class="{KOBO_SPAN_CLASS}" and starts-with(@id, "kobo.")]'
    found = False
    for span in XPath(expr)(body):
        unwrap(span)
        found = True
    return found
def add_kobo_markup_to_html(root, metadata_lang):
root_lang = canonicalize_lang(lang_for_elem(root, canonicalize_lang(metadata_lang or get_lang())) or 'en')
add_style(root)
@ -175,6 +199,7 @@ def remove_kobo_markup_from_html(root):
remove_kobo_styles(root)
for body in XPath('./h:body')(root):
unwrap_body_contents(body)
remove_kobo_spans(body)
def serialize_html(root) -> bytes:

View File

@ -2,7 +2,8 @@
# License: GPLv3 Copyright: 2025, Kovid Goyal <kovid at kovidgoyal.net>
from calibre.ebooks.oeb.polish.kepubify import kepubify_html_data, serialize_html
from calibre.ebooks.oeb.polish.kepubify import kepubify_html_data, remove_kobo_markup_from_html, serialize_html
from calibre.ebooks.oeb.polish.parsing import parse
from calibre.ebooks.oeb.polish.tests.base import BaseTest
@ -56,8 +57,11 @@ div#book-inner { margin-top: 0; margin-bottom: 0; }</style></head><body><div id=
'<div><script>1 < 2 & 3</script>': # escaping with cdata note that kepubify doesnt do this
'<div><script><![CDATA[1 < 2 & 3]]></script></div>',
}.items():
with self.subTest(src=src):
root = kepubify_html_data(src)
actual = serialize_html(root).decode('utf-8')
actual = actual[len(prefix):-len(suffix)]
self.assertEqual(expected, actual)
expected = serialize_html(parse(src)).decode('utf-8')
remove_kobo_markup_from_html(root)
actual = serialize_html(root).decode('utf-8')
self.assertEqual(expected, actual)