From 41b0ca1461261a44c8c382980db872f13a17799c Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 17 Feb 2025 20:56:32 +0530 Subject: [PATCH] More work on kepubify --- src/calibre/ebooks/oeb/polish/kepubify.py | 91 +++++++++++++++++-- .../ebooks/oeb/polish/tests/kepubify.py | 19 ++++ 2 files changed, 103 insertions(+), 7 deletions(-) create mode 100644 src/calibre/ebooks/oeb/polish/tests/kepubify.py diff --git a/src/calibre/ebooks/oeb/polish/kepubify.py b/src/calibre/ebooks/oeb/polish/kepubify.py index 6206c989ed..41c9c15f0b 100644 --- a/src/calibre/ebooks/oeb/polish/kepubify.py +++ b/src/calibre/ebooks/oeb/polish/kepubify.py @@ -12,16 +12,27 @@ # * Cover marking in the OPF # * Markup cleanup (remove various things that trip up the Kobo renderer) +import re + from calibre.ebooks.oeb.base import XHTML, XPath -from calibre.ebooks.oeb.parse_utils import merge_multiple_html_heads_and_bodies +from calibre.ebooks.oeb.parse_utils import barename, merge_multiple_html_heads_and_bodies +from calibre.ebooks.oeb.polish.tts import lang_for_elem from calibre.ebooks.oeb.polish.utils import extract, insert_self_closing +from calibre.spell.break_iterator import sentence_positions +from calibre.utils.localization import canonicalize_lang, get_lang KOBO_STYLE_HACKS = 'kobostylehacks' OUTER_DIV_ID = 'book-columns' INNER_DIV_ID = 'book-inner' +SKIPPED_TAGS = frozenset(( + 'script', 'style', 'atom', 'pre', 'audio', 'video', 'svg', 'math' +)) +BLOCK_TAGS = frozenset(( + 'p', 'ol', 'ul', 'table', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', +)) -def add_style(root, css='div#book-inner { margin-top: 0; margin-bottom: 0;}', cls=KOBO_STYLE_HACKS) -> bool: +def add_style(root, css='div#book-inner { margin-top: 0; margin-bottom: 0; }', cls=KOBO_STYLE_HACKS) -> bool: def add(parent): e = parent.makeelement(XHTML('style'), type='text/css') @@ -44,7 +55,7 @@ def remove_kobo_styles(root): def wrap_body_contents(body): - # To match official KEPUBs. Kobo wraps the body with two div tags, + # Kobo wraps the body with two div tags, # div#book-columns > div#book-inner to provide a target for applying # pagination styles. for elem in XPath(f'//*[@id="{OUTER_DIV_ID}" or @id={INNER_DIV_ID}]')(body): @@ -58,6 +69,7 @@ def wrap_body_contents(body): inner.append(child) del body[:] body.append(outer) + return inner def unwrap_body_contents(body): @@ -72,10 +84,71 @@ def unwrap_body_contents(body): body.text = text -def add_kobo_markup_to_html(root): +def add_kobo_spans(inner, root_lang): + stack = [] + a, p = stack.append, stack.pop + a((inner, None, barename(inner.tag).lower(), lang_for_elem(inner, root_lang))) + paranum, segnum = 0, 0 + increment_next_para = True + span_tag_name = XHTML('span') + leading_whitespace_pat = re.compile(r'^\s+') + + def kobo_span(parent): + nonlocal paranum, segnum + segnum += 1 + return parent.makeelement(span_tag_name, attrib={'class': 'koboSpan', 'id': f'kobo.{paranum}.{segnum}'}) + + def wrap_text_in_spans(text: str, parent, at: int, lang: str) -> str | None: + nonlocal increment_next_para, paranum, segnum + if increment_next_para: + paranum += 1 + segnum = 0 + increment_next_para = False + stripped = leading_whitespace_pat.sub('', text) + ws = None + if num := len(text) - len(stripped): + ws = text[:num] + if at: + parent[at-1].tail = ws + else: + parent.text = ws + for pos, sz in sentence_positions(stripped, lang): + s = kobo_span(parent) + s.text = stripped[pos:pos+sz] + parent.insert(at, s) + + while stack: + node, parent, tagname, node_lang = p() + if parent is not None: + wrap_text_in_spans(node, parent, tagname, node_lang) + continue + if not increment_next_para and tagname in BLOCK_TAGS: + increment_next_para = True + if node.text: + wrap_text_in_spans(node.text, node, 0, node_lang) + for i, child in enumerate(reversed(node)): + i = len(node) - 1 - i + if child.tail: + a((child.tail, node, i + 1, node_lang)) + if isinstance(child.tag, 'str'): + child_name = barename(child.tag).lower() + if child_name == 'img': + increment_next_para = False + paranum += 1 + segnum = 0 + w = kobo_span(node) + w.append(child) + node[i] = w + elif child_name not in SKIPPED_TAGS: + a((child, None, child_name, lang_for_elem(child, node_lang))) + + +def add_kobo_markup_to_html(root, metadata_lang): + root_lang = canonicalize_lang(lang_for_elem(root, canonicalize_lang(metadata_lang or get_lang())) or 'en') add_style(root) for body in XPath('./h:body')(root): - wrap_body_contents(body) + inner = wrap_body_contents(body) + add_kobo_spans(inner, lang_for_elem(body, root_lang)) def remove_kobo_markup_from_html(root): @@ -84,7 +157,11 @@ def remove_kobo_markup_from_html(root): unwrap_body_contents(body) -def kepubify_html(root): +def kepubify_html(root, metadata_lang='en'): remove_kobo_markup_from_html(root) merge_multiple_html_heads_and_bodies(root) - add_kobo_markup_to_html(root) + add_kobo_markup_to_html(root, metadata_lang) + + +def kepubify(container): + lang = container.mi.language diff --git a/src/calibre/ebooks/oeb/polish/tests/kepubify.py b/src/calibre/ebooks/oeb/polish/tests/kepubify.py new file mode 100644 index 0000000000..de9d40cbf3 --- /dev/null +++ b/src/calibre/ebooks/oeb/polish/tests/kepubify.py @@ -0,0 +1,19 @@ +#!/usr/bin/env python +# License: GPLv3 Copyright: 2025, Kovid Goyal + + +from calibre.ebooks.oeb.base import serialize +from calibre.ebooks.oeb.polish.kepubify import kepubify_html +from calibre.ebooks.oeb.polish.parsing import parse_html5 as parse +from calibre.ebooks.oeb.polish.tests.base import BaseTest + + +class KepubifyTests(BaseTest): + + def test_kepubify_html(self): + for src, expected in { + }.items(): + root = parse(src) + kepubify_html(root) + actual = serialize(root, 'text/html').decode('utf-8') + self.assertEqual(expected, actual)