From 41b0ca1461261a44c8c382980db872f13a17799c Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Mon, 17 Feb 2025 20:56:32 +0530
Subject: [PATCH] More work on kepubify

---
 src/calibre/ebooks/oeb/polish/kepubify.py     | 91 +++++++++++++++++--
 .../ebooks/oeb/polish/tests/kepubify.py       | 19 ++++
 2 files changed, 103 insertions(+), 7 deletions(-)
 create mode 100644 src/calibre/ebooks/oeb/polish/tests/kepubify.py

diff --git a/src/calibre/ebooks/oeb/polish/kepubify.py b/src/calibre/ebooks/oeb/polish/kepubify.py
index 6206c989ed..41c9c15f0b 100644
--- a/src/calibre/ebooks/oeb/polish/kepubify.py
+++ b/src/calibre/ebooks/oeb/polish/kepubify.py
@@ -12,16 +12,27 @@
 #     * Cover marking in the OPF
 #     * Markup cleanup (remove various things that trip up the Kobo renderer)
 
+import re
+
 from calibre.ebooks.oeb.base import XHTML, XPath
-from calibre.ebooks.oeb.parse_utils import merge_multiple_html_heads_and_bodies
+from calibre.ebooks.oeb.parse_utils import barename, merge_multiple_html_heads_and_bodies
+from calibre.ebooks.oeb.polish.tts import lang_for_elem
 from calibre.ebooks.oeb.polish.utils import extract, insert_self_closing
+from calibre.spell.break_iterator import sentence_positions
+from calibre.utils.localization import canonicalize_lang, get_lang
 
 KOBO_STYLE_HACKS = 'kobostylehacks'
 OUTER_DIV_ID = 'book-columns'
 INNER_DIV_ID = 'book-inner'
+SKIPPED_TAGS = frozenset((
+    'script', 'style', 'atom', 'pre', 'audio', 'video', 'svg', 'math'
+))
+BLOCK_TAGS = frozenset((
+    'p', 'ol', 'ul', 'table', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
+))
 
 
-def add_style(root, css='div#book-inner { margin-top: 0; margin-bottom: 0;}', cls=KOBO_STYLE_HACKS) -> bool:
+def add_style(root, css='div#book-inner { margin-top: 0; margin-bottom: 0; }', cls=KOBO_STYLE_HACKS) -> bool:
 
     def add(parent):
         e = parent.makeelement(XHTML('style'), type='text/css')
@@ -44,7 +55,7 @@ def remove_kobo_styles(root):
 
 
 def wrap_body_contents(body):
-    # To match official KEPUBs. Kobo wraps the body with two div tags,
+    # Kobo wraps the body with two div tags,
     # div#book-columns > div#book-inner to provide a target for applying
     # pagination styles.
     for elem in XPath(f'//*[@id="{OUTER_DIV_ID}" or @id={INNER_DIV_ID}]')(body):
@@ -58,6 +69,7 @@ def wrap_body_contents(body):
         inner.append(child)
     del body[:]
     body.append(outer)
+    return inner
 
 
 def unwrap_body_contents(body):
@@ -72,10 +84,71 @@ def unwrap_body_contents(body):
     body.text = text
 
 
-def add_kobo_markup_to_html(root):
+def add_kobo_spans(inner, root_lang):
+    stack = []
+    a, p = stack.append, stack.pop
+    a((inner, None, barename(inner.tag).lower(), lang_for_elem(inner, root_lang)))
+    paranum, segnum = 0, 0
+    increment_next_para = True
+    span_tag_name = XHTML('span')
+    leading_whitespace_pat = re.compile(r'^\s+')
+
+    def kobo_span(parent):
+        nonlocal paranum, segnum
+        segnum += 1
+        return parent.makeelement(span_tag_name, attrib={'class': 'koboSpan', 'id': f'kobo.{paranum}.{segnum}'})
+
+    def wrap_text_in_spans(text: str, parent, at: int, lang: str) -> str | None:
+        nonlocal increment_next_para, paranum, segnum
+        if increment_next_para:
+            paranum += 1
+            segnum = 0
+            increment_next_para = False
+        stripped = leading_whitespace_pat.sub('', text)
+        ws = None
+        if num := len(text) - len(stripped):
+            ws = text[:num]
+        if at:
+            parent[at-1].tail = ws
+        else:
+            parent.text = ws
+        for pos, sz in sentence_positions(stripped, lang):
+            s = kobo_span(parent)
+            s.text = stripped[pos:pos+sz]
+            parent.insert(at, s)
+
+    while stack:
+        node, parent, tagname, node_lang = p()
+        if parent is not None:
+            wrap_text_in_spans(node, parent, tagname, node_lang)
+            continue
+        if not increment_next_para and tagname in BLOCK_TAGS:
+            increment_next_para = True
+        if node.text:
+            wrap_text_in_spans(node.text, node, 0, node_lang)
+        for i, child in enumerate(reversed(node)):
+            i = len(node) - 1 - i
+            if child.tail:
+                a((child.tail, node, i + 1, node_lang))
+            if isinstance(child.tag, 'str'):
+                child_name = barename(child.tag).lower()
+                if child_name == 'img':
+                    increment_next_para = False
+                    paranum += 1
+                    segnum = 0
+                    w = kobo_span(node)
+                    w.append(child)
+                    node[i] = w
+                elif child_name not in SKIPPED_TAGS:
+                    a((child, None, child_name, lang_for_elem(child, node_lang)))
+
+
+def add_kobo_markup_to_html(root, metadata_lang):
+    root_lang = canonicalize_lang(lang_for_elem(root, canonicalize_lang(metadata_lang or get_lang())) or 'en')
     add_style(root)
     for body in XPath('./h:body')(root):
-        wrap_body_contents(body)
+        inner = wrap_body_contents(body)
+        add_kobo_spans(inner, lang_for_elem(body, root_lang))
 
 
 def remove_kobo_markup_from_html(root):
@@ -84,7 +157,11 @@ def remove_kobo_markup_from_html(root):
         unwrap_body_contents(body)
 
 
-def kepubify_html(root):
+def kepubify_html(root, metadata_lang='en'):
     remove_kobo_markup_from_html(root)
     merge_multiple_html_heads_and_bodies(root)
-    add_kobo_markup_to_html(root)
+    add_kobo_markup_to_html(root, metadata_lang)
+
+
+def kepubify(container):
+    lang = container.mi.language
diff --git a/src/calibre/ebooks/oeb/polish/tests/kepubify.py b/src/calibre/ebooks/oeb/polish/tests/kepubify.py
new file mode 100644
index 0000000000..de9d40cbf3
--- /dev/null
+++ b/src/calibre/ebooks/oeb/polish/tests/kepubify.py
@@ -0,0 +1,19 @@
+#!/usr/bin/env python
+# License: GPLv3 Copyright: 2025, Kovid Goyal <kovid at kovidgoyal.net>
+
+
+from calibre.ebooks.oeb.base import serialize
+from calibre.ebooks.oeb.polish.kepubify import kepubify_html
+from calibre.ebooks.oeb.polish.parsing import parse_html5 as parse
+from calibre.ebooks.oeb.polish.tests.base import BaseTest
+
+
+class KepubifyTests(BaseTest):
+
+    def test_kepubify_html(self):
+        for src, expected in {
+        }.items():
+            root = parse(src)
+            kepubify_html(root)
+            actual = serialize(root, 'text/html').decode('utf-8')
+            self.assertEqual(expected, actual)