kepubify: Optionally move leading and trailing whitespace out of the kobo spans

2025-07-09 03:04:10 -04:00 · 2025-05-06 11:55:47 +05:30 · 2025-05-06 11:55:47 +05:30 · 6b67ed1e66
commit 6b67ed1e66
parent 3d48a0a94e
2 changed files with 84 additions and 36 deletions
--- a/src/calibre/ebooks/oeb/polish/kepubify.py
+++ b/src/calibre/ebooks/oeb/polish/kepubify.py
@ -67,6 +67,7 @@ class Options(NamedTuple):
    hyphenation_css: str = ''
    remove_widows_and_orphans: bool = False
    remove_at_page_rules: bool = False
+    prefer_justification: bool = False

    for_removal: bool = False

@ -158,14 +159,19 @@ def unwrap_body_contents(body):
    body.text = text


-def add_kobo_spans(inner, root_lang):
+def add_kobo_spans(inner, root_lang, prefer_justification=False):
    stack = []
    a, p = stack.append, stack.pop
    a((inner, None, barename(inner.tag).lower(), lang_for_elem(inner, root_lang)))
    paranum, segnum = 0, 0
    increment_next_para = True
    span_tag_name = XHTML('span')
-    leading_whitespace_pat = re.compile(r'^\s+')
+    lstrip_pat = re.compile(r'^\s+')
+    rstrip_pat = re.compile(r'\s+$')
+    def lstrip(x):
+        return lstrip_pat.sub('', x)
+    def rstrip(x):
+        return rstrip_pat.sub('', x)

    def kobo_span(parent):
        nonlocal paranum, segnum
@ -174,30 +180,44 @@ def add_kobo_spans(inner, root_lang):

    def wrap_text_in_spans(text: str, parent: etree.Element, after_child: etree.ElementBase, lang: str) -> str | None:
        nonlocal increment_next_para, paranum, segnum
-        if increment_next_para:
-            paranum += 1
-            segnum = 0
-            increment_next_para = False
+        text_with_leading_whitespace_removed = lstrip(text)
        try:
            at = 0 if after_child is None else parent.index(after_child) + 1
        except ValueError:  # wrapped child
            at = parent.index(after_child.getparent()) + 1
-        stripped = leading_whitespace_pat.sub('', text)
-        if not at and not stripped and not len(parent):
-            stripped = text
-        ws = None
-        if num := len(text) - len(stripped):
-            ws = text[:num]
-        before = None if stripped else ws
+
+        if increment_next_para:
+            paranum += 1
+            segnum = 0
+            increment_next_para = False
+
+        if not at and not text_with_leading_whitespace_removed and not len(parent):
+            # block tag with only whitespace
+            s = kobo_span(parent)
+            s.text = text
+            parent.text = None
+            parent.append(s)
+            return
+        leading_whitespace = None
+        if num := len(text) - len(text_with_leading_whitespace_removed):
+            leading_whitespace = text[:num]
+        before = None if text_with_leading_whitespace_removed and not prefer_justification else leading_whitespace
        if at:
            parent[at-1].tail = before
        else:
            parent.text = before
-        if stripped:
-            text = (ws + stripped) if ws else stripped
+        if not text_with_leading_whitespace_removed:
+            return
+        if not leading_whitespace or prefer_justification:
+            text = text_with_leading_whitespace_removed
        for pos, sz in sentence_positions(text, lang):
            s = kobo_span(parent)
-                s.text = text[pos:pos+sz]
+            s.text = inside_span = text[pos:pos+sz]
+            if prefer_justification:
+                inside_span_without_trailing_whitespace = rstrip(inside_span)
+                if tail_len := len(inside_span) - len(inside_span_without_trailing_whitespace):
+                    s.tail = inside_span[-tail_len:]
+                    s.text = inside_span_without_trailing_whitespace
            parent.insert(at, s)
            at += 1

@ -263,7 +283,7 @@ def add_kobo_markup_to_html(root: etree.Element, kobo_js_href: str, opts: Option
    add_style_and_script(root, kobo_js_href, opts)
    for body in XPath('./h:body')(root):
        inner = wrap_body_contents(body)
-        add_kobo_spans(inner, lang_for_elem(body, root_lang))
+        add_kobo_spans(inner, lang_for_elem(body, root_lang), opts.prefer_justification)


 def remove_kobo_markup_from_html(root):
--- a/src/calibre/ebooks/oeb/polish/tests/kepubify.py
+++ b/src/calibre/ebooks/oeb/polish/tests/kepubify.py
@ -57,6 +57,19 @@ class KepubifyTests(BaseTest):
 div#book-inner {{ margin-top: 0; margin-bottom: 0; }}</style><script type="text/javascript" src="{KOBO_JS_NAME}"/></head>\
 <body><div id="book-columns"><div id="book-inner">'''
        suffix =  '</div></div></body></html>'
+
+        def perform(src, expected, prefer_justification):
+            opts = Options(remove_widows_and_orphans=True, remove_at_page_rules=True, prefer_justification=prefer_justification)
+            root = kepubify_html_data(src, KOBO_JS_NAME, opts)
+            actual = serialize_html(root).decode('utf-8')
+            actual = actual[len(prefix):-len(suffix)]
+            self.assertEqual(expected, actual, f'\n\nText:\n{src}\n\nExpected:\n{expected}\n\nActual:\n{actual}')
+            expected = serialize_html(parse(src)).decode('utf-8')
+            opts = opts._replace(for_removal=True)
+            kepubify_parsed_html(root, KOBO_JS_NAME, opts)
+            actual = serialize_html(root).decode('utf-8')
+            self.assertEqual(expected, actual, f'\n\nText:\n{src}\n\nExpected:\n{expected}\n\nActual:\n{actual}')
+
        for src, expected in {
            # basics
            '<p>one</p>  <p>\xa0</p><p>\xa0<i>a</i></p>':
@ -64,7 +77,7 @@ div#book-inner {{ margin-top: 0; margin-bottom: 0; }}</style><script type="text/
            '<p>&#160;<i><span class="koboSpan" id="kobo.3.1">a</span></i></p>',

            '<p>Simple sentences. In a single paragraph.'
-            '<p>A sentence <i>with <b>nested</b>, tailed</i> formatting. Another.':
+            '<p>A sentence <i>with <b>nested</b>, tailed</i> formatting. Another.': (

            '<p><span class="koboSpan" id="kobo.1.1">Simple sentences. </span><span class="koboSpan" id="kobo.1.2">In a single paragraph.</span></p>'
            '<p><span class="koboSpan" id="kobo.2.1">A sentence </span><i><span class="koboSpan" id="kobo.2.2">with </span>'
@ -72,6 +85,14 @@ div#book-inner {{ margin-top: 0; margin-bottom: 0; }}</style><script type="text/
            '<span class="koboSpan" id="kobo.2.5"> formatting. </span>'
            '<span class="koboSpan" id="kobo.2.6">Another.</span></p>',

+            # with prefer_justification
+            '<p><span class="koboSpan" id="kobo.1.1">Simple sentences.</span> <span class="koboSpan" id="kobo.1.2">In a single paragraph.</span></p>'
+            '<p><span class="koboSpan" id="kobo.2.1">A sentence</span> <i><span class="koboSpan" id="kobo.2.2">with</span>'
+            ' <b><span class="koboSpan" id="kobo.2.3">nested</span></b><span class="koboSpan" id="kobo.2.4">, tailed</span></i> '
+            '<span class="koboSpan" id="kobo.2.5">formatting.</span> '
+            '<span class="koboSpan" id="kobo.2.6">Another.</span></p>',
+            ),
+
            # img tags
            '<p>An image<img src="x">with tail<img src="b"><i>without':
            '<p><span class="koboSpan" id="kobo.1.1">An image</span><span class="koboSpan" id="kobo.2.1"><img src="x"/></span>'
@ -85,22 +106,35 @@ div#book-inner {{ margin-top: 0; margin-bottom: 0; }}</style><script type="text/
            '<p><span class="koboSpan" id="kobo.2.1">A comment</span><!-- xx --><i><span class="koboSpan" id="kobo.2.2">without tail</span></i></p>',

            # nested block tags
-            '<div>A div<div> nested.<ul><li>A list<p> with nested block</p> tail1</li> tail2</ul> tail3':
+            '<div>A div<div> nested.<ul><li>A list<p> with nested block</p> tail1</li> tail2</ul> tail3': (
            '<div><span class="koboSpan" id="kobo.1.1">A div</span><div><span class="koboSpan" id="kobo.1.2"> nested.</span>'
            '<ul><li><span class="koboSpan" id="kobo.2.1">A list</span><p><span class="koboSpan" id="kobo.3.1"> with nested block</span></p>'
            '<span class="koboSpan" id="kobo.3.2"> tail1</span></li><span class="koboSpan" id="kobo.3.3"> tail2</span></ul>'
            '<span class="koboSpan" id="kobo.3.4"> tail3</span></div></div>',
+            # with prefer_justification
+            '<div><span class="koboSpan" id="kobo.1.1">A div</span><div> <span class="koboSpan" id="kobo.1.2">nested.</span>'
+            '<ul><li><span class="koboSpan" id="kobo.2.1">A list</span><p> <span class="koboSpan" id="kobo.3.1">with nested block</span></p>'
+            ' <span class="koboSpan" id="kobo.3.2">tail1</span></li> <span class="koboSpan" id="kobo.3.3">tail2</span></ul>'
+            ' <span class="koboSpan" id="kobo.3.4">tail3</span></div></div>',
+            ),

            # skipped tags
-            '<div>Script: <script>a = 1</script> with tail':
+            '<div>Script: <script>a = 1</script> with tail': (
            '<div><span class="koboSpan" id="kobo.1.1">Script: </span><script>a = 1</script><span class="koboSpan" id="kobo.1.2"> with tail</span></div>',
-            '<div>Svg: <svg>mouse</svg><i> no tail':
+            '<div><span class="koboSpan" id="kobo.1.1">Script:</span> <script>a = 1</script> <span class="koboSpan" id="kobo.1.2">with tail</span></div>',
+            ),
+            '<div>Svg: <svg>mouse</svg><i> no tail': (
            '<div><span class="koboSpan" id="kobo.1.1">Svg: </span><svg xmlns="http://www.w3.org/2000/svg">mouse</svg>'
            '<i><span class="koboSpan" id="kobo.1.2"> no tail</span></i></div>',
+            '<div><span class="koboSpan" id="kobo.1.1">Svg:</span> <svg xmlns="http://www.w3.org/2000/svg">mouse</svg>'
+            '<i> <span class="koboSpan" id="kobo.1.2">no tail</span></i></div>',
+            ),

            # encoding quirks
-            '<p>A\xa0nbsp;&nbsp;':
+            '<p>A\xa0nbsp;&nbsp;': (
            '<p><span class="koboSpan" id="kobo.1.1">A&#160;nbsp;&#160;</span></p>',
+            '<p><span class="koboSpan" id="kobo.1.1">A&#160;nbsp;</span>&#160;</p>',
+            ),
            '<div><script>1 < 2 & 3</script>':  # escaping with cdata note that kepubify doesn't do this
            '<div><script><![CDATA[1 < 2 & 3]]></script></div>',

@ -110,13 +144,7 @@ div#book-inner {{ margin-top: 0; margin-bottom: 0; }}</style><script type="text/
            f'div {{\n  -{CSS_COMMENT_COOKIE}-widows: 12;\n  color: red;\n}}</style>'
            '<span class="koboSpan" id="kobo.1.1">Some</span></div>'
        }.items():
-            opts = Options()._replace(remove_widows_and_orphans=True, remove_at_page_rules=True)
-            root = kepubify_html_data(src, KOBO_JS_NAME, opts)
-            actual = serialize_html(root).decode('utf-8')
-            actual = actual[len(prefix):-len(suffix)]
-            self.assertEqual(expected, actual)
-            expected = serialize_html(parse(src)).decode('utf-8')
-            opts = opts._replace(for_removal=True)
-            kepubify_parsed_html(root, KOBO_JS_NAME, opts)
-            actual = serialize_html(root).decode('utf-8')
-            self.assertEqual(expected, actual)
+            if isinstance(expected, str):
+                expected = expected, expected
+            perform(src, expected[0], False)
+            perform(src, expected[1], True)