kepubify: Optionally move leading and trailing whitespace out of the kobo spans

This commit is contained in:
Kovid Goyal 2025-05-06 11:55:47 +05:30
parent 3d48a0a94e
commit 6b67ed1e66
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 84 additions and 36 deletions

View File

@ -67,6 +67,7 @@ class Options(NamedTuple):
hyphenation_css: str = ''
remove_widows_and_orphans: bool = False
remove_at_page_rules: bool = False
prefer_justification: bool = False
for_removal: bool = False
@ -158,14 +159,19 @@ def unwrap_body_contents(body):
body.text = text
def add_kobo_spans(inner, root_lang):
def add_kobo_spans(inner, root_lang, prefer_justification=False):
stack = []
a, p = stack.append, stack.pop
a((inner, None, barename(inner.tag).lower(), lang_for_elem(inner, root_lang)))
paranum, segnum = 0, 0
increment_next_para = True
span_tag_name = XHTML('span')
leading_whitespace_pat = re.compile(r'^\s+')
lstrip_pat = re.compile(r'^\s+')
rstrip_pat = re.compile(r'\s+$')
def lstrip(x):
return lstrip_pat.sub('', x)
def rstrip(x):
return rstrip_pat.sub('', x)
def kobo_span(parent):
nonlocal paranum, segnum
@ -174,30 +180,44 @@ def add_kobo_spans(inner, root_lang):
def wrap_text_in_spans(text: str, parent: etree.Element, after_child: etree.ElementBase, lang: str) -> str | None:
nonlocal increment_next_para, paranum, segnum
if increment_next_para:
paranum += 1
segnum = 0
increment_next_para = False
text_with_leading_whitespace_removed = lstrip(text)
try:
at = 0 if after_child is None else parent.index(after_child) + 1
except ValueError: # wrapped child
at = parent.index(after_child.getparent()) + 1
stripped = leading_whitespace_pat.sub('', text)
if not at and not stripped and not len(parent):
stripped = text
ws = None
if num := len(text) - len(stripped):
ws = text[:num]
before = None if stripped else ws
if increment_next_para:
paranum += 1
segnum = 0
increment_next_para = False
if not at and not text_with_leading_whitespace_removed and not len(parent):
# block tag with only whitespace
s = kobo_span(parent)
s.text = text
parent.text = None
parent.append(s)
return
leading_whitespace = None
if num := len(text) - len(text_with_leading_whitespace_removed):
leading_whitespace = text[:num]
before = None if text_with_leading_whitespace_removed and not prefer_justification else leading_whitespace
if at:
parent[at-1].tail = before
else:
parent.text = before
if stripped:
text = (ws + stripped) if ws else stripped
if not text_with_leading_whitespace_removed:
return
if not leading_whitespace or prefer_justification:
text = text_with_leading_whitespace_removed
for pos, sz in sentence_positions(text, lang):
s = kobo_span(parent)
s.text = text[pos:pos+sz]
s.text = inside_span = text[pos:pos+sz]
if prefer_justification:
inside_span_without_trailing_whitespace = rstrip(inside_span)
if tail_len := len(inside_span) - len(inside_span_without_trailing_whitespace):
s.tail = inside_span[-tail_len:]
s.text = inside_span_without_trailing_whitespace
parent.insert(at, s)
at += 1
@ -263,7 +283,7 @@ def add_kobo_markup_to_html(root: etree.Element, kobo_js_href: str, opts: Option
add_style_and_script(root, kobo_js_href, opts)
for body in XPath('./h:body')(root):
inner = wrap_body_contents(body)
add_kobo_spans(inner, lang_for_elem(body, root_lang))
add_kobo_spans(inner, lang_for_elem(body, root_lang), opts.prefer_justification)
def remove_kobo_markup_from_html(root):

View File

@ -57,6 +57,19 @@ class KepubifyTests(BaseTest):
div#book-inner {{ margin-top: 0; margin-bottom: 0; }}</style><script type="text/javascript" src="{KOBO_JS_NAME}"/></head>\
<body><div id="book-columns"><div id="book-inner">'''
suffix = '</div></div></body></html>'
def perform(src, expected, prefer_justification):
opts = Options(remove_widows_and_orphans=True, remove_at_page_rules=True, prefer_justification=prefer_justification)
root = kepubify_html_data(src, KOBO_JS_NAME, opts)
actual = serialize_html(root).decode('utf-8')
actual = actual[len(prefix):-len(suffix)]
self.assertEqual(expected, actual, f'\n\nText:\n{src}\n\nExpected:\n{expected}\n\nActual:\n{actual}')
expected = serialize_html(parse(src)).decode('utf-8')
opts = opts._replace(for_removal=True)
kepubify_parsed_html(root, KOBO_JS_NAME, opts)
actual = serialize_html(root).decode('utf-8')
self.assertEqual(expected, actual, f'\n\nText:\n{src}\n\nExpected:\n{expected}\n\nActual:\n{actual}')
for src, expected in {
# basics
'<p>one</p> <p>\xa0</p><p>\xa0<i>a</i></p>':
@ -64,7 +77,7 @@ div#book-inner {{ margin-top: 0; margin-bottom: 0; }}</style><script type="text/
'<p>&#160;<i><span class="koboSpan" id="kobo.3.1">a</span></i></p>',
'<p>Simple sentences. In a single paragraph.'
'<p>A sentence <i>with <b>nested</b>, tailed</i> formatting. Another.':
'<p>A sentence <i>with <b>nested</b>, tailed</i> formatting. Another.': (
'<p><span class="koboSpan" id="kobo.1.1">Simple sentences. </span><span class="koboSpan" id="kobo.1.2">In a single paragraph.</span></p>'
'<p><span class="koboSpan" id="kobo.2.1">A sentence </span><i><span class="koboSpan" id="kobo.2.2">with </span>'
@ -72,6 +85,14 @@ div#book-inner {{ margin-top: 0; margin-bottom: 0; }}</style><script type="text/
'<span class="koboSpan" id="kobo.2.5"> formatting. </span>'
'<span class="koboSpan" id="kobo.2.6">Another.</span></p>',
# with prefer_justification
'<p><span class="koboSpan" id="kobo.1.1">Simple sentences.</span> <span class="koboSpan" id="kobo.1.2">In a single paragraph.</span></p>'
'<p><span class="koboSpan" id="kobo.2.1">A sentence</span> <i><span class="koboSpan" id="kobo.2.2">with</span>'
' <b><span class="koboSpan" id="kobo.2.3">nested</span></b><span class="koboSpan" id="kobo.2.4">, tailed</span></i> '
'<span class="koboSpan" id="kobo.2.5">formatting.</span> '
'<span class="koboSpan" id="kobo.2.6">Another.</span></p>',
),
# img tags
'<p>An image<img src="x">with tail<img src="b"><i>without':
'<p><span class="koboSpan" id="kobo.1.1">An image</span><span class="koboSpan" id="kobo.2.1"><img src="x"/></span>'
@ -85,22 +106,35 @@ div#book-inner {{ margin-top: 0; margin-bottom: 0; }}</style><script type="text/
'<p><span class="koboSpan" id="kobo.2.1">A comment</span><!-- xx --><i><span class="koboSpan" id="kobo.2.2">without tail</span></i></p>',
# nested block tags
'<div>A div<div> nested.<ul><li>A list<p> with nested block</p> tail1</li> tail2</ul> tail3':
'<div>A div<div> nested.<ul><li>A list<p> with nested block</p> tail1</li> tail2</ul> tail3': (
'<div><span class="koboSpan" id="kobo.1.1">A div</span><div><span class="koboSpan" id="kobo.1.2"> nested.</span>'
'<ul><li><span class="koboSpan" id="kobo.2.1">A list</span><p><span class="koboSpan" id="kobo.3.1"> with nested block</span></p>'
'<span class="koboSpan" id="kobo.3.2"> tail1</span></li><span class="koboSpan" id="kobo.3.3"> tail2</span></ul>'
'<span class="koboSpan" id="kobo.3.4"> tail3</span></div></div>',
# with prefer_justification
'<div><span class="koboSpan" id="kobo.1.1">A div</span><div> <span class="koboSpan" id="kobo.1.2">nested.</span>'
'<ul><li><span class="koboSpan" id="kobo.2.1">A list</span><p> <span class="koboSpan" id="kobo.3.1">with nested block</span></p>'
' <span class="koboSpan" id="kobo.3.2">tail1</span></li> <span class="koboSpan" id="kobo.3.3">tail2</span></ul>'
' <span class="koboSpan" id="kobo.3.4">tail3</span></div></div>',
),
# skipped tags
'<div>Script: <script>a = 1</script> with tail':
'<div>Script: <script>a = 1</script> with tail': (
'<div><span class="koboSpan" id="kobo.1.1">Script: </span><script>a = 1</script><span class="koboSpan" id="kobo.1.2"> with tail</span></div>',
'<div>Svg: <svg>mouse</svg><i> no tail':
'<div><span class="koboSpan" id="kobo.1.1">Script:</span> <script>a = 1</script> <span class="koboSpan" id="kobo.1.2">with tail</span></div>',
),
'<div>Svg: <svg>mouse</svg><i> no tail': (
'<div><span class="koboSpan" id="kobo.1.1">Svg: </span><svg xmlns="http://www.w3.org/2000/svg">mouse</svg>'
'<i><span class="koboSpan" id="kobo.1.2"> no tail</span></i></div>',
'<div><span class="koboSpan" id="kobo.1.1">Svg:</span> <svg xmlns="http://www.w3.org/2000/svg">mouse</svg>'
'<i> <span class="koboSpan" id="kobo.1.2">no tail</span></i></div>',
),
# encoding quirks
'<p>A\xa0nbsp;&nbsp;':
'<p>A\xa0nbsp;&nbsp;': (
'<p><span class="koboSpan" id="kobo.1.1">A&#160;nbsp;&#160;</span></p>',
'<p><span class="koboSpan" id="kobo.1.1">A&#160;nbsp;</span>&#160;</p>',
),
'<div><script>1 < 2 & 3</script>': # escaping with CDATA; note that kepubify doesn't do this
'<div><script><![CDATA[1 < 2 & 3]]></script></div>',
@ -110,13 +144,7 @@ div#book-inner {{ margin-top: 0; margin-bottom: 0; }}</style><script type="text/
f'div {{\n -{CSS_COMMENT_COOKIE}-widows: 12;\n color: red;\n}}</style>'
'<span class="koboSpan" id="kobo.1.1">Some</span></div>'
}.items():
opts = Options()._replace(remove_widows_and_orphans=True, remove_at_page_rules=True)
root = kepubify_html_data(src, KOBO_JS_NAME, opts)
actual = serialize_html(root).decode('utf-8')
actual = actual[len(prefix):-len(suffix)]
self.assertEqual(expected, actual)
expected = serialize_html(parse(src)).decode('utf-8')
opts = opts._replace(for_removal=True)
kepubify_parsed_html(root, KOBO_JS_NAME, opts)
actual = serialize_html(root).decode('utf-8')
self.assertEqual(expected, actual)
if isinstance(expected, str):
expected = expected, expected
perform(src, expected[0], False)
perform(src, expected[1], True)