kepubify: Optionally move leading and trailing whitespace out of the kobo spans

This commit is contained in:
Kovid Goyal 2025-05-06 11:55:47 +05:30
parent 3d48a0a94e
commit 6b67ed1e66
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 84 additions and 36 deletions

View File

@ -67,6 +67,7 @@ class Options(NamedTuple):
hyphenation_css: str = '' hyphenation_css: str = ''
remove_widows_and_orphans: bool = False remove_widows_and_orphans: bool = False
remove_at_page_rules: bool = False remove_at_page_rules: bool = False
prefer_justification: bool = False
for_removal: bool = False for_removal: bool = False
@ -158,14 +159,19 @@ def unwrap_body_contents(body):
body.text = text body.text = text
def add_kobo_spans(inner, root_lang): def add_kobo_spans(inner, root_lang, prefer_justification=False):
stack = [] stack = []
a, p = stack.append, stack.pop a, p = stack.append, stack.pop
a((inner, None, barename(inner.tag).lower(), lang_for_elem(inner, root_lang))) a((inner, None, barename(inner.tag).lower(), lang_for_elem(inner, root_lang)))
paranum, segnum = 0, 0 paranum, segnum = 0, 0
increment_next_para = True increment_next_para = True
span_tag_name = XHTML('span') span_tag_name = XHTML('span')
leading_whitespace_pat = re.compile(r'^\s+') lstrip_pat = re.compile(r'^\s+')
rstrip_pat = re.compile(r'\s+$')
def lstrip(x):
    # Return x with all leading whitespace removed.
    # str.lstrip() with no argument strips the same Unicode whitespace
    # characters that the r'^\s+' pattern matches on str input, so this is
    # equivalent to the regex substitution without going through re.
    return x.lstrip()
def rstrip(x):
    # Return x with all trailing whitespace removed.
    # An unanchored r'\s+$' search retries at every position of an interior
    # whitespace run and backtracks through it each time, which is quadratic
    # in the length of that run. str.rstrip() removes the same Unicode
    # whitespace characters in a single linear pass.
    return x.rstrip()
def kobo_span(parent): def kobo_span(parent):
nonlocal paranum, segnum nonlocal paranum, segnum
@ -174,32 +180,46 @@ def add_kobo_spans(inner, root_lang):
def wrap_text_in_spans(text: str, parent: etree.Element, after_child: etree.ElementBase, lang: str) -> str | None: def wrap_text_in_spans(text: str, parent: etree.Element, after_child: etree.ElementBase, lang: str) -> str | None:
nonlocal increment_next_para, paranum, segnum nonlocal increment_next_para, paranum, segnum
if increment_next_para: text_with_leading_whitespace_removed = lstrip(text)
paranum += 1
segnum = 0
increment_next_para = False
try: try:
at = 0 if after_child is None else parent.index(after_child) + 1 at = 0 if after_child is None else parent.index(after_child) + 1
except ValueError: # wrapped child except ValueError: # wrapped child
at = parent.index(after_child.getparent()) + 1 at = parent.index(after_child.getparent()) + 1
stripped = leading_whitespace_pat.sub('', text)
if not at and not stripped and not len(parent): if increment_next_para:
stripped = text paranum += 1
ws = None segnum = 0
if num := len(text) - len(stripped): increment_next_para = False
ws = text[:num]
before = None if stripped else ws if not at and not text_with_leading_whitespace_removed and not len(parent):
# block tag with only whitespace
s = kobo_span(parent)
s.text = text
parent.text = None
parent.append(s)
return
leading_whitespace = None
if num := len(text) - len(text_with_leading_whitespace_removed):
leading_whitespace = text[:num]
before = None if text_with_leading_whitespace_removed and not prefer_justification else leading_whitespace
if at: if at:
parent[at-1].tail = before parent[at-1].tail = before
else: else:
parent.text = before parent.text = before
if stripped: if not text_with_leading_whitespace_removed:
text = (ws + stripped) if ws else stripped return
for pos, sz in sentence_positions(text, lang): if not leading_whitespace or prefer_justification:
s = kobo_span(parent) text = text_with_leading_whitespace_removed
s.text = text[pos:pos+sz] for pos, sz in sentence_positions(text, lang):
parent.insert(at, s) s = kobo_span(parent)
at += 1 s.text = inside_span = text[pos:pos+sz]
if prefer_justification:
inside_span_without_trailing_whitespace = rstrip(inside_span)
if tail_len := len(inside_span) - len(inside_span_without_trailing_whitespace):
s.tail = inside_span[-tail_len:]
s.text = inside_span_without_trailing_whitespace
parent.insert(at, s)
at += 1
def wrap_child(child: etree.Element) -> etree.Element: def wrap_child(child: etree.Element) -> etree.Element:
nonlocal increment_next_para, paranum, segnum nonlocal increment_next_para, paranum, segnum
@ -263,7 +283,7 @@ def add_kobo_markup_to_html(root: etree.Element, kobo_js_href: str, opts: Option
add_style_and_script(root, kobo_js_href, opts) add_style_and_script(root, kobo_js_href, opts)
for body in XPath('./h:body')(root): for body in XPath('./h:body')(root):
inner = wrap_body_contents(body) inner = wrap_body_contents(body)
add_kobo_spans(inner, lang_for_elem(body, root_lang)) add_kobo_spans(inner, lang_for_elem(body, root_lang), opts.prefer_justification)
def remove_kobo_markup_from_html(root): def remove_kobo_markup_from_html(root):

View File

@ -57,6 +57,19 @@ class KepubifyTests(BaseTest):
div#book-inner {{ margin-top: 0; margin-bottom: 0; }}</style><script type="text/javascript" src="{KOBO_JS_NAME}"/></head>\ div#book-inner {{ margin-top: 0; margin-bottom: 0; }}</style><script type="text/javascript" src="{KOBO_JS_NAME}"/></head>\
<body><div id="book-columns"><div id="book-inner">''' <body><div id="book-columns"><div id="book-inner">'''
suffix = '</div></div></body></html>' suffix = '</div></div></body></html>'
def perform(src, expected, prefer_justification):
    # Kepubify ``src`` and compare the markup between ``prefix`` and ``suffix``
    # with ``expected``; then run the removal pass on the same tree and check
    # that it serializes identically to the freshly parsed ``src``.
    def failure_message(expected, actual):
        return f'\n\nText:\n{src}\n\nExpected:\n{expected}\n\nActual:\n{actual}'

    add_opts = Options(
        remove_widows_and_orphans=True, remove_at_page_rules=True, prefer_justification=prefer_justification)
    root = kepubify_html_data(src, KOBO_JS_NAME, add_opts)
    body = serialize_html(root).decode('utf-8')[len(prefix):-len(suffix)]
    self.assertEqual(expected, body, failure_message(expected, body))

    # Round trip: the same options with for_removal=True, applied to the
    # already converted tree.
    pristine = serialize_html(parse(src)).decode('utf-8')
    kepubify_parsed_html(root, KOBO_JS_NAME, add_opts._replace(for_removal=True))
    restored = serialize_html(root).decode('utf-8')
    self.assertEqual(pristine, restored, failure_message(pristine, restored))
for src, expected in { for src, expected in {
# basics # basics
'<p>one</p> <p>\xa0</p><p>\xa0<i>a</i></p>': '<p>one</p> <p>\xa0</p><p>\xa0<i>a</i></p>':
@ -64,7 +77,7 @@ div#book-inner {{ margin-top: 0; margin-bottom: 0; }}</style><script type="text/
'<p>&#160;<i><span class="koboSpan" id="kobo.3.1">a</span></i></p>', '<p>&#160;<i><span class="koboSpan" id="kobo.3.1">a</span></i></p>',
'<p>Simple sentences. In a single paragraph.' '<p>Simple sentences. In a single paragraph.'
'<p>A sentence <i>with <b>nested</b>, tailed</i> formatting. Another.': '<p>A sentence <i>with <b>nested</b>, tailed</i> formatting. Another.': (
'<p><span class="koboSpan" id="kobo.1.1">Simple sentences. </span><span class="koboSpan" id="kobo.1.2">In a single paragraph.</span></p>' '<p><span class="koboSpan" id="kobo.1.1">Simple sentences. </span><span class="koboSpan" id="kobo.1.2">In a single paragraph.</span></p>'
'<p><span class="koboSpan" id="kobo.2.1">A sentence </span><i><span class="koboSpan" id="kobo.2.2">with </span>' '<p><span class="koboSpan" id="kobo.2.1">A sentence </span><i><span class="koboSpan" id="kobo.2.2">with </span>'
@ -72,6 +85,14 @@ div#book-inner {{ margin-top: 0; margin-bottom: 0; }}</style><script type="text/
'<span class="koboSpan" id="kobo.2.5"> formatting. </span>' '<span class="koboSpan" id="kobo.2.5"> formatting. </span>'
'<span class="koboSpan" id="kobo.2.6">Another.</span></p>', '<span class="koboSpan" id="kobo.2.6">Another.</span></p>',
# with prefer_justification
'<p><span class="koboSpan" id="kobo.1.1">Simple sentences.</span> <span class="koboSpan" id="kobo.1.2">In a single paragraph.</span></p>'
'<p><span class="koboSpan" id="kobo.2.1">A sentence</span> <i><span class="koboSpan" id="kobo.2.2">with</span>'
' <b><span class="koboSpan" id="kobo.2.3">nested</span></b><span class="koboSpan" id="kobo.2.4">, tailed</span></i> '
'<span class="koboSpan" id="kobo.2.5">formatting.</span> '
'<span class="koboSpan" id="kobo.2.6">Another.</span></p>',
),
# img tags # img tags
'<p>An image<img src="x">with tail<img src="b"><i>without': '<p>An image<img src="x">with tail<img src="b"><i>without':
'<p><span class="koboSpan" id="kobo.1.1">An image</span><span class="koboSpan" id="kobo.2.1"><img src="x"/></span>' '<p><span class="koboSpan" id="kobo.1.1">An image</span><span class="koboSpan" id="kobo.2.1"><img src="x"/></span>'
@ -85,22 +106,35 @@ div#book-inner {{ margin-top: 0; margin-bottom: 0; }}</style><script type="text/
'<p><span class="koboSpan" id="kobo.2.1">A comment</span><!-- xx --><i><span class="koboSpan" id="kobo.2.2">without tail</span></i></p>', '<p><span class="koboSpan" id="kobo.2.1">A comment</span><!-- xx --><i><span class="koboSpan" id="kobo.2.2">without tail</span></i></p>',
# nested block tags # nested block tags
'<div>A div<div> nested.<ul><li>A list<p> with nested block</p> tail1</li> tail2</ul> tail3': '<div>A div<div> nested.<ul><li>A list<p> with nested block</p> tail1</li> tail2</ul> tail3': (
'<div><span class="koboSpan" id="kobo.1.1">A div</span><div><span class="koboSpan" id="kobo.1.2"> nested.</span>' '<div><span class="koboSpan" id="kobo.1.1">A div</span><div><span class="koboSpan" id="kobo.1.2"> nested.</span>'
'<ul><li><span class="koboSpan" id="kobo.2.1">A list</span><p><span class="koboSpan" id="kobo.3.1"> with nested block</span></p>' '<ul><li><span class="koboSpan" id="kobo.2.1">A list</span><p><span class="koboSpan" id="kobo.3.1"> with nested block</span></p>'
'<span class="koboSpan" id="kobo.3.2"> tail1</span></li><span class="koboSpan" id="kobo.3.3"> tail2</span></ul>' '<span class="koboSpan" id="kobo.3.2"> tail1</span></li><span class="koboSpan" id="kobo.3.3"> tail2</span></ul>'
'<span class="koboSpan" id="kobo.3.4"> tail3</span></div></div>', '<span class="koboSpan" id="kobo.3.4"> tail3</span></div></div>',
# with prefer_justification
'<div><span class="koboSpan" id="kobo.1.1">A div</span><div> <span class="koboSpan" id="kobo.1.2">nested.</span>'
'<ul><li><span class="koboSpan" id="kobo.2.1">A list</span><p> <span class="koboSpan" id="kobo.3.1">with nested block</span></p>'
' <span class="koboSpan" id="kobo.3.2">tail1</span></li> <span class="koboSpan" id="kobo.3.3">tail2</span></ul>'
' <span class="koboSpan" id="kobo.3.4">tail3</span></div></div>',
),
# skipped tags # skipped tags
'<div>Script: <script>a = 1</script> with tail': '<div>Script: <script>a = 1</script> with tail': (
'<div><span class="koboSpan" id="kobo.1.1">Script: </span><script>a = 1</script><span class="koboSpan" id="kobo.1.2"> with tail</span></div>', '<div><span class="koboSpan" id="kobo.1.1">Script: </span><script>a = 1</script><span class="koboSpan" id="kobo.1.2"> with tail</span></div>',
'<div>Svg: <svg>mouse</svg><i> no tail': '<div><span class="koboSpan" id="kobo.1.1">Script:</span> <script>a = 1</script> <span class="koboSpan" id="kobo.1.2">with tail</span></div>',
),
'<div>Svg: <svg>mouse</svg><i> no tail': (
'<div><span class="koboSpan" id="kobo.1.1">Svg: </span><svg xmlns="http://www.w3.org/2000/svg">mouse</svg>' '<div><span class="koboSpan" id="kobo.1.1">Svg: </span><svg xmlns="http://www.w3.org/2000/svg">mouse</svg>'
'<i><span class="koboSpan" id="kobo.1.2"> no tail</span></i></div>', '<i><span class="koboSpan" id="kobo.1.2"> no tail</span></i></div>',
'<div><span class="koboSpan" id="kobo.1.1">Svg:</span> <svg xmlns="http://www.w3.org/2000/svg">mouse</svg>'
'<i> <span class="koboSpan" id="kobo.1.2">no tail</span></i></div>',
),
# encoding quirks # encoding quirks
'<p>A\xa0nbsp;&nbsp;': '<p>A\xa0nbsp;&nbsp;': (
'<p><span class="koboSpan" id="kobo.1.1">A&#160;nbsp;&#160;</span></p>', '<p><span class="koboSpan" id="kobo.1.1">A&#160;nbsp;&#160;</span></p>',
'<p><span class="koboSpan" id="kobo.1.1">A&#160;nbsp;</span>&#160;</p>',
),
'<div><script>1 < 2 & 3</script>': # escaping with cdata note that kepubify doesn't do this '<div><script>1 < 2 & 3</script>': # escaping with cdata note that kepubify doesn't do this
'<div><script><![CDATA[1 < 2 & 3]]></script></div>', '<div><script><![CDATA[1 < 2 & 3]]></script></div>',
@ -110,13 +144,7 @@ div#book-inner {{ margin-top: 0; margin-bottom: 0; }}</style><script type="text/
f'div {{\n -{CSS_COMMENT_COOKIE}-widows: 12;\n color: red;\n}}</style>' f'div {{\n -{CSS_COMMENT_COOKIE}-widows: 12;\n color: red;\n}}</style>'
'<span class="koboSpan" id="kobo.1.1">Some</span></div>' '<span class="koboSpan" id="kobo.1.1">Some</span></div>'
}.items(): }.items():
opts = Options()._replace(remove_widows_and_orphans=True, remove_at_page_rules=True) if isinstance(expected, str):
root = kepubify_html_data(src, KOBO_JS_NAME, opts) expected = expected, expected
actual = serialize_html(root).decode('utf-8') perform(src, expected[0], False)
actual = actual[len(prefix):-len(suffix)] perform(src, expected[1], True)
self.assertEqual(expected, actual)
expected = serialize_html(parse(src)).decode('utf-8')
opts = opts._replace(for_removal=True)
kepubify_parsed_html(root, KOBO_JS_NAME, opts)
actual = serialize_html(root).decode('utf-8')
self.assertEqual(expected, actual)