mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 18:54:09 -04:00
kepubify: Optionally move leading and trailing whitespace out of the kobo spans
This commit is contained in:
parent
3d48a0a94e
commit
6b67ed1e66
@ -67,6 +67,7 @@ class Options(NamedTuple):
|
||||
hyphenation_css: str = ''
|
||||
remove_widows_and_orphans: bool = False
|
||||
remove_at_page_rules: bool = False
|
||||
prefer_justification: bool = False
|
||||
|
||||
for_removal: bool = False
|
||||
|
||||
@ -158,14 +159,19 @@ def unwrap_body_contents(body):
|
||||
body.text = text
|
||||
|
||||
|
||||
def add_kobo_spans(inner, root_lang):
|
||||
def add_kobo_spans(inner, root_lang, prefer_justification=False):
|
||||
stack = []
|
||||
a, p = stack.append, stack.pop
|
||||
a((inner, None, barename(inner.tag).lower(), lang_for_elem(inner, root_lang)))
|
||||
paranum, segnum = 0, 0
|
||||
increment_next_para = True
|
||||
span_tag_name = XHTML('span')
|
||||
leading_whitespace_pat = re.compile(r'^\s+')
|
||||
lstrip_pat = re.compile(r'^\s+')
|
||||
rstrip_pat = re.compile(r'\s+$')
|
||||
def lstrip(x):
|
||||
return lstrip_pat.sub('', x)
|
||||
def rstrip(x):
|
||||
return rstrip_pat.sub('', x)
|
||||
|
||||
def kobo_span(parent):
|
||||
nonlocal paranum, segnum
|
||||
@ -174,32 +180,46 @@ def add_kobo_spans(inner, root_lang):
|
||||
|
||||
def wrap_text_in_spans(text: str, parent: etree.Element, after_child: etree.ElementBase, lang: str) -> str | None:
|
||||
nonlocal increment_next_para, paranum, segnum
|
||||
if increment_next_para:
|
||||
paranum += 1
|
||||
segnum = 0
|
||||
increment_next_para = False
|
||||
text_with_leading_whitespace_removed = lstrip(text)
|
||||
try:
|
||||
at = 0 if after_child is None else parent.index(after_child) + 1
|
||||
except ValueError: # wrapped child
|
||||
at = parent.index(after_child.getparent()) + 1
|
||||
stripped = leading_whitespace_pat.sub('', text)
|
||||
if not at and not stripped and not len(parent):
|
||||
stripped = text
|
||||
ws = None
|
||||
if num := len(text) - len(stripped):
|
||||
ws = text[:num]
|
||||
before = None if stripped else ws
|
||||
|
||||
if increment_next_para:
|
||||
paranum += 1
|
||||
segnum = 0
|
||||
increment_next_para = False
|
||||
|
||||
if not at and not text_with_leading_whitespace_removed and not len(parent):
|
||||
# block tag with only whitespace
|
||||
s = kobo_span(parent)
|
||||
s.text = text
|
||||
parent.text = None
|
||||
parent.append(s)
|
||||
return
|
||||
leading_whitespace = None
|
||||
if num := len(text) - len(text_with_leading_whitespace_removed):
|
||||
leading_whitespace = text[:num]
|
||||
before = None if text_with_leading_whitespace_removed and not prefer_justification else leading_whitespace
|
||||
if at:
|
||||
parent[at-1].tail = before
|
||||
else:
|
||||
parent.text = before
|
||||
if stripped:
|
||||
text = (ws + stripped) if ws else stripped
|
||||
for pos, sz in sentence_positions(text, lang):
|
||||
s = kobo_span(parent)
|
||||
s.text = text[pos:pos+sz]
|
||||
parent.insert(at, s)
|
||||
at += 1
|
||||
if not text_with_leading_whitespace_removed:
|
||||
return
|
||||
if not leading_whitespace or prefer_justification:
|
||||
text = text_with_leading_whitespace_removed
|
||||
for pos, sz in sentence_positions(text, lang):
|
||||
s = kobo_span(parent)
|
||||
s.text = inside_span = text[pos:pos+sz]
|
||||
if prefer_justification:
|
||||
inside_span_without_trailing_whitespace = rstrip(inside_span)
|
||||
if tail_len := len(inside_span) - len(inside_span_without_trailing_whitespace):
|
||||
s.tail = inside_span[-tail_len:]
|
||||
s.text = inside_span_without_trailing_whitespace
|
||||
parent.insert(at, s)
|
||||
at += 1
|
||||
|
||||
def wrap_child(child: etree.Element) -> etree.Element:
|
||||
nonlocal increment_next_para, paranum, segnum
|
||||
@ -263,7 +283,7 @@ def add_kobo_markup_to_html(root: etree.Element, kobo_js_href: str, opts: Option
|
||||
add_style_and_script(root, kobo_js_href, opts)
|
||||
for body in XPath('./h:body')(root):
|
||||
inner = wrap_body_contents(body)
|
||||
add_kobo_spans(inner, lang_for_elem(body, root_lang))
|
||||
add_kobo_spans(inner, lang_for_elem(body, root_lang), opts.prefer_justification)
|
||||
|
||||
|
||||
def remove_kobo_markup_from_html(root):
|
||||
|
@ -57,6 +57,19 @@ class KepubifyTests(BaseTest):
|
||||
div#book-inner {{ margin-top: 0; margin-bottom: 0; }}</style><script type="text/javascript" src="{KOBO_JS_NAME}"/></head>\
|
||||
<body><div id="book-columns"><div id="book-inner">'''
|
||||
suffix = '</div></div></body></html>'
|
||||
|
||||
def perform(src, expected, prefer_justification):
|
||||
opts = Options(remove_widows_and_orphans=True, remove_at_page_rules=True, prefer_justification=prefer_justification)
|
||||
root = kepubify_html_data(src, KOBO_JS_NAME, opts)
|
||||
actual = serialize_html(root).decode('utf-8')
|
||||
actual = actual[len(prefix):-len(suffix)]
|
||||
self.assertEqual(expected, actual, f'\n\nText:\n{src}\n\nExpected:\n{expected}\n\nActual:\n{actual}')
|
||||
expected = serialize_html(parse(src)).decode('utf-8')
|
||||
opts = opts._replace(for_removal=True)
|
||||
kepubify_parsed_html(root, KOBO_JS_NAME, opts)
|
||||
actual = serialize_html(root).decode('utf-8')
|
||||
self.assertEqual(expected, actual, f'\n\nText:\n{src}\n\nExpected:\n{expected}\n\nActual:\n{actual}')
|
||||
|
||||
for src, expected in {
|
||||
# basics
|
||||
'<p>one</p> <p>\xa0</p><p>\xa0<i>a</i></p>':
|
||||
@ -64,7 +77,7 @@ div#book-inner {{ margin-top: 0; margin-bottom: 0; }}</style><script type="text/
|
||||
'<p> <i><span class="koboSpan" id="kobo.3.1">a</span></i></p>',
|
||||
|
||||
'<p>Simple sentences. In a single paragraph.'
|
||||
'<p>A sentence <i>with <b>nested</b>, tailed</i> formatting. Another.':
|
||||
'<p>A sentence <i>with <b>nested</b>, tailed</i> formatting. Another.': (
|
||||
|
||||
'<p><span class="koboSpan" id="kobo.1.1">Simple sentences. </span><span class="koboSpan" id="kobo.1.2">In a single paragraph.</span></p>'
|
||||
'<p><span class="koboSpan" id="kobo.2.1">A sentence </span><i><span class="koboSpan" id="kobo.2.2">with </span>'
|
||||
@ -72,6 +85,14 @@ div#book-inner {{ margin-top: 0; margin-bottom: 0; }}</style><script type="text/
|
||||
'<span class="koboSpan" id="kobo.2.5"> formatting. </span>'
|
||||
'<span class="koboSpan" id="kobo.2.6">Another.</span></p>',
|
||||
|
||||
# with prefer_justification
|
||||
'<p><span class="koboSpan" id="kobo.1.1">Simple sentences.</span> <span class="koboSpan" id="kobo.1.2">In a single paragraph.</span></p>'
|
||||
'<p><span class="koboSpan" id="kobo.2.1">A sentence</span> <i><span class="koboSpan" id="kobo.2.2">with</span>'
|
||||
' <b><span class="koboSpan" id="kobo.2.3">nested</span></b><span class="koboSpan" id="kobo.2.4">, tailed</span></i> '
|
||||
'<span class="koboSpan" id="kobo.2.5">formatting.</span> '
|
||||
'<span class="koboSpan" id="kobo.2.6">Another.</span></p>',
|
||||
),
|
||||
|
||||
# img tags
|
||||
'<p>An image<img src="x">with tail<img src="b"><i>without':
|
||||
'<p><span class="koboSpan" id="kobo.1.1">An image</span><span class="koboSpan" id="kobo.2.1"><img src="x"/></span>'
|
||||
@ -85,22 +106,35 @@ div#book-inner {{ margin-top: 0; margin-bottom: 0; }}</style><script type="text/
|
||||
'<p><span class="koboSpan" id="kobo.2.1">A comment</span><!-- xx --><i><span class="koboSpan" id="kobo.2.2">without tail</span></i></p>',
|
||||
|
||||
# nested block tags
|
||||
'<div>A div<div> nested.<ul><li>A list<p> with nested block</p> tail1</li> tail2</ul> tail3':
|
||||
'<div>A div<div> nested.<ul><li>A list<p> with nested block</p> tail1</li> tail2</ul> tail3': (
|
||||
'<div><span class="koboSpan" id="kobo.1.1">A div</span><div><span class="koboSpan" id="kobo.1.2"> nested.</span>'
|
||||
'<ul><li><span class="koboSpan" id="kobo.2.1">A list</span><p><span class="koboSpan" id="kobo.3.1"> with nested block</span></p>'
|
||||
'<span class="koboSpan" id="kobo.3.2"> tail1</span></li><span class="koboSpan" id="kobo.3.3"> tail2</span></ul>'
|
||||
'<span class="koboSpan" id="kobo.3.4"> tail3</span></div></div>',
|
||||
# with prefer_justification
|
||||
'<div><span class="koboSpan" id="kobo.1.1">A div</span><div> <span class="koboSpan" id="kobo.1.2">nested.</span>'
|
||||
'<ul><li><span class="koboSpan" id="kobo.2.1">A list</span><p> <span class="koboSpan" id="kobo.3.1">with nested block</span></p>'
|
||||
' <span class="koboSpan" id="kobo.3.2">tail1</span></li> <span class="koboSpan" id="kobo.3.3">tail2</span></ul>'
|
||||
' <span class="koboSpan" id="kobo.3.4">tail3</span></div></div>',
|
||||
),
|
||||
|
||||
# skipped tags
|
||||
'<div>Script: <script>a = 1</script> with tail':
|
||||
'<div>Script: <script>a = 1</script> with tail': (
|
||||
'<div><span class="koboSpan" id="kobo.1.1">Script: </span><script>a = 1</script><span class="koboSpan" id="kobo.1.2"> with tail</span></div>',
|
||||
'<div>Svg: <svg>mouse</svg><i> no tail':
|
||||
'<div><span class="koboSpan" id="kobo.1.1">Script:</span> <script>a = 1</script> <span class="koboSpan" id="kobo.1.2">with tail</span></div>',
|
||||
),
|
||||
'<div>Svg: <svg>mouse</svg><i> no tail': (
|
||||
'<div><span class="koboSpan" id="kobo.1.1">Svg: </span><svg xmlns="http://www.w3.org/2000/svg">mouse</svg>'
|
||||
'<i><span class="koboSpan" id="kobo.1.2"> no tail</span></i></div>',
|
||||
'<div><span class="koboSpan" id="kobo.1.1">Svg:</span> <svg xmlns="http://www.w3.org/2000/svg">mouse</svg>'
|
||||
'<i> <span class="koboSpan" id="kobo.1.2">no tail</span></i></div>',
|
||||
),
|
||||
|
||||
# encoding quirks
|
||||
'<p>A\xa0nbsp; ':
|
||||
'<p>A\xa0nbsp; ': (
|
||||
'<p><span class="koboSpan" id="kobo.1.1">A nbsp; </span></p>',
|
||||
'<p><span class="koboSpan" id="kobo.1.1">A nbsp;</span> </p>',
|
||||
),
|
||||
'<div><script>1 < 2 & 3</script>': # escaping with cdata note that kepubify doesn't do this
|
||||
'<div><script><![CDATA[1 < 2 & 3]]></script></div>',
|
||||
|
||||
@ -110,13 +144,7 @@ div#book-inner {{ margin-top: 0; margin-bottom: 0; }}</style><script type="text/
|
||||
f'div {{\n -{CSS_COMMENT_COOKIE}-widows: 12;\n color: red;\n}}</style>'
|
||||
'<span class="koboSpan" id="kobo.1.1">Some</span></div>'
|
||||
}.items():
|
||||
opts = Options()._replace(remove_widows_and_orphans=True, remove_at_page_rules=True)
|
||||
root = kepubify_html_data(src, KOBO_JS_NAME, opts)
|
||||
actual = serialize_html(root).decode('utf-8')
|
||||
actual = actual[len(prefix):-len(suffix)]
|
||||
self.assertEqual(expected, actual)
|
||||
expected = serialize_html(parse(src)).decode('utf-8')
|
||||
opts = opts._replace(for_removal=True)
|
||||
kepubify_parsed_html(root, KOBO_JS_NAME, opts)
|
||||
actual = serialize_html(root).decode('utf-8')
|
||||
self.assertEqual(expected, actual)
|
||||
if isinstance(expected, str):
|
||||
expected = expected, expected
|
||||
perform(src, expected[0], False)
|
||||
perform(src, expected[1], True)
|
||||
|
Loading…
x
Reference in New Issue
Block a user