mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
kepubify: Optionally move leading and trailing whitespace out of the kobo spans
This commit is contained in:
parent
3d48a0a94e
commit
6b67ed1e66
@ -67,6 +67,7 @@ class Options(NamedTuple):
|
|||||||
hyphenation_css: str = ''
|
hyphenation_css: str = ''
|
||||||
remove_widows_and_orphans: bool = False
|
remove_widows_and_orphans: bool = False
|
||||||
remove_at_page_rules: bool = False
|
remove_at_page_rules: bool = False
|
||||||
|
prefer_justification: bool = False
|
||||||
|
|
||||||
for_removal: bool = False
|
for_removal: bool = False
|
||||||
|
|
||||||
@ -158,14 +159,19 @@ def unwrap_body_contents(body):
|
|||||||
body.text = text
|
body.text = text
|
||||||
|
|
||||||
|
|
||||||
def add_kobo_spans(inner, root_lang):
|
def add_kobo_spans(inner, root_lang, prefer_justification=False):
|
||||||
stack = []
|
stack = []
|
||||||
a, p = stack.append, stack.pop
|
a, p = stack.append, stack.pop
|
||||||
a((inner, None, barename(inner.tag).lower(), lang_for_elem(inner, root_lang)))
|
a((inner, None, barename(inner.tag).lower(), lang_for_elem(inner, root_lang)))
|
||||||
paranum, segnum = 0, 0
|
paranum, segnum = 0, 0
|
||||||
increment_next_para = True
|
increment_next_para = True
|
||||||
span_tag_name = XHTML('span')
|
span_tag_name = XHTML('span')
|
||||||
leading_whitespace_pat = re.compile(r'^\s+')
|
lstrip_pat = re.compile(r'^\s+')
|
||||||
|
rstrip_pat = re.compile(r'\s+$')
|
||||||
|
def lstrip(x):
|
||||||
|
return lstrip_pat.sub('', x)
|
||||||
|
def rstrip(x):
|
||||||
|
return rstrip_pat.sub('', x)
|
||||||
|
|
||||||
def kobo_span(parent):
|
def kobo_span(parent):
|
||||||
nonlocal paranum, segnum
|
nonlocal paranum, segnum
|
||||||
@ -174,32 +180,46 @@ def add_kobo_spans(inner, root_lang):
|
|||||||
|
|
||||||
def wrap_text_in_spans(text: str, parent: etree.Element, after_child: etree.ElementBase, lang: str) -> str | None:
|
def wrap_text_in_spans(text: str, parent: etree.Element, after_child: etree.ElementBase, lang: str) -> str | None:
|
||||||
nonlocal increment_next_para, paranum, segnum
|
nonlocal increment_next_para, paranum, segnum
|
||||||
if increment_next_para:
|
text_with_leading_whitespace_removed = lstrip(text)
|
||||||
paranum += 1
|
|
||||||
segnum = 0
|
|
||||||
increment_next_para = False
|
|
||||||
try:
|
try:
|
||||||
at = 0 if after_child is None else parent.index(after_child) + 1
|
at = 0 if after_child is None else parent.index(after_child) + 1
|
||||||
except ValueError: # wrapped child
|
except ValueError: # wrapped child
|
||||||
at = parent.index(after_child.getparent()) + 1
|
at = parent.index(after_child.getparent()) + 1
|
||||||
stripped = leading_whitespace_pat.sub('', text)
|
|
||||||
if not at and not stripped and not len(parent):
|
if increment_next_para:
|
||||||
stripped = text
|
paranum += 1
|
||||||
ws = None
|
segnum = 0
|
||||||
if num := len(text) - len(stripped):
|
increment_next_para = False
|
||||||
ws = text[:num]
|
|
||||||
before = None if stripped else ws
|
if not at and not text_with_leading_whitespace_removed and not len(parent):
|
||||||
|
# block tag with only whitespace
|
||||||
|
s = kobo_span(parent)
|
||||||
|
s.text = text
|
||||||
|
parent.text = None
|
||||||
|
parent.append(s)
|
||||||
|
return
|
||||||
|
leading_whitespace = None
|
||||||
|
if num := len(text) - len(text_with_leading_whitespace_removed):
|
||||||
|
leading_whitespace = text[:num]
|
||||||
|
before = None if text_with_leading_whitespace_removed and not prefer_justification else leading_whitespace
|
||||||
if at:
|
if at:
|
||||||
parent[at-1].tail = before
|
parent[at-1].tail = before
|
||||||
else:
|
else:
|
||||||
parent.text = before
|
parent.text = before
|
||||||
if stripped:
|
if not text_with_leading_whitespace_removed:
|
||||||
text = (ws + stripped) if ws else stripped
|
return
|
||||||
for pos, sz in sentence_positions(text, lang):
|
if not leading_whitespace or prefer_justification:
|
||||||
s = kobo_span(parent)
|
text = text_with_leading_whitespace_removed
|
||||||
s.text = text[pos:pos+sz]
|
for pos, sz in sentence_positions(text, lang):
|
||||||
parent.insert(at, s)
|
s = kobo_span(parent)
|
||||||
at += 1
|
s.text = inside_span = text[pos:pos+sz]
|
||||||
|
if prefer_justification:
|
||||||
|
inside_span_without_trailing_whitespace = rstrip(inside_span)
|
||||||
|
if tail_len := len(inside_span) - len(inside_span_without_trailing_whitespace):
|
||||||
|
s.tail = inside_span[-tail_len:]
|
||||||
|
s.text = inside_span_without_trailing_whitespace
|
||||||
|
parent.insert(at, s)
|
||||||
|
at += 1
|
||||||
|
|
||||||
def wrap_child(child: etree.Element) -> etree.Element:
|
def wrap_child(child: etree.Element) -> etree.Element:
|
||||||
nonlocal increment_next_para, paranum, segnum
|
nonlocal increment_next_para, paranum, segnum
|
||||||
@ -263,7 +283,7 @@ def add_kobo_markup_to_html(root: etree.Element, kobo_js_href: str, opts: Option
|
|||||||
add_style_and_script(root, kobo_js_href, opts)
|
add_style_and_script(root, kobo_js_href, opts)
|
||||||
for body in XPath('./h:body')(root):
|
for body in XPath('./h:body')(root):
|
||||||
inner = wrap_body_contents(body)
|
inner = wrap_body_contents(body)
|
||||||
add_kobo_spans(inner, lang_for_elem(body, root_lang))
|
add_kobo_spans(inner, lang_for_elem(body, root_lang), opts.prefer_justification)
|
||||||
|
|
||||||
|
|
||||||
def remove_kobo_markup_from_html(root):
|
def remove_kobo_markup_from_html(root):
|
||||||
|
@ -57,6 +57,19 @@ class KepubifyTests(BaseTest):
|
|||||||
div#book-inner {{ margin-top: 0; margin-bottom: 0; }}</style><script type="text/javascript" src="{KOBO_JS_NAME}"/></head>\
|
div#book-inner {{ margin-top: 0; margin-bottom: 0; }}</style><script type="text/javascript" src="{KOBO_JS_NAME}"/></head>\
|
||||||
<body><div id="book-columns"><div id="book-inner">'''
|
<body><div id="book-columns"><div id="book-inner">'''
|
||||||
suffix = '</div></div></body></html>'
|
suffix = '</div></div></body></html>'
|
||||||
|
|
||||||
|
def perform(src, expected, prefer_justification):
|
||||||
|
opts = Options(remove_widows_and_orphans=True, remove_at_page_rules=True, prefer_justification=prefer_justification)
|
||||||
|
root = kepubify_html_data(src, KOBO_JS_NAME, opts)
|
||||||
|
actual = serialize_html(root).decode('utf-8')
|
||||||
|
actual = actual[len(prefix):-len(suffix)]
|
||||||
|
self.assertEqual(expected, actual, f'\n\nText:\n{src}\n\nExpected:\n{expected}\n\nActual:\n{actual}')
|
||||||
|
expected = serialize_html(parse(src)).decode('utf-8')
|
||||||
|
opts = opts._replace(for_removal=True)
|
||||||
|
kepubify_parsed_html(root, KOBO_JS_NAME, opts)
|
||||||
|
actual = serialize_html(root).decode('utf-8')
|
||||||
|
self.assertEqual(expected, actual, f'\n\nText:\n{src}\n\nExpected:\n{expected}\n\nActual:\n{actual}')
|
||||||
|
|
||||||
for src, expected in {
|
for src, expected in {
|
||||||
# basics
|
# basics
|
||||||
'<p>one</p> <p>\xa0</p><p>\xa0<i>a</i></p>':
|
'<p>one</p> <p>\xa0</p><p>\xa0<i>a</i></p>':
|
||||||
@ -64,7 +77,7 @@ div#book-inner {{ margin-top: 0; margin-bottom: 0; }}</style><script type="text/
|
|||||||
'<p> <i><span class="koboSpan" id="kobo.3.1">a</span></i></p>',
|
'<p> <i><span class="koboSpan" id="kobo.3.1">a</span></i></p>',
|
||||||
|
|
||||||
'<p>Simple sentences. In a single paragraph.'
|
'<p>Simple sentences. In a single paragraph.'
|
||||||
'<p>A sentence <i>with <b>nested</b>, tailed</i> formatting. Another.':
|
'<p>A sentence <i>with <b>nested</b>, tailed</i> formatting. Another.': (
|
||||||
|
|
||||||
'<p><span class="koboSpan" id="kobo.1.1">Simple sentences. </span><span class="koboSpan" id="kobo.1.2">In a single paragraph.</span></p>'
|
'<p><span class="koboSpan" id="kobo.1.1">Simple sentences. </span><span class="koboSpan" id="kobo.1.2">In a single paragraph.</span></p>'
|
||||||
'<p><span class="koboSpan" id="kobo.2.1">A sentence </span><i><span class="koboSpan" id="kobo.2.2">with </span>'
|
'<p><span class="koboSpan" id="kobo.2.1">A sentence </span><i><span class="koboSpan" id="kobo.2.2">with </span>'
|
||||||
@ -72,6 +85,14 @@ div#book-inner {{ margin-top: 0; margin-bottom: 0; }}</style><script type="text/
|
|||||||
'<span class="koboSpan" id="kobo.2.5"> formatting. </span>'
|
'<span class="koboSpan" id="kobo.2.5"> formatting. </span>'
|
||||||
'<span class="koboSpan" id="kobo.2.6">Another.</span></p>',
|
'<span class="koboSpan" id="kobo.2.6">Another.</span></p>',
|
||||||
|
|
||||||
|
# with prefer_justification
|
||||||
|
'<p><span class="koboSpan" id="kobo.1.1">Simple sentences.</span> <span class="koboSpan" id="kobo.1.2">In a single paragraph.</span></p>'
|
||||||
|
'<p><span class="koboSpan" id="kobo.2.1">A sentence</span> <i><span class="koboSpan" id="kobo.2.2">with</span>'
|
||||||
|
' <b><span class="koboSpan" id="kobo.2.3">nested</span></b><span class="koboSpan" id="kobo.2.4">, tailed</span></i> '
|
||||||
|
'<span class="koboSpan" id="kobo.2.5">formatting.</span> '
|
||||||
|
'<span class="koboSpan" id="kobo.2.6">Another.</span></p>',
|
||||||
|
),
|
||||||
|
|
||||||
# img tags
|
# img tags
|
||||||
'<p>An image<img src="x">with tail<img src="b"><i>without':
|
'<p>An image<img src="x">with tail<img src="b"><i>without':
|
||||||
'<p><span class="koboSpan" id="kobo.1.1">An image</span><span class="koboSpan" id="kobo.2.1"><img src="x"/></span>'
|
'<p><span class="koboSpan" id="kobo.1.1">An image</span><span class="koboSpan" id="kobo.2.1"><img src="x"/></span>'
|
||||||
@ -85,22 +106,35 @@ div#book-inner {{ margin-top: 0; margin-bottom: 0; }}</style><script type="text/
|
|||||||
'<p><span class="koboSpan" id="kobo.2.1">A comment</span><!-- xx --><i><span class="koboSpan" id="kobo.2.2">without tail</span></i></p>',
|
'<p><span class="koboSpan" id="kobo.2.1">A comment</span><!-- xx --><i><span class="koboSpan" id="kobo.2.2">without tail</span></i></p>',
|
||||||
|
|
||||||
# nested block tags
|
# nested block tags
|
||||||
'<div>A div<div> nested.<ul><li>A list<p> with nested block</p> tail1</li> tail2</ul> tail3':
|
'<div>A div<div> nested.<ul><li>A list<p> with nested block</p> tail1</li> tail2</ul> tail3': (
|
||||||
'<div><span class="koboSpan" id="kobo.1.1">A div</span><div><span class="koboSpan" id="kobo.1.2"> nested.</span>'
|
'<div><span class="koboSpan" id="kobo.1.1">A div</span><div><span class="koboSpan" id="kobo.1.2"> nested.</span>'
|
||||||
'<ul><li><span class="koboSpan" id="kobo.2.1">A list</span><p><span class="koboSpan" id="kobo.3.1"> with nested block</span></p>'
|
'<ul><li><span class="koboSpan" id="kobo.2.1">A list</span><p><span class="koboSpan" id="kobo.3.1"> with nested block</span></p>'
|
||||||
'<span class="koboSpan" id="kobo.3.2"> tail1</span></li><span class="koboSpan" id="kobo.3.3"> tail2</span></ul>'
|
'<span class="koboSpan" id="kobo.3.2"> tail1</span></li><span class="koboSpan" id="kobo.3.3"> tail2</span></ul>'
|
||||||
'<span class="koboSpan" id="kobo.3.4"> tail3</span></div></div>',
|
'<span class="koboSpan" id="kobo.3.4"> tail3</span></div></div>',
|
||||||
|
# with prefer_justification
|
||||||
|
'<div><span class="koboSpan" id="kobo.1.1">A div</span><div> <span class="koboSpan" id="kobo.1.2">nested.</span>'
|
||||||
|
'<ul><li><span class="koboSpan" id="kobo.2.1">A list</span><p> <span class="koboSpan" id="kobo.3.1">with nested block</span></p>'
|
||||||
|
' <span class="koboSpan" id="kobo.3.2">tail1</span></li> <span class="koboSpan" id="kobo.3.3">tail2</span></ul>'
|
||||||
|
' <span class="koboSpan" id="kobo.3.4">tail3</span></div></div>',
|
||||||
|
),
|
||||||
|
|
||||||
# skipped tags
|
# skipped tags
|
||||||
'<div>Script: <script>a = 1</script> with tail':
|
'<div>Script: <script>a = 1</script> with tail': (
|
||||||
'<div><span class="koboSpan" id="kobo.1.1">Script: </span><script>a = 1</script><span class="koboSpan" id="kobo.1.2"> with tail</span></div>',
|
'<div><span class="koboSpan" id="kobo.1.1">Script: </span><script>a = 1</script><span class="koboSpan" id="kobo.1.2"> with tail</span></div>',
|
||||||
'<div>Svg: <svg>mouse</svg><i> no tail':
|
'<div><span class="koboSpan" id="kobo.1.1">Script:</span> <script>a = 1</script> <span class="koboSpan" id="kobo.1.2">with tail</span></div>',
|
||||||
|
),
|
||||||
|
'<div>Svg: <svg>mouse</svg><i> no tail': (
|
||||||
'<div><span class="koboSpan" id="kobo.1.1">Svg: </span><svg xmlns="http://www.w3.org/2000/svg">mouse</svg>'
|
'<div><span class="koboSpan" id="kobo.1.1">Svg: </span><svg xmlns="http://www.w3.org/2000/svg">mouse</svg>'
|
||||||
'<i><span class="koboSpan" id="kobo.1.2"> no tail</span></i></div>',
|
'<i><span class="koboSpan" id="kobo.1.2"> no tail</span></i></div>',
|
||||||
|
'<div><span class="koboSpan" id="kobo.1.1">Svg:</span> <svg xmlns="http://www.w3.org/2000/svg">mouse</svg>'
|
||||||
|
'<i> <span class="koboSpan" id="kobo.1.2">no tail</span></i></div>',
|
||||||
|
),
|
||||||
|
|
||||||
# encoding quirks
|
# encoding quirks
|
||||||
'<p>A\xa0nbsp; ':
|
'<p>A\xa0nbsp; ': (
|
||||||
'<p><span class="koboSpan" id="kobo.1.1">A nbsp; </span></p>',
|
'<p><span class="koboSpan" id="kobo.1.1">A nbsp; </span></p>',
|
||||||
|
'<p><span class="koboSpan" id="kobo.1.1">A nbsp;</span> </p>',
|
||||||
|
),
|
||||||
'<div><script>1 < 2 & 3</script>': # escaping with cdata note that kepubify doesn't do this
|
'<div><script>1 < 2 & 3</script>': # escaping with cdata note that kepubify doesn't do this
|
||||||
'<div><script><![CDATA[1 < 2 & 3]]></script></div>',
|
'<div><script><![CDATA[1 < 2 & 3]]></script></div>',
|
||||||
|
|
||||||
@ -110,13 +144,7 @@ div#book-inner {{ margin-top: 0; margin-bottom: 0; }}</style><script type="text/
|
|||||||
f'div {{\n -{CSS_COMMENT_COOKIE}-widows: 12;\n color: red;\n}}</style>'
|
f'div {{\n -{CSS_COMMENT_COOKIE}-widows: 12;\n color: red;\n}}</style>'
|
||||||
'<span class="koboSpan" id="kobo.1.1">Some</span></div>'
|
'<span class="koboSpan" id="kobo.1.1">Some</span></div>'
|
||||||
}.items():
|
}.items():
|
||||||
opts = Options()._replace(remove_widows_and_orphans=True, remove_at_page_rules=True)
|
if isinstance(expected, str):
|
||||||
root = kepubify_html_data(src, KOBO_JS_NAME, opts)
|
expected = expected, expected
|
||||||
actual = serialize_html(root).decode('utf-8')
|
perform(src, expected[0], False)
|
||||||
actual = actual[len(prefix):-len(suffix)]
|
perform(src, expected[1], True)
|
||||||
self.assertEqual(expected, actual)
|
|
||||||
expected = serialize_html(parse(src)).decode('utf-8')
|
|
||||||
opts = opts._replace(for_removal=True)
|
|
||||||
kepubify_parsed_html(root, KOBO_JS_NAME, opts)
|
|
||||||
actual = serialize_html(root).decode('utf-8')
|
|
||||||
self.assertEqual(expected, actual)
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user