mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
More work on kepubify
This commit is contained in:
parent
f5ff33a926
commit
41b0ca1461
@ -12,16 +12,27 @@
|
|||||||
# * Cover marking in the OPF
|
# * Cover marking in the OPF
|
||||||
# * Markup cleanup (remove various things that trip up the Kobo renderer)
|
# * Markup cleanup (remove various things that trip up the Kobo renderer)
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
from calibre.ebooks.oeb.base import XHTML, XPath
|
from calibre.ebooks.oeb.base import XHTML, XPath
|
||||||
from calibre.ebooks.oeb.parse_utils import merge_multiple_html_heads_and_bodies
|
from calibre.ebooks.oeb.parse_utils import barename, merge_multiple_html_heads_and_bodies
|
||||||
|
from calibre.ebooks.oeb.polish.tts import lang_for_elem
|
||||||
from calibre.ebooks.oeb.polish.utils import extract, insert_self_closing
|
from calibre.ebooks.oeb.polish.utils import extract, insert_self_closing
|
||||||
|
from calibre.spell.break_iterator import sentence_positions
|
||||||
|
from calibre.utils.localization import canonicalize_lang, get_lang
|
||||||
|
|
||||||
KOBO_STYLE_HACKS = 'kobostylehacks'
|
KOBO_STYLE_HACKS = 'kobostylehacks'
|
||||||
OUTER_DIV_ID = 'book-columns'
|
OUTER_DIV_ID = 'book-columns'
|
||||||
INNER_DIV_ID = 'book-inner'
|
INNER_DIV_ID = 'book-inner'
|
||||||
|
SKIPPED_TAGS = frozenset((
|
||||||
|
'script', 'style', 'atom', 'pre', 'audio', 'video', 'svg', 'math'
|
||||||
|
))
|
||||||
|
BLOCK_TAGS = frozenset((
|
||||||
|
'p', 'ol', 'ul', 'table', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
|
||||||
|
))
|
||||||
|
|
||||||
|
|
||||||
def add_style(root, css='div#book-inner { margin-top: 0; margin-bottom: 0;}', cls=KOBO_STYLE_HACKS) -> bool:
|
def add_style(root, css='div#book-inner { margin-top: 0; margin-bottom: 0; }', cls=KOBO_STYLE_HACKS) -> bool:
|
||||||
|
|
||||||
def add(parent):
|
def add(parent):
|
||||||
e = parent.makeelement(XHTML('style'), type='text/css')
|
e = parent.makeelement(XHTML('style'), type='text/css')
|
||||||
@ -44,7 +55,7 @@ def remove_kobo_styles(root):
|
|||||||
|
|
||||||
|
|
||||||
def wrap_body_contents(body):
|
def wrap_body_contents(body):
|
||||||
# To match official KEPUBs. Kobo wraps the body with two div tags,
|
# Kobo wraps the body with two div tags,
|
||||||
# div#book-columns > div#book-inner to provide a target for applying
|
# div#book-columns > div#book-inner to provide a target for applying
|
||||||
# pagination styles.
|
# pagination styles.
|
||||||
for elem in XPath(f'//*[@id="{OUTER_DIV_ID}" or @id={INNER_DIV_ID}]')(body):
|
for elem in XPath(f'//*[@id="{OUTER_DIV_ID}" or @id={INNER_DIV_ID}]')(body):
|
||||||
@ -58,6 +69,7 @@ def wrap_body_contents(body):
|
|||||||
inner.append(child)
|
inner.append(child)
|
||||||
del body[:]
|
del body[:]
|
||||||
body.append(outer)
|
body.append(outer)
|
||||||
|
return inner
|
||||||
|
|
||||||
|
|
||||||
def unwrap_body_contents(body):
|
def unwrap_body_contents(body):
|
||||||
@ -72,10 +84,71 @@ def unwrap_body_contents(body):
|
|||||||
body.text = text
|
body.text = text
|
||||||
|
|
||||||
|
|
||||||
def add_kobo_markup_to_html(root):
|
def add_kobo_spans(inner, root_lang):
|
||||||
|
stack = []
|
||||||
|
a, p = stack.append, stack.pop
|
||||||
|
a((inner, None, barename(inner.tag).lower(), lang_for_elem(inner, root_lang)))
|
||||||
|
paranum, segnum = 0, 0
|
||||||
|
increment_next_para = True
|
||||||
|
span_tag_name = XHTML('span')
|
||||||
|
leading_whitespace_pat = re.compile(r'^\s+')
|
||||||
|
|
||||||
|
def kobo_span(parent):
|
||||||
|
nonlocal paranum, segnum
|
||||||
|
segnum += 1
|
||||||
|
return parent.makeelement(span_tag_name, attrib={'class': 'koboSpan', 'id': f'kobo.{paranum}.{segnum}'})
|
||||||
|
|
||||||
|
def wrap_text_in_spans(text: str, parent, at: int, lang: str) -> str | None:
|
||||||
|
nonlocal increment_next_para, paranum, segnum
|
||||||
|
if increment_next_para:
|
||||||
|
paranum += 1
|
||||||
|
segnum = 0
|
||||||
|
increment_next_para = False
|
||||||
|
stripped = leading_whitespace_pat.sub('', text)
|
||||||
|
ws = None
|
||||||
|
if num := len(text) - len(stripped):
|
||||||
|
ws = text[:num]
|
||||||
|
if at:
|
||||||
|
parent[at-1].tail = ws
|
||||||
|
else:
|
||||||
|
parent.text = ws
|
||||||
|
for pos, sz in sentence_positions(stripped, lang):
|
||||||
|
s = kobo_span(parent)
|
||||||
|
s.text = stripped[pos:pos+sz]
|
||||||
|
parent.insert(at, s)
|
||||||
|
|
||||||
|
while stack:
|
||||||
|
node, parent, tagname, node_lang = p()
|
||||||
|
if parent is not None:
|
||||||
|
wrap_text_in_spans(node, parent, tagname, node_lang)
|
||||||
|
continue
|
||||||
|
if not increment_next_para and tagname in BLOCK_TAGS:
|
||||||
|
increment_next_para = True
|
||||||
|
if node.text:
|
||||||
|
wrap_text_in_spans(node.text, node, 0, node_lang)
|
||||||
|
for i, child in enumerate(reversed(node)):
|
||||||
|
i = len(node) - 1 - i
|
||||||
|
if child.tail:
|
||||||
|
a((child.tail, node, i + 1, node_lang))
|
||||||
|
if isinstance(child.tag, 'str'):
|
||||||
|
child_name = barename(child.tag).lower()
|
||||||
|
if child_name == 'img':
|
||||||
|
increment_next_para = False
|
||||||
|
paranum += 1
|
||||||
|
segnum = 0
|
||||||
|
w = kobo_span(node)
|
||||||
|
w.append(child)
|
||||||
|
node[i] = w
|
||||||
|
elif child_name not in SKIPPED_TAGS:
|
||||||
|
a((child, None, child_name, lang_for_elem(child, node_lang)))
|
||||||
|
|
||||||
|
|
||||||
|
def add_kobo_markup_to_html(root, metadata_lang):
|
||||||
|
root_lang = canonicalize_lang(lang_for_elem(root, canonicalize_lang(metadata_lang or get_lang())) or 'en')
|
||||||
add_style(root)
|
add_style(root)
|
||||||
for body in XPath('./h:body')(root):
|
for body in XPath('./h:body')(root):
|
||||||
wrap_body_contents(body)
|
inner = wrap_body_contents(body)
|
||||||
|
add_kobo_spans(inner, lang_for_elem(body, root_lang))
|
||||||
|
|
||||||
|
|
||||||
def remove_kobo_markup_from_html(root):
|
def remove_kobo_markup_from_html(root):
|
||||||
@ -84,7 +157,11 @@ def remove_kobo_markup_from_html(root):
|
|||||||
unwrap_body_contents(body)
|
unwrap_body_contents(body)
|
||||||
|
|
||||||
|
|
||||||
def kepubify_html(root):
|
def kepubify_html(root, metadata_lang='en'):
|
||||||
remove_kobo_markup_from_html(root)
|
remove_kobo_markup_from_html(root)
|
||||||
merge_multiple_html_heads_and_bodies(root)
|
merge_multiple_html_heads_and_bodies(root)
|
||||||
add_kobo_markup_to_html(root)
|
add_kobo_markup_to_html(root, metadata_lang)
|
||||||
|
|
||||||
|
|
||||||
|
def kepubify(container):
|
||||||
|
lang = container.mi.language
|
||||||
|
19
src/calibre/ebooks/oeb/polish/tests/kepubify.py
Normal file
19
src/calibre/ebooks/oeb/polish/tests/kepubify.py
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# License: GPLv3 Copyright: 2025, Kovid Goyal <kovid at kovidgoyal.net>
|
||||||
|
|
||||||
|
|
||||||
|
from calibre.ebooks.oeb.base import serialize
|
||||||
|
from calibre.ebooks.oeb.polish.kepubify import kepubify_html
|
||||||
|
from calibre.ebooks.oeb.polish.parsing import parse_html5 as parse
|
||||||
|
from calibre.ebooks.oeb.polish.tests.base import BaseTest
|
||||||
|
|
||||||
|
|
||||||
|
class KepubifyTests(BaseTest):
|
||||||
|
|
||||||
|
def test_kepubify_html(self):
|
||||||
|
for src, expected in {
|
||||||
|
}.items():
|
||||||
|
root = parse(src)
|
||||||
|
kepubify_html(root)
|
||||||
|
actual = serialize(root, 'text/html').decode('utf-8')
|
||||||
|
self.assertEqual(expected, actual)
|
Loading…
x
Reference in New Issue
Block a user