mirror of
				https://github.com/kovidgoyal/calibre.git
				synced 2025-11-04 03:27:00 -05:00 
			
		
		
		
	More work on kepubify
This commit is contained in:
		
							parent
							
								
									f5ff33a926
								
							
						
					
					
						commit
						41b0ca1461
					
				@ -12,13 +12,24 @@
 | 
			
		||||
#     * Cover marking in the OPF
 | 
			
		||||
#     * Markup cleanup (remove various things that trip up the Kobo renderer)
 | 
			
		||||
 | 
			
		||||
import re
 | 
			
		||||
 | 
			
		||||
from calibre.ebooks.oeb.base import XHTML, XPath
 | 
			
		||||
from calibre.ebooks.oeb.parse_utils import merge_multiple_html_heads_and_bodies
 | 
			
		||||
from calibre.ebooks.oeb.parse_utils import barename, merge_multiple_html_heads_and_bodies
 | 
			
		||||
from calibre.ebooks.oeb.polish.tts import lang_for_elem
 | 
			
		||||
from calibre.ebooks.oeb.polish.utils import extract, insert_self_closing
 | 
			
		||||
from calibre.spell.break_iterator import sentence_positions
 | 
			
		||||
from calibre.utils.localization import canonicalize_lang, get_lang
 | 
			
		||||
 | 
			
		||||
KOBO_STYLE_HACKS = 'kobostylehacks'
 | 
			
		||||
OUTER_DIV_ID = 'book-columns'
 | 
			
		||||
INNER_DIV_ID = 'book-inner'
 | 
			
		||||
SKIPPED_TAGS = frozenset((
 | 
			
		||||
    'script', 'style', 'atom', 'pre', 'audio', 'video', 'svg', 'math'
 | 
			
		||||
))
 | 
			
		||||
BLOCK_TAGS = frozenset((
 | 
			
		||||
    'p', 'ol', 'ul', 'table', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
 | 
			
		||||
))
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def add_style(root, css='div#book-inner { margin-top: 0; margin-bottom: 0; }', cls=KOBO_STYLE_HACKS) -> bool:
 | 
			
		||||
@ -44,7 +55,7 @@ def remove_kobo_styles(root):
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def wrap_body_contents(body):
 | 
			
		||||
    # To match official KEPUBs. Kobo wraps the body with two div tags,
 | 
			
		||||
    # Kobo wraps the body with two div tags,
 | 
			
		||||
    # div#book-columns > div#book-inner to provide a target for applying
 | 
			
		||||
    # pagination styles.
 | 
			
		||||
    for elem in XPath(f'//*[@id="{OUTER_DIV_ID}" or @id={INNER_DIV_ID}]')(body):
 | 
			
		||||
@ -58,6 +69,7 @@ def wrap_body_contents(body):
 | 
			
		||||
        inner.append(child)
 | 
			
		||||
    del body[:]
 | 
			
		||||
    body.append(outer)
 | 
			
		||||
    return inner
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def unwrap_body_contents(body):
 | 
			
		||||
@ -72,10 +84,71 @@ def unwrap_body_contents(body):
 | 
			
		||||
    body.text = text
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def add_kobo_markup_to_html(root):
 | 
			
		||||
def add_kobo_spans(inner, root_lang):
 | 
			
		||||
    stack = []
 | 
			
		||||
    a, p = stack.append, stack.pop
 | 
			
		||||
    a((inner, None, barename(inner.tag).lower(), lang_for_elem(inner, root_lang)))
 | 
			
		||||
    paranum, segnum = 0, 0
 | 
			
		||||
    increment_next_para = True
 | 
			
		||||
    span_tag_name = XHTML('span')
 | 
			
		||||
    leading_whitespace_pat = re.compile(r'^\s+')
 | 
			
		||||
 | 
			
		||||
    def kobo_span(parent):
 | 
			
		||||
        nonlocal paranum, segnum
 | 
			
		||||
        segnum += 1
 | 
			
		||||
        return parent.makeelement(span_tag_name, attrib={'class': 'koboSpan', 'id': f'kobo.{paranum}.{segnum}'})
 | 
			
		||||
 | 
			
		||||
    def wrap_text_in_spans(text: str, parent, at: int, lang: str) -> str | None:
 | 
			
		||||
        nonlocal increment_next_para, paranum, segnum
 | 
			
		||||
        if increment_next_para:
 | 
			
		||||
            paranum += 1
 | 
			
		||||
            segnum = 0
 | 
			
		||||
            increment_next_para = False
 | 
			
		||||
        stripped = leading_whitespace_pat.sub('', text)
 | 
			
		||||
        ws = None
 | 
			
		||||
        if num := len(text) - len(stripped):
 | 
			
		||||
            ws = text[:num]
 | 
			
		||||
        if at:
 | 
			
		||||
            parent[at-1].tail = ws
 | 
			
		||||
        else:
 | 
			
		||||
            parent.text = ws
 | 
			
		||||
        for pos, sz in sentence_positions(stripped, lang):
 | 
			
		||||
            s = kobo_span(parent)
 | 
			
		||||
            s.text = stripped[pos:pos+sz]
 | 
			
		||||
            parent.insert(at, s)
 | 
			
		||||
 | 
			
		||||
    while stack:
 | 
			
		||||
        node, parent, tagname, node_lang = p()
 | 
			
		||||
        if parent is not None:
 | 
			
		||||
            wrap_text_in_spans(node, parent, tagname, node_lang)
 | 
			
		||||
            continue
 | 
			
		||||
        if not increment_next_para and tagname in BLOCK_TAGS:
 | 
			
		||||
            increment_next_para = True
 | 
			
		||||
        if node.text:
 | 
			
		||||
            wrap_text_in_spans(node.text, node, 0, node_lang)
 | 
			
		||||
        for i, child in enumerate(reversed(node)):
 | 
			
		||||
            i = len(node) - 1 - i
 | 
			
		||||
            if child.tail:
 | 
			
		||||
                a((child.tail, node, i + 1, node_lang))
 | 
			
		||||
            if isinstance(child.tag, 'str'):
 | 
			
		||||
                child_name = barename(child.tag).lower()
 | 
			
		||||
                if child_name == 'img':
 | 
			
		||||
                    increment_next_para = False
 | 
			
		||||
                    paranum += 1
 | 
			
		||||
                    segnum = 0
 | 
			
		||||
                    w = kobo_span(node)
 | 
			
		||||
                    w.append(child)
 | 
			
		||||
                    node[i] = w
 | 
			
		||||
                elif child_name not in SKIPPED_TAGS:
 | 
			
		||||
                    a((child, None, child_name, lang_for_elem(child, node_lang)))
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def add_kobo_markup_to_html(root, metadata_lang):
 | 
			
		||||
    root_lang = canonicalize_lang(lang_for_elem(root, canonicalize_lang(metadata_lang or get_lang())) or 'en')
 | 
			
		||||
    add_style(root)
 | 
			
		||||
    for body in XPath('./h:body')(root):
 | 
			
		||||
        wrap_body_contents(body)
 | 
			
		||||
        inner = wrap_body_contents(body)
 | 
			
		||||
        add_kobo_spans(inner, lang_for_elem(body, root_lang))
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def remove_kobo_markup_from_html(root):
 | 
			
		||||
@ -84,7 +157,11 @@ def remove_kobo_markup_from_html(root):
 | 
			
		||||
        unwrap_body_contents(body)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def kepubify_html(root):
 | 
			
		||||
def kepubify_html(root, metadata_lang='en'):
 | 
			
		||||
    remove_kobo_markup_from_html(root)
 | 
			
		||||
    merge_multiple_html_heads_and_bodies(root)
 | 
			
		||||
    add_kobo_markup_to_html(root)
 | 
			
		||||
    add_kobo_markup_to_html(root, metadata_lang)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def kepubify(container):
 | 
			
		||||
    lang = container.mi.language
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										19
									
								
								src/calibre/ebooks/oeb/polish/tests/kepubify.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										19
									
								
								src/calibre/ebooks/oeb/polish/tests/kepubify.py
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,19 @@
 | 
			
		||||
#!/usr/bin/env python
 | 
			
		||||
# License: GPLv3 Copyright: 2025, Kovid Goyal <kovid at kovidgoyal.net>
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
from calibre.ebooks.oeb.base import serialize
 | 
			
		||||
from calibre.ebooks.oeb.polish.kepubify import kepubify_html
 | 
			
		||||
from calibre.ebooks.oeb.polish.parsing import parse_html5 as parse
 | 
			
		||||
from calibre.ebooks.oeb.polish.tests.base import BaseTest
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class KepubifyTests(BaseTest):
 | 
			
		||||
 | 
			
		||||
    def test_kepubify_html(self):
 | 
			
		||||
        for src, expected in {
 | 
			
		||||
        }.items():
 | 
			
		||||
            root = parse(src)
 | 
			
		||||
            kepubify_html(root)
 | 
			
		||||
            actual = serialize(root, 'text/html').decode('utf-8')
 | 
			
		||||
            self.assertEqual(expected, actual)
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user