More work on kepubify

This commit is contained in:
Kovid Goyal 2025-02-21 10:27:40 +05:30
parent 9c5d1c0f4f
commit e23dcffe43
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -12,24 +12,30 @@
# * Cover marking in the OPF # * Cover marking in the OPF
# * Markup cleanup (remove various things that trip up the Kobo renderer) # * Markup cleanup (remove various things that trip up the Kobo renderer)
import os
import re import re
import sys
from concurrent.futures import ThreadPoolExecutor
from lxml import etree from lxml import etree
from calibre.ebooks.metadata import authors_to_string from calibre.ebooks.metadata import authors_to_string
from calibre.ebooks.oeb.base import XHTML, XPath, escape_cdata from calibre.ebooks.oeb.base import OEB_DOCS, XHTML, XPath, escape_cdata
from calibre.ebooks.oeb.parse_utils import barename, merge_multiple_html_heads_and_bodies from calibre.ebooks.oeb.parse_utils import barename, merge_multiple_html_heads_and_bodies
from calibre.ebooks.oeb.polish.container import get_container
from calibre.ebooks.oeb.polish.cover import find_cover_image, find_cover_image3, find_cover_page from calibre.ebooks.oeb.polish.cover import find_cover_image, find_cover_image3, find_cover_page
from calibre.ebooks.oeb.polish.parsing import parse from calibre.ebooks.oeb.polish.parsing import parse
from calibre.ebooks.oeb.polish.tts import lang_for_elem from calibre.ebooks.oeb.polish.tts import lang_for_elem
from calibre.ebooks.oeb.polish.utils import extract, insert_self_closing from calibre.ebooks.oeb.polish.utils import extract, insert_self_closing
from calibre.spell.break_iterator import sentence_positions from calibre.spell.break_iterator import sentence_positions
from calibre.srv.render_book import Profiler, calculate_number_of_workers
from calibre.utils.localization import canonicalize_lang, get_lang from calibre.utils.localization import canonicalize_lang, get_lang
KOBO_CSS_CLASS = 'kobostylehacks' KOBO_CSS_CLASS = 'kobostylehacks'
OUTER_DIV_ID = 'book-columns' OUTER_DIV_ID = 'book-columns'
INNER_DIV_ID = 'book-inner' INNER_DIV_ID = 'book-inner'
KOBO_SPAN_CLASS = 'koboSpan' KOBO_SPAN_CLASS = 'koboSpan'
DUMMY_TITLE_PAGE_NAME = 'kobo-title-page-generated-by-calibre'
SKIPPED_TAGS = frozenset(( SKIPPED_TAGS = frozenset((
'', 'script', 'style', 'atom', 'pre', 'audio', 'video', 'svg', 'math' '', 'script', 'style', 'atom', 'pre', 'audio', 'video', 'svg', 'math'
)) ))
@ -224,6 +230,16 @@ def kepubify_html_data(raw: str | bytes, metadata_lang: str = 'en'):
return root return root
def kepubify_html_path(path: str, metadata_lang: str = 'en'):
    '''
    Kepubify the HTML file at ``path`` in place.

    The file is read as bytes, run through kepubify_html_data() and
    serialize_html(), and the result is written back over the original
    contents of the same file.

    :param path: Filesystem path of the HTML file to rewrite.
    :param metadata_lang: Language passed on to kepubify_html_data() as its
        ``metadata_lang`` argument.
    '''
    with open(path, 'r+b') as f:
        raw = f.read()
        # Forward the caller supplied language. Without this the parameter
        # is silently ignored and kepubify_html_data() falls back to 'en'.
        root = kepubify_html_data(raw, metadata_lang)
        raw = serialize_html(root)
        # Rewind and truncate so that no stale bytes survive when the new
        # content is shorter than the old.
        f.seek(0)
        f.truncate()
        f.write(raw)
def is_probably_a_title_page(root): def is_probably_a_title_page(root):
for title in XPath('//h:title')(root): for title in XPath('//h:title')(root):
if title.text: if title.text:
@ -259,7 +275,7 @@ def add_dummy_title_page(container, cover_image_name):
__CONTENT__ __CONTENT__
</div></div></body></html> </div></div></body></html>
''' '''
titlepage_name = container.add_file('kobo-title-page-generated-by-calibre.html', modify_name_if_needed=True) titlepage_name = container.add_file(f'{DUMMY_TITLE_PAGE_NAME}.html', modify_name_if_needed=True)
if cover_image_name: if cover_image_name:
cover_href = container.name_to_href(cover_image_name, titlepage_name) cover_href = container.name_to_href(cover_image_name, titlepage_name)
html = html.replace('__CONTENT__', f'<img src="{cover_href}" alt="cover" style="height: 100%" />') html = html.replace('__CONTENT__', f'<img src="{cover_href}" alt="cover" style="height: 100%" />')
@ -272,6 +288,15 @@ def add_dummy_title_page(container, cover_image_name):
''') ''')
with container.open(titlepage_name, 'w') as f: with container.open(titlepage_name, 'w') as f:
f.write(html) f.write(html)
container.apply_unique_properties(titlepage_name, 'calibre:title-page')
def remove_dummy_title_page(container):
    '''
    Remove a previously generated dummy title page from the book, if present.

    The spine is walked in order and the first linear item whose name
    contains DUMMY_TITLE_PAGE_NAME is removed from the container, after
    which the search stops.

    :param container: The book container to modify.
    '''
    # Container.spine_names is a property that yields (name, is_linear)
    # pairs, so it must be iterated directly rather than called.
    for name, is_linear in container.spine_names:
        if is_linear and DUMMY_TITLE_PAGE_NAME in name:
            container.remove_item(name)
            break
def first_spine_item_is_probably_cover(container) -> bool: def first_spine_item_is_probably_cover(container) -> bool:
@ -285,10 +310,55 @@ def first_spine_item_is_probably_cover(container) -> bool:
return False return False
def kepubify_container(container, max_workers=0):
    '''
    Apply the kepub transformations to every HTML document in ``container``.

    Any dummy title page left by a previous run is removed first, the cover
    image (if one is found) is marked with the ``cover-image`` property and a
    dummy title page is added when neither a cover page nor a cover-like
    first spine item is found. Afterwards each file whose media type is in
    OEB_DOCS is rewritten in place via kepubify_html_path().

    :param container: The book container to modify.
    :param max_workers: Passed to calculate_number_of_workers() to decide how
        many threads to use. Fewer than two workers means the files are
        processed one after another in the calling thread.
    '''
    remove_dummy_title_page(container)
    metadata_lang = container.mi.language

    cover_image_name = find_cover_image(container) or find_cover_image3(container)
    if cover_image_name:
        container.apply_unique_properties(cover_image_name, 'cover-image')
    if not (find_cover_page(container) or first_spine_item_is_probably_cover(container)):
        add_dummy_title_page(container, cover_image_name)

    # Collected only after the title page handling above, so a freshly
    # generated title page is part of the work list as well.
    html_names = []
    for name, media_type in container.mime_map.items():
        if media_type in OEB_DOCS:
            html_names.append(name)
    html_names = tuple(html_names)

    worker_count = calculate_number_of_workers(html_names, container, max_workers)
    abspaths = tuple(container.name_to_abspath(name) for name in html_names)

    if worker_count >= 2:
        with ThreadPoolExecutor(max_workers=worker_count) as pool:
            pending = [pool.submit(kepubify_html_path, p, metadata_lang) for p in abspaths]
            # Calling result() re-raises any exception from the worker thread
            for job in pending:
                job.result()
        return

    for p in abspaths:
        kepubify_html_path(p, metadata_lang)
def profile():
    '''
    Developer entry point: run main() under Profiler.

    The book to process is taken from the last command line argument and a
    single worker is used.
    '''
    from calibre.ptempfile import TemporaryDirectory
    book_path = sys.argv[-1]
    with TemporaryDirectory():
        with Profiler():
            main(book_path, max_workers=1)
def develop():
    '''
    Developer entry point: kepubify a book and unpack the result for inspection.

    The book given as the last command line argument is converted using a
    single worker, the output is extracted into a temporary directory and
    the function then waits for Enter to be pressed before returning.
    '''
    from zipfile import ZipFile

    from calibre.ptempfile import TemporaryDirectory
    book_path = sys.argv[-1]
    with TemporaryDirectory() as tdir:
        kepub_path = main(book_path, max_workers=1)
        with ZipFile(kepub_path) as archive:
            archive.extractall(tdir)
        print('Extracted to:', tdir)
        # Block inside the with statement so the extracted files remain
        # available until the user is done looking at them.
        input('Press Enter to quit')
def main(path, max_workers=0):
    '''
    Kepubify the book at ``path`` and save it with a ``.kepub`` extension.

    :param path: Path of the input book, opened with get_container().
    :param max_workers: Forwarded to kepubify_container().
    :return: Path of the written file, which is ``path`` with its extension
        replaced by ``.kepub``.
    '''
    book = get_container(path, tweak_mode=True)
    kepubify_container(book, max_workers=max_workers)
    outpath = os.path.splitext(path)[0] + '.kepub'
    book.commit(output=outpath)
    return outpath
# When run as a script, treat the last command line argument as the book to convert
if __name__ == '__main__':
    input_path = sys.argv[-1]
    main(input_path)