E-book viewer: Speed up first time open for EPUB files with lots of styling

The transformation of CSS is now in native code and an order of
magnitude faster. For typical novel length books the total time
for first load processing has been halved.
This commit is contained in:
Kovid Goyal 2021-04-06 13:20:15 +05:30
parent ac68f4550c
commit a62bd36fe4
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -12,17 +12,12 @@ from collections import defaultdict
from datetime import datetime from datetime import datetime
from functools import partial from functools import partial
from itertools import count from itertools import count
from math import ceil
from css_parser import replaceUrls
from css_parser.css import CSSRule
from lxml.etree import Comment from lxml.etree import Comment
from math import ceil
from calibre import detect_ncpus, force_unicode, prepare_string_for_xml from calibre import detect_ncpus, force_unicode, prepare_string_for_xml
from calibre.constants import iswindows from calibre.constants import iswindows
from calibre.customize.ui import plugin_for_input_format from calibre.customize.ui import plugin_for_input_format
from calibre.ebooks import parse_css_length
from calibre.ebooks.css_transform_rules import StyleDeclaration
from calibre.ebooks.oeb.base import ( from calibre.ebooks.oeb.base import (
OEB_DOCS, OEB_STYLES, OPF, XHTML, XHTML_NS, XLINK, XPath as _XPath, OEB_DOCS, OEB_STYLES, OPF, XHTML, XHTML_NS, XLINK, XPath as _XPath,
rewrite_links, urlunquote rewrite_links, urlunquote
@ -32,7 +27,7 @@ from calibre.ebooks.oeb.polish.container import Container as ContainerBase
from calibre.ebooks.oeb.polish.cover import ( from calibre.ebooks.oeb.polish.cover import (
find_cover_image, find_cover_image_in_page, find_cover_page find_cover_image, find_cover_image_in_page, find_cover_page
) )
from calibre.ebooks.oeb.polish.css import transform_inline_styles from calibre.ebooks.oeb.polish.pretty import pretty_script_or_style
from calibre.ebooks.oeb.polish.toc import from_xpaths, get_landmarks, get_toc from calibre.ebooks.oeb.polish.toc import from_xpaths, get_landmarks, get_toc
from calibre.ebooks.oeb.polish.utils import guess_type from calibre.ebooks.oeb.polish.utils import guess_type
from calibre.ptempfile import PersistentTemporaryDirectory from calibre.ptempfile import PersistentTemporaryDirectory
@ -46,13 +41,14 @@ from calibre.utils.serialize import (
json_dumps, json_loads, msgpack_dumps, msgpack_loads json_dumps, json_loads, msgpack_dumps, msgpack_loads
) )
from calibre.utils.short_uuid import uuid4 from calibre.utils.short_uuid import uuid4
from calibre_extensions import speedup
from calibre_extensions.fast_css_transform import transform_properties
from polyglot.binary import ( from polyglot.binary import (
as_base64_unicode as encode_component, from_base64_bytes, as_base64_unicode as encode_component, from_base64_bytes,
from_base64_unicode as decode_component from_base64_unicode as decode_component
) )
from polyglot.builtins import as_bytes, iteritems, map, unicode_type from polyglot.builtins import as_bytes, iteritems, map, unicode_type
from polyglot.urllib import quote, urlparse from polyglot.urllib import quote, urlparse
from calibre_extensions import speedup
RENDER_VERSION = 1 RENDER_VERSION = 1
@ -85,19 +81,6 @@ def decode_url(x):
return decode_component(parts[0]), (parts[1] if len(parts) > 1 else '') return decode_component(parts[0]), (parts[1] if len(parts) > 1 else '')
absolute_units = frozenset('px mm cm pt in pc q'.split())
length_factors = {'mm':2.8346456693, 'cm':28.346456693, 'in': 72, 'pc': 12, 'q':0.708661417325}
def convert_fontsize(length, unit, base_font_size=16.0, dpi=96.0):
' Convert font size to rem so that font size scaling works. Assumes the document has the specified base font size in px '
if unit == 'px':
return length/base_font_size
pt_to_px = dpi / 72.0
pt_to_rem = pt_to_px / base_font_size
return length * length_factors.get(unit, 1) * pt_to_rem
def create_link_replacer(container, link_uid, changed): def create_link_replacer(container, link_uid, changed):
resource_template = link_uid + '|{}|' resource_template = link_uid + '|{}|'
@ -134,65 +117,6 @@ def create_link_replacer(container, link_uid, changed):
return link_replacer return link_replacer
page_break_properties = ('page-break-before', 'page-break-after', 'page-break-inside')
absolute_font_sizes = {
'xx-small': '0.5rem', 'x-small': '0.625rem', 'small': '0.8rem',
'medium': '1rem',
'large': '1.125rem', 'x-large': '1.5rem', 'xx-large': '2rem', 'xxx-large': '2.55rem'
}
nonstandard_writing_mode_property_names = ('-webkit-writing-mode', '-epub-writing-mode')
def transform_declaration(decl):
decl = StyleDeclaration(decl)
changed = False
nonstandard_writing_mode_props = {}
standard_writing_mode_props = {}
for prop, parent_prop in tuple(decl):
if prop.name in page_break_properties:
changed = True
name = prop.name.partition('-')[2]
for prefix in ('', '-webkit-column-'):
# Note that Firefox does not support break-after at all
# https://bugzil.la/549114
decl.set_property(prefix + name, prop.value, prop.priority)
decl.remove_property(prop, parent_prop)
elif prop.name == 'font-size':
raw = prop.value
afs = absolute_font_sizes.get(raw)
if afs is not None:
changed = True
decl.change_property(prop, parent_prop, afs)
continue
l, unit = parse_css_length(raw)
if unit in absolute_units:
changed = True
l = convert_fontsize(l, unit)
decl.change_property(prop, parent_prop, unicode_type(l) + 'rem')
elif prop.name in nonstandard_writing_mode_property_names:
nonstandard_writing_mode_props[prop.value] = prop.priority
elif prop.name == 'writing-mode':
standard_writing_mode_props[prop.value] = True
# Add standard writing-mode properties if they don't exist so that
# all of the browsers supported by the viewer work in vertical modes
for value, priority in nonstandard_writing_mode_props.items():
if value not in standard_writing_mode_props:
decl.set_property('writing-mode', value, priority)
changed = True
return changed
def transform_sheet(sheet):
changed = False
for rule in sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE):
if transform_declaration(rule.style):
changed = True
return changed
def check_for_maths(root): def check_for_maths(root):
for x in root.iterdescendants('{*}math'): for x in root.iterdescendants('{*}math'):
return True return True
@ -361,27 +285,25 @@ def create_cover_page(container, input_fmt, is_comic, book_metadata=None):
def transform_style_sheet(container, name, link_uid, virtualize_resources, virtualized_names): def transform_style_sheet(container, name, link_uid, virtualize_resources, virtualized_names):
changed = False changed = False
sheet = container.parsed(name) link_replacer = None
if virtualize_resources: if virtualize_resources:
changed_names = set() changed_names = set()
link_replacer = create_link_replacer(container, link_uid, changed_names) link_replacer = partial(create_link_replacer(container, link_uid, changed_names), name)
replaceUrls(sheet, partial(link_replacer, name))
if name in changed_names: if name in changed_names:
changed = True changed = True
virtualized_names.add(name) virtualized_names.add(name)
if transform_sheet(sheet): raw = container.raw_data(name, decode=True)
nraw = transform_properties(raw, is_declaration=False, url_callback=link_replacer)
if nraw != raw:
changed = True changed = True
if changed: raw = nraw
raw = container.serialize_item(name)
else:
raw = container.raw_data(name, decode=False)
raw = raw.lstrip() raw = raw.lstrip()
if not raw.startswith(b'@charset'): if not raw.startswith('@charset'):
raw = b'@charset "UTF-8";\n' + raw raw = '@charset "UTF-8";\n' + raw
changed = True changed = True
if changed: if changed:
with container.open(name, 'wb') as f: with container.open(name, 'wb') as f:
f.write(raw) f.write(raw.encode('utf-8'))
def transform_svg_image(container, name, link_uid, virtualize_resources, virtualized_names): def transform_svg_image(container, name, link_uid, virtualize_resources, virtualized_names):
@ -402,6 +324,26 @@ def transform_svg_image(container, name, link_uid, virtualize_resources, virtual
container.commit_item(name) container.commit_item(name)
def transform_inline_styles(container, name, transform_sheet, transform_style):
root = container.parsed(name)
changed = False
for style in root.xpath('//*[local-name()="style"]'):
if style.text and (style.get('type') or 'text/css').lower() == 'text/css':
nraw = transform_sheet(style.text)
if nraw != style.text:
changed = True
style.text = nraw
pretty_script_or_style(container, style)
for elem in root.xpath('//*[@style]'):
text = elem.get('style', None)
if text:
ntext = transform_style(text)
if ntext != text:
changed = True
elem.set('style', ntext)
return changed
def transform_html(container, name, virtualize_resources, link_uid, link_to_map, virtualized_names): def transform_html(container, name, virtualize_resources, link_uid, link_to_map, virtualized_names):
link_xpath = XPath('//h:a[@href]') link_xpath = XPath('//h:a[@href]')
svg_link_xpath = XPath('//svg:a') svg_link_xpath = XPath('//svg:a')
@ -425,17 +367,21 @@ def transform_html(container, name, virtualize_resources, link_uid, link_to_map,
if ltype != 'text/css' or rel != 'stylesheet': if ltype != 'text/css' or rel != 'stylesheet':
link.attrib.clear() link.attrib.clear()
def transform_and_virtualize_sheet(sheet): # URLs in the inline CSS will be replaced in virtualize_html
changed = transform_sheet(sheet) def transform_sheet(sheet_text):
if virtualize_resources: ans = transform_properties(sheet_text, is_declaration=False)
replaceUrls(sheet, partial(link_replacer, name)) if name in changed_names:
if name in changed_names: virtualized_names.add(name)
virtualized_names.add(name) return ans
changed = True
return changed def transform_declaration(decl_text):
ans = transform_properties(decl_text, is_declaration=True)
if name in changed_names:
virtualized_names.add(name)
return ans
# Transform <style> and style="" # Transform <style> and style=""
transform_inline_styles(container, name, transform_sheet=transform_and_virtualize_sheet, transform_style=transform_declaration) transform_inline_styles(container, name, transform_sheet=transform_sheet, transform_style=transform_declaration)
if virtualize_resources: if virtualize_resources:
virtualize_html(container, name, link_uid, link_to_map, virtualized_names) virtualize_html(container, name, link_uid, link_to_map, virtualized_names)
@ -835,8 +781,8 @@ def render(pathtoebook, output_dir, book_hash=None, serialize_metadata=False, ex
with RenderManager(max_workers) as render_manager: with RenderManager(max_workers) as render_manager:
mi = None mi = None
if serialize_metadata: if serialize_metadata:
from calibre.ebooks.metadata.meta import get_metadata
from calibre.customize.ui import quick_metadata from calibre.customize.ui import quick_metadata
from calibre.ebooks.metadata.meta import get_metadata
with lopen(pathtoebook, 'rb') as f, quick_metadata: with lopen(pathtoebook, 'rb') as f, quick_metadata:
mi = get_metadata(f, os.path.splitext(pathtoebook)[1][1:].lower()) mi = get_metadata(f, os.path.splitext(pathtoebook)[1][1:].lower())
book_fmt, opfpath, input_fmt = extract_book(pathtoebook, output_dir, log=default_log) book_fmt, opfpath, input_fmt = extract_book(pathtoebook, output_dir, log=default_log)
@ -915,10 +861,12 @@ def develop():
from calibre.ptempfile import TemporaryDirectory from calibre.ptempfile import TemporaryDirectory
path = sys.argv[-1] path = sys.argv[-1]
with TemporaryDirectory() as tdir: with TemporaryDirectory() as tdir:
return render( render(
path, tdir, serialize_metadata=True, path, tdir, serialize_metadata=True,
extract_annotations=True, virtualize_resources=False, max_workers=1 extract_annotations=True, virtualize_resources=True, max_workers=1
) )
print('Extracted to:', tdir)
input('Press Enter to quit')
if __name__ == '__main__': if __name__ == '__main__':