Viewer: Run the first read book preparation in parallel

This commit is contained in:
Kovid Goyal 2019-10-21 16:20:00 +05:30
parent 243144f3b9
commit 75dd89722a
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -7,22 +7,26 @@ from __future__ import absolute_import, division, print_function, unicode_litera
import json
import os
import re
import shutil
import sys
import time
from collections import defaultdict
from datetime import datetime
from functools import partial
from itertools import count
from math import ceil
from css_parser import replaceUrls
from css_parser.css import CSSRule
from calibre import force_unicode, prepare_string_for_xml
from calibre import detect_ncpus, force_unicode, prepare_string_for_xml
from calibre.constants import iswindows
from calibre.customize.ui import plugin_for_input_format
from calibre.ebooks import parse_css_length
from calibre.ebooks.css_transform_rules import StyleDeclaration
from calibre.ebooks.oeb.base import (
EPUB_NS, OEB_DOCS, OEB_STYLES, OPF, XHTML, XHTML_NS, XLINK, XPath as _XPath, rewrite_links,
urlunquote
EPUB_NS, OEB_DOCS, OEB_STYLES, OPF, XHTML, XHTML_NS, XLINK, XPath as _XPath,
rewrite_links, urlunquote
)
from calibre.ebooks.oeb.iterator.book import extract_book
from calibre.ebooks.oeb.polish.container import Container as ContainerBase
@ -31,9 +35,12 @@ from calibre.ebooks.oeb.polish.cover import (
)
from calibre.ebooks.oeb.polish.css import transform_inline_styles
from calibre.ebooks.oeb.polish.toc import from_xpaths, get_landmarks, get_toc
from calibre.ebooks.oeb.polish.utils import extract, guess_type
from calibre.ebooks.oeb.polish.utils import guess_type
from calibre.ptempfile import PersistentTemporaryDirectory
from calibre.srv.metadata import encode_datetime
from calibre.srv.opts import grouper
from calibre.utils.date import EPOCH
from calibre.utils.ipc.simple_worker import start_pipe_worker
from calibre.utils.iso8601 import parse_iso8601
from calibre.utils.logging import default_log
from calibre.utils.serialize import json_loads
@ -42,7 +49,9 @@ from polyglot.binary import (
as_base64_unicode as encode_component, from_base64_bytes,
from_base64_unicode as decode_component
)
from polyglot.builtins import is_py3, iteritems, map, unicode_type
from polyglot.builtins import (
as_bytes, is_py3, iteritems, itervalues, map, unicode_type
)
from polyglot.urllib import quote, urlparse
RENDER_VERSION = 1
@ -220,297 +229,429 @@ def toc_anchor_map(toc):
return dict(ans)
class Container(ContainerBase):
def serialize_parsed_html(root):
return as_bytes(json.dumps(html_as_dict(root), ensure_ascii=False, separators=(',', ':')))
class SimpleContainer(ContainerBase):
tweak_mode = True
def __init__(
self, book_fmt, opfpath, input_fmt, tdir, log=None, book_hash=None, save_bookmark_data=False,
book_metadata=None, allow_no_cover=True, virtualize_resources=True
):
log = log or default_log
self.allow_no_cover = allow_no_cover
ContainerBase.__init__(self, tdir, opfpath, log)
self.book_metadata = book_metadata
input_plugin = plugin_for_input_format(input_fmt)
self.is_comic = bool(getattr(input_plugin, 'is_image_collection', False))
if save_bookmark_data:
bm_file = 'META-INF/calibre_bookmarks.txt'
self.bookmark_data = None
if self.exists(bm_file):
with self.open(bm_file, 'rb') as f:
self.bookmark_data = f.read()
# We do not add zero byte sized files as the IndexedDB API in the
# browser has no good way to distinguish between zero byte files and
# load failures.
excluded_names = {
name for name, mt in iteritems(self.mime_map) if
name == self.opf_name or mt == guess_type('a.ncx') or name.startswith('META-INF/') or
name == 'mimetype' or not self.has_name_and_is_not_empty(name)}
raster_cover_name, titlepage_name = self.create_cover_page(input_fmt.lower())
toc = get_toc(self).to_dict(count())
if not toc or not toc.get('children'):
toc = from_xpaths(self, ['//h:h1', '//h:h2', '//h:h3']).to_dict(count())
spine = [name for name, is_linear in self.spine_names]
spineq = frozenset(spine)
landmarks = [l for l in get_landmarks(self) if l['dest'] in spineq]
def create_cover_page(container, input_fmt, allow_no_cover, book_metadata=None):
templ = '''
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
<head><style>
html, body, img { height: 100vh; display: block; margin: 0; padding: 0; border-width: 0; }
img {
width: 100%%; height: 100%%;
object-fit: contain;
margin-left: auto; margin-right: auto;
max-width: 100vw; max-height: 100vh;
top: 50vh; transform: translateY(-50%%);
position: relative;
}
body.cover-fill img { object-fit: fill; }
</style></head><body><img src="%s"/></body></html>
'''
self.book_render_data = data = {
'version': RENDER_VERSION,
'toc':toc,
'book_format': book_fmt,
'spine':spine,
'link_uid': uuid4(),
'book_hash': book_hash,
'is_comic': self.is_comic,
'raster_cover_name': raster_cover_name,
'title_page_name': titlepage_name,
'has_maths': False,
'total_length': 0,
'spine_length': 0,
'toc_anchor_map': toc_anchor_map(toc),
'landmarks': landmarks,
'link_to_map': {},
}
# Mark the spine as dirty since we have to ensure it is normalized
for name in data['spine']:
self.parsed(name), self.dirty(name)
self.virtualized_names = set()
self.transform_all(virtualize_resources)
def generic_cover():
if book_metadata is not None:
from calibre.ebooks.covers import create_cover
mi = book_metadata
return create_cover(mi.title, mi.authors, mi.series, mi.series_index)
return BLANK_JPEG
def manifest_data(name):
mt = (self.mime_map.get(name) or 'application/octet-stream').lower()
ans = {
'size':os.path.getsize(self.name_path_map[name]),
'is_virtualized': name in self.virtualized_names,
'mimetype':mt,
'is_html': mt in OEB_DOCS,
}
if ans['is_html']:
root = self.parsed(name)
ans['length'] = l = get_length(root)
self.book_render_data['total_length'] += l
if name in data['spine']:
self.book_render_data['spine_length'] += l
ans['has_maths'] = hm = check_for_maths(root)
if hm:
self.book_render_data['has_maths'] = True
ans['anchor_map'] = anchor_map(root)
return ans
data['files'] = {name:manifest_data(name) for name in set(self.name_path_map) - excluded_names}
self.commit()
for name in excluded_names:
os.remove(self.name_path_map[name])
data = json.dumps(self.book_render_data, ensure_ascii=False)
if not isinstance(data, bytes):
data = data.encode('utf-8')
with lopen(os.path.join(self.root, 'calibre-book-manifest.json'), 'wb') as f:
f.write(data)
if input_fmt == 'epub':
def create_cover_page(self, input_fmt):
templ = '''
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
<head><style>
html, body, img { height: 100vh; display: block; margin: 0; padding: 0; border-width: 0; }
img {
width: 100%%; height: 100%%;
object-fit: contain;
margin-left: auto; margin-right: auto;
max-width: 100vw; max-height: 100vh;
top: 50vh; transform: translateY(-50%%);
position: relative;
}
body.cover-fill img { object-fit: fill; }
</style></head><body><img src="%s"/></body></html>
'''
def image_callback(cover_image, wrapped_image):
if cover_image:
image_callback.cover_data = container.raw_data(cover_image, decode=False)
if wrapped_image and not getattr(image_callback, 'cover_data', None):
image_callback.cover_data = container.raw_data(wrapped_image, decode=False)
def generic_cover():
if self.book_metadata is not None:
from calibre.ebooks.covers import create_cover
mi = self.book_metadata
return create_cover(mi.title, mi.authors, mi.series, mi.series_index)
return BLANK_JPEG
def cover_path(action, data):
if action == 'write_image':
cdata = getattr(image_callback, 'cover_data', None) or generic_cover()
data.write(cdata)
if input_fmt == 'epub':
def image_callback(cover_image, wrapped_image):
if cover_image:
image_callback.cover_data = self.raw_data(cover_image, decode=False)
if wrapped_image and not getattr(image_callback, 'cover_data', None):
image_callback.cover_data = self.raw_data(wrapped_image, decode=False)
def cover_path(action, data):
if action == 'write_image':
cdata = getattr(image_callback, 'cover_data', None) or generic_cover()
data.write(cdata)
if self.allow_no_cover and not has_epub_cover(self):
if allow_no_cover and not has_epub_cover(container):
return None, None
raster_cover_name, titlepage_name = set_epub_cover(
container, cover_path, (lambda *a: None), options={'template':templ},
image_callback=image_callback)
else:
raster_cover_name = find_cover_image(container, strict=True)
if raster_cover_name is None:
if allow_no_cover:
return None, None
raster_cover_name, titlepage_name = set_epub_cover(
self, cover_path, (lambda *a: None), options={'template':templ},
image_callback=image_callback)
else:
raster_cover_name = find_cover_image(self, strict=True)
if raster_cover_name is None:
if self.allow_no_cover:
return None, None
item = self.generate_item(name='cover.jpeg', id_prefix='cover')
raster_cover_name = self.href_to_name(item.get('href'), self.opf_name)
with self.open(raster_cover_name, 'wb') as dest:
dest.write(generic_cover())
if self.is_comic:
return raster_cover_name, None
item = self.generate_item(name='titlepage.html', id_prefix='titlepage')
titlepage_name = self.href_to_name(item.get('href'), self.opf_name)
raw = templ % prepare_string_for_xml(self.name_to_href(raster_cover_name, titlepage_name), True)
with self.open(titlepage_name, 'wb') as f:
f.write(raw.encode('utf-8'))
spine = self.opf_xpath('//opf:spine')[0]
ref = spine.makeelement(OPF('itemref'), idref=item.get('id'))
self.insert_into_xml(spine, ref, index=0)
self.dirty(self.opf_name)
return raster_cover_name, titlepage_name
item = container.generate_item(name='cover.jpeg', id_prefix='cover')
raster_cover_name = container.href_to_name(item.get('href'), container.opf_name)
with container.open(raster_cover_name, 'wb') as dest:
dest.write(generic_cover())
if container.is_comic:
return raster_cover_name, None
item = container.generate_item(name='titlepage.html', id_prefix='titlepage')
titlepage_name = container.href_to_name(item.get('href'), container.opf_name)
raw = templ % prepare_string_for_xml(container.name_to_href(raster_cover_name, titlepage_name), True)
with container.open(titlepage_name, 'wb') as f:
f.write(raw.encode('utf-8'))
spine = container.opf_xpath('//opf:spine')[0]
ref = spine.makeelement(OPF('itemref'), idref=item.get('id'))
container.insert_into_xml(spine, ref, index=0)
container.dirty(container.opf_name)
return raster_cover_name, titlepage_name
def transform_html(self, name, virtualize_resources):
style_xpath = XPath('//h:style')
link_xpath = XPath('//h:a[@href]')
img_xpath = XPath('//h:img[@src]')
res_link_xpath = XPath('//h:link[@href]')
root = self.parsed(name)
head = ensure_head(root)
changed = False
for style in style_xpath(root):
# Firefox flakes out sometimes when dynamically creating <style> tags,
# so convert them to external stylesheets to ensure they never fail
if style.text and (style.get('type') or 'text/css').lower() == 'text/css':
in_head = has_ancestor(style, head)
if not in_head:
extract(style)
head.append(style)
css = style.text
style.clear()
style.tag = XHTML('link')
style.set('type', 'text/css')
style.set('rel', 'stylesheet')
sname = self.add_file(name + '.css', css.encode('utf-8'), modify_name_if_needed=True)
style.set('href', self.name_to_href(sname, name))
changed = True
# Used for viewing images
for img in img_xpath(root):
img_name = self.href_to_name(img.get('src'), name)
if img_name:
img.set('data-calibre-src', img_name)
changed = True
# Disable non stylsheet link tags. This link will not be loaded by the
# browser anyway and will causes the resource load check to hang
for link in res_link_xpath(root):
ltype = (link.get('type') or 'text/css').lower()
rel = (link.get('rel') or 'stylesheet').lower()
if ltype != 'text/css' or rel != 'stylesheet':
link.attrib.clear()
changed = True
# Transform <style> and style=""
if transform_inline_styles(self, name, transform_sheet=transform_sheet, transform_style=transform_declaration):
def transform_style_sheet(container, name, link_uid, virtualize_resources, virtualized_names):
changed = False
sheet = container.parsed(name)
if virtualize_resources:
changed_names = set()
link_replacer = create_link_replacer(container, link_uid, changed_names)
replaceUrls(sheet, partial(link_replacer, name))
if name in changed_names:
changed = True
virtualized_names.add(name)
if transform_sheet(sheet):
changed = True
if changed:
raw = container.serialize_item(name)
else:
raw = container.raw_data(name, decode=False)
raw = raw.lstrip()
if not raw.startswith(b'@charset'):
raw = b'@charset "UTF-8";\n' + raw
changed = True
if changed:
with container.open(name, 'wb') as f:
f.write(raw)
if not virtualize_resources:
link_uid = self.book_render_data['link_uid']
link_replacer = create_link_replacer(self, link_uid, set())
ltm = self.book_render_data['link_to_map']
for a in link_xpath(root):
href = link_replacer(name, a.get('href'))
if href and href.startswith(link_uid):
a.set('href', 'javascript:void(0)')
parts = decode_url(href.split('|')[1])
lname, lfrag = parts[0], parts[1]
ltm.setdefault(lname, {}).setdefault(lfrag or '', set()).add(name)
a.set('data-' + link_uid, json.dumps({'name':lname, 'frag':lfrag}, ensure_ascii=False))
changed = True
if changed:
self.dirty(name)
def transform_svg_image(container, name, link_uid, virtualize_resources, virtualized_names):
if not virtualize_resources:
return
link_replacer = create_link_replacer(container, link_uid, set())
xlink = XLINK('href')
altered = False
xlink_xpath = XPath('//*[@xl:href]')
for elem in xlink_xpath(container.parsed(name)):
href = elem.get(xlink)
if not href.startswith('#'):
elem.set(xlink, link_replacer(name, href))
altered = True
if altered:
virtualized_names.add(name)
container.dirty(name)
container.commit_item(name)
def transform_css(self, name):
sheet = self.parsed(name)
if transform_sheet(sheet):
self.dirty(name)
def transform_all(self, virtualize_resources):
for name, mt in tuple(iteritems(self.mime_map)):
mt = mt.lower()
if mt in OEB_DOCS:
self.transform_html(name, virtualize_resources)
for name, mt in tuple(iteritems(self.mime_map)):
mt = mt.lower()
if mt in OEB_STYLES:
self.transform_css(name)
def transform_html(container, name, virtualize_resources, link_uid, link_to_map, virtualized_names):
link_xpath = XPath('//h:a[@href]')
img_xpath = XPath('//h:img[@src]')
res_link_xpath = XPath('//h:link[@href]')
root = container.parsed(name)
changed_names = set()
link_replacer = create_link_replacer(container, link_uid, changed_names)
# Used for viewing images
for img in img_xpath(root):
img_name = container.href_to_name(img.get('src'), name)
if img_name:
img.set('data-calibre-src', img_name)
# Disable non-stylesheet link tags. This link will not be loaded by the
# browser anyway and will causes the resource load check to hang
for link in res_link_xpath(root):
ltype = (link.get('type') or 'text/css').lower()
rel = (link.get('rel') or 'stylesheet').lower()
if ltype != 'text/css' or rel != 'stylesheet':
link.attrib.clear()
def transform_and_virtualize_sheet(sheet):
changed = transform_sheet(sheet)
if virtualize_resources:
self.virtualize_resources()
replaceUrls(sheet, partial(link_replacer, name))
if name in changed_names:
virtualized_names.add(name)
changed = True
return changed
ltm = self.book_render_data['link_to_map']
for name, amap in iteritems(ltm):
for k, v in tuple(iteritems(amap)):
amap[k] = tuple(v) # needed for JSON serialization
# Transform <style> and style=""
transform_inline_styles(container, name, transform_sheet=transform_and_virtualize_sheet, transform_style=transform_declaration)
def virtualize_resources(self):
if virtualize_resources:
virtualize_html(container, name, link_uid, link_to_map, virtualized_names)
else:
for a in link_xpath(root):
href = link_replacer(name, a.get('href'))
if href and href.startswith(link_uid):
a.set('href', 'javascript:void(0)')
parts = decode_url(href.split('|')[1])
lname, lfrag = parts[0], parts[1]
link_to_map.setdefault(lname, {}).setdefault(lfrag or '', set()).add(name)
a.set('data-' + link_uid, json.dumps({'name':lname, 'frag':lfrag}, ensure_ascii=False))
changed = set()
link_uid = self.book_render_data['link_uid']
xlink_xpath = XPath('//*[@xl:href]')
link_xpath = XPath('//h:a[@href]')
link_replacer = create_link_replacer(self, link_uid, changed)
shtml = serialize_parsed_html(root)
with container.open(name, 'wb') as f:
f.write(shtml)
ltm = self.book_render_data['link_to_map']
for name, mt in iteritems(self.mime_map):
mt = mt.lower()
if mt in OEB_STYLES:
replaceUrls(self.parsed(name), partial(link_replacer, name))
self.virtualized_names.add(name)
elif mt in OEB_DOCS:
self.virtualized_names.add(name)
root = self.parsed(name)
rewrite_links(root, partial(link_replacer, name))
for a in link_xpath(root):
href = a.get('href')
if href.startswith(link_uid):
a.set('href', 'javascript:void(0)')
parts = decode_url(href.split('|')[1])
lname, lfrag = parts[0], parts[1]
ltm.setdefault(lname, {}).setdefault(lfrag or '', set()).add(name)
a.set('data-' + link_uid, json.dumps({'name':lname, 'frag':lfrag}, ensure_ascii=False))
else:
a.set('target', '_blank')
a.set('rel', 'noopener noreferrer')
elif mt == 'image/svg+xml':
self.virtualized_names.add(name)
xlink = XLINK('href')
altered = False
for elem in xlink_xpath(self.parsed(name)):
href = elem.get(xlink)
if not href.startswith('#'):
elem.set(xlink, link_replacer(name, href))
altered = True
if altered:
changed.add(name)
class RenderManager(object):
tuple(map(self.dirty, changed))
def launch_worker(self):
with lopen(os.path.join(self.tdir, '{}.json'.format(len(self.workers))), 'wb') as output:
error = lopen(os.path.join(self.tdir, '{}.error'.format(len(self.workers))), 'wb')
p = start_pipe_worker('from calibre.srv.render_book import worker_main; worker_main()', stdout=error, stderr=error)
p.output_path = output.name
p.error_path = error.name
self.workers.append(p)
def serialize_item(self, name):
mt = (self.mime_map[name] or '').lower()
if mt in OEB_STYLES:
ans = ContainerBase.serialize_item(self, name).lstrip()
if not ans.startswith(b'@charset'):
ans = b'@charset "UTF-8";\n' + ans
return ans
if mt not in OEB_DOCS:
return ContainerBase.serialize_item(self, name)
root = self.parsed(name)
return json.dumps(html_as_dict(root), ensure_ascii=False, separators=(',', ':')).encode('utf-8')
def __enter__(self):
self.workers = []
self.tdir = PersistentTemporaryDirectory()
self.launch_worker(), self.launch_worker()
return self
def __exit__(self, *a):
while self.workers:
p = self.workers.pop()
if p.returncode is None:
p.terminate()
if not iswindows and p.poll() is None:
time.sleep(0.02)
if p.poll() is None:
p.kill()
del self.workers
try:
shutil.rmtree(self.tdir)
except EnvironmentError:
time.sleep(0.1)
try:
shutil.rmtree(self.tdir)
except EnvironmentError:
pass
del self.tdir
def __call__(self, names, args, in_process_container):
num_workers = min(detect_ncpus(), len(names))
if num_workers > 1:
total_sz = sum(os.path.getsize(in_process_container.name_path_map[n]) for n in names)
if total_sz < 128 * 1024:
num_workers = 1
if num_workers == 1:
return [process_book_files(names, *args, container=in_process_container)]
while len(self.workers) < num_workers:
self.launch_worker()
group_sz = int(ceil(len(names) / num_workers))
for group, worker in zip(grouper(group_sz, names), self.workers):
worker.stdin.write(as_bytes(json.dumps((worker.output_path, group,) + args)))
worker.stdin.flush(), worker.stdin.close()
worker.job_sent = True
for worker in self.workers:
if not hasattr(worker, 'job_sent'):
worker.stdin.write(b'_'), worker.stdin.flush(), worker.stdin.close()
error = None
results = []
for worker in self.workers:
if not hasattr(worker, 'job_sent'):
worker.wait()
continue
if worker.wait() != 0:
with lopen(worker.error_path, 'rb') as f:
error = f.read().decode('utf-8', 'replace')
else:
with lopen(worker.output_path, 'rb') as f:
results.append(json.loads(f.read()))
if error is not None:
raise Exception('Render worker failed with error:\n' + error)
return results
def worker_main():
stdin = getattr(sys.stdin, 'buffer', sys.stdin)
raw = stdin.read()
if raw == b'_':
return
args = json.loads(raw)
result = process_book_files(*args[1:])
with open(args[0], 'wb') as f:
f.write(as_bytes(json.dumps(result)))
def virtualize_html(container, name, link_uid, link_to_map, virtualized_names):
changed = set()
link_xpath = XPath('//h:a[@href]')
link_replacer = create_link_replacer(container, link_uid, changed)
virtualized_names.add(name)
root = container.parsed(name)
rewrite_links(root, partial(link_replacer, name))
for a in link_xpath(root):
href = a.get('href')
if href.startswith(link_uid):
a.set('href', 'javascript:void(0)')
parts = decode_url(href.split('|')[1])
lname, lfrag = parts[0], parts[1]
link_to_map.setdefault(lname, {}).setdefault(lfrag or '', set()).add(name)
a.set('data-' + link_uid, json.dumps({'name':lname, 'frag':lfrag}, ensure_ascii=False))
else:
a.set('target', '_blank')
a.set('rel', 'noopener noreferrer')
return name in changed
def process_book_files(names, container_dir, opfpath, virtualize_resources, link_uid, container=None):
container = container or SimpleContainer(container_dir, opfpath, default_log)
link_to_map = {}
html_data = {}
virtualized_names = set()
for name in names:
if name is None:
continue
mt = container.mime_map[name].lower()
if mt in OEB_DOCS:
root = container.parsed(name)
html_data[name] = {
'length': get_length(root),
'has_maths': check_for_maths(root),
'anchor_map': anchor_map(root)
}
transform_html(container, name, virtualize_resources, link_uid, link_to_map, virtualized_names)
elif mt in OEB_STYLES:
transform_style_sheet(container, name, link_uid, virtualize_resources, virtualized_names)
elif mt == 'image/svg+xml':
transform_svg_image(container, name, link_uid, virtualize_resources, virtualized_names)
for v in itervalues(link_to_map):
for k in v:
v[k] = tuple(v[k])
return link_to_map, html_data, tuple(virtualized_names)
def process_exploded_book(
book_fmt, opfpath, input_fmt, tdir, render_manager, log=None, book_hash=None, save_bookmark_data=False,
book_metadata=None, allow_no_cover=True, virtualize_resources=True
):
log = log or default_log
container = SimpleContainer(tdir, opfpath, log)
input_plugin = plugin_for_input_format(input_fmt)
is_comic = bool(getattr(input_plugin, 'is_image_collection', False))
bookmark_data = None
if save_bookmark_data:
bm_file = 'META-INF/calibre_bookmarks.txt'
if container.exists(bm_file):
with container.open(bm_file, 'rb') as f:
bookmark_data = f.read()
# We do not add zero byte sized files as the IndexedDB API in the
# browser has no good way to distinguish between zero byte files and
# load failures.
excluded_names = {
name for name, mt in iteritems(container.mime_map) if
name == container.opf_name or mt == guess_type('a.ncx') or name.startswith('META-INF/') or
name == 'mimetype' or not container.has_name_and_is_not_empty(name)}
raster_cover_name, titlepage_name = create_cover_page(container, input_fmt.lower(), allow_no_cover, book_metadata)
toc = get_toc(container, verify_destinations=False).to_dict(count())
if not toc or not toc.get('children'):
toc = from_xpaths(container, ['//h:h1', '//h:h2', '//h:h3']).to_dict(count())
spine = [name for name, is_linear in container.spine_names]
spineq = frozenset(spine)
landmarks = [l for l in get_landmarks(container) if l['dest'] in spineq]
book_render_data = {
'version': RENDER_VERSION,
'toc':toc,
'book_format': book_fmt,
'spine':spine,
'link_uid': uuid4(),
'book_hash': book_hash,
'is_comic': is_comic,
'raster_cover_name': raster_cover_name,
'title_page_name': titlepage_name,
'has_maths': False,
'total_length': 0,
'spine_length': 0,
'toc_anchor_map': toc_anchor_map(toc),
'landmarks': landmarks,
'link_to_map': {},
}
def work_priority(name):
# ensure workers with large files or stylesheets
# have the less names
size = os.path.getsize(container.name_path_map[name]),
is_html = container.mime_map.get(name) in OEB_DOCS
return (0 if is_html else 1), size
names = sorted(
(n for n, mt in iteritems(container.mime_map) if mt in OEB_STYLES or mt in OEB_DOCS or mt == 'image/svg+xml'),
key=work_priority)
results = render_manager(names, (tdir, opfpath, virtualize_resources, book_render_data['link_uid']), container)
ltm = book_render_data['link_to_map']
html_data = {}
virtualized_names = set()
def merge_ltm(dest, src):
for k, v in iteritems(src):
if k in dest:
dest[k] |= v
else:
dest[k] = v
for link_to_map, hdata, vnames in results:
html_data.update(hdata)
virtualized_names |= set(vnames)
for k, v in iteritems(link_to_map):
for x in v:
v[x] = set(v[x])
if k in ltm:
merge_ltm(ltm[k], v)
else:
ltm[k] = v
def manifest_data(name):
mt = (container.mime_map.get(name) or 'application/octet-stream').lower()
ans = {
'size':os.path.getsize(container.name_path_map[name]),
'is_virtualized': name in virtualized_names,
'mimetype':mt,
'is_html': mt in OEB_DOCS,
}
if ans['is_html']:
data = html_data[name]
ans['length'] = l = data['length']
book_render_data['total_length'] += l
if name in book_render_data['spine']:
book_render_data['spine_length'] += l
ans['has_maths'] = hm = data['has_maths']
if hm:
book_render_data['has_maths'] = True
ans['anchor_map'] = data['anchor_map']
return ans
book_render_data['files'] = {name:manifest_data(name) for name in set(container.name_path_map) - excluded_names}
container.commit()
for name in excluded_names:
os.remove(container.name_path_map[name])
ltm = book_render_data['link_to_map']
for name, amap in iteritems(ltm):
for k, v in tuple(iteritems(amap)):
amap[k] = tuple(v) # needed for JSON serialization
data = as_bytes(json.dumps(book_render_data, ensure_ascii=False))
with lopen(os.path.join(container.root, 'calibre-book-manifest.json'), 'wb') as f:
f.write(data)
return container, bookmark_data
def split_name(name):
@ -566,22 +707,6 @@ def serialize_elem(elem, nsmap):
return ans
def ensure_head(root):
# Make sure we have only a single <head>
heads = list(root.iterchildren(XHTML('head')))
if len(heads) != 1:
if not heads:
root.insert(0, root.makeelement(XHTML('head')))
return root[0]
head = heads[0]
for eh in heads[1:]:
for child in eh.iterchildren('*'):
head.append(child)
extract(eh)
return head
return heads[0]
def ensure_body(root):
# Make sure we have only a single <body>
bodies = list(root.iterchildren(XHTML('body')))
@ -685,33 +810,34 @@ def get_stored_annotations(container):
def render(pathtoebook, output_dir, book_hash=None, serialize_metadata=False, extract_annotations=False, virtualize_resources=True):
mi = None
if serialize_metadata:
from calibre.ebooks.metadata.meta import get_metadata
from calibre.customize.ui import quick_metadata
with lopen(pathtoebook, 'rb') as f, quick_metadata:
mi = get_metadata(f, os.path.splitext(pathtoebook)[1][1:].lower())
book_fmt, opfpath, input_fmt = extract_book(pathtoebook, output_dir, log=default_log)
container = Container(
book_fmt, opfpath, input_fmt, output_dir, book_hash=book_hash,
save_bookmark_data=extract_annotations,
book_metadata=mi, virtualize_resources=virtualize_resources
)
if serialize_metadata:
from calibre.utils.serialize import json_dumps
from calibre.ebooks.metadata.book.serialize import metadata_as_dict
d = metadata_as_dict(mi)
d.pop('cover_data', None)
serialize_datetimes(d), serialize_datetimes(d.get('user_metadata', {}))
with lopen(os.path.join(output_dir, 'calibre-book-metadata.json'), 'wb') as f:
f.write(json_dumps(d))
if extract_annotations:
annotations = None
if container.bookmark_data:
annotations = json_dumps(tuple(get_stored_annotations(container)))
if annotations:
with lopen(os.path.join(output_dir, 'calibre-book-annotations.json'), 'wb') as f:
f.write(annotations)
with RenderManager() as render_manager:
mi = None
if serialize_metadata:
from calibre.ebooks.metadata.meta import get_metadata
from calibre.customize.ui import quick_metadata
with lopen(pathtoebook, 'rb') as f, quick_metadata:
mi = get_metadata(f, os.path.splitext(pathtoebook)[1][1:].lower())
book_fmt, opfpath, input_fmt = extract_book(pathtoebook, output_dir, log=default_log)
container, bookmark_data = process_exploded_book(
book_fmt, opfpath, input_fmt, output_dir, render_manager,
book_hash=book_hash, save_bookmark_data=extract_annotations,
book_metadata=mi, virtualize_resources=virtualize_resources
)
if serialize_metadata:
from calibre.utils.serialize import json_dumps
from calibre.ebooks.metadata.book.serialize import metadata_as_dict
d = metadata_as_dict(mi)
d.pop('cover_data', None)
serialize_datetimes(d), serialize_datetimes(d.get('user_metadata', {}))
with lopen(os.path.join(output_dir, 'calibre-book-metadata.json'), 'wb') as f:
f.write(json_dumps(d))
if extract_annotations:
annotations = None
if bookmark_data:
annotations = json_dumps(tuple(get_stored_annotations(container)))
if annotations:
with lopen(os.path.join(output_dir, 'calibre-book-annotations.json'), 'wb') as f:
f.write(annotations)
def render_for_viewer(path, out_dir, book_hash):