Viewer: Run the first read book preparation in parallel

This commit is contained in:
Kovid Goyal 2019-10-21 16:20:00 +05:30
parent 243144f3b9
commit 75dd89722a
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -7,22 +7,26 @@ from __future__ import absolute_import, division, print_function, unicode_litera
import json import json
import os import os
import re import re
import shutil
import sys import sys
import time
from collections import defaultdict from collections import defaultdict
from datetime import datetime from datetime import datetime
from functools import partial from functools import partial
from itertools import count from itertools import count
from math import ceil
from css_parser import replaceUrls from css_parser import replaceUrls
from css_parser.css import CSSRule from css_parser.css import CSSRule
from calibre import force_unicode, prepare_string_for_xml from calibre import detect_ncpus, force_unicode, prepare_string_for_xml
from calibre.constants import iswindows
from calibre.customize.ui import plugin_for_input_format from calibre.customize.ui import plugin_for_input_format
from calibre.ebooks import parse_css_length from calibre.ebooks import parse_css_length
from calibre.ebooks.css_transform_rules import StyleDeclaration from calibre.ebooks.css_transform_rules import StyleDeclaration
from calibre.ebooks.oeb.base import ( from calibre.ebooks.oeb.base import (
EPUB_NS, OEB_DOCS, OEB_STYLES, OPF, XHTML, XHTML_NS, XLINK, XPath as _XPath, rewrite_links, EPUB_NS, OEB_DOCS, OEB_STYLES, OPF, XHTML, XHTML_NS, XLINK, XPath as _XPath,
urlunquote rewrite_links, urlunquote
) )
from calibre.ebooks.oeb.iterator.book import extract_book from calibre.ebooks.oeb.iterator.book import extract_book
from calibre.ebooks.oeb.polish.container import Container as ContainerBase from calibre.ebooks.oeb.polish.container import Container as ContainerBase
@ -31,9 +35,12 @@ from calibre.ebooks.oeb.polish.cover import (
) )
from calibre.ebooks.oeb.polish.css import transform_inline_styles from calibre.ebooks.oeb.polish.css import transform_inline_styles
from calibre.ebooks.oeb.polish.toc import from_xpaths, get_landmarks, get_toc from calibre.ebooks.oeb.polish.toc import from_xpaths, get_landmarks, get_toc
from calibre.ebooks.oeb.polish.utils import extract, guess_type from calibre.ebooks.oeb.polish.utils import guess_type
from calibre.ptempfile import PersistentTemporaryDirectory
from calibre.srv.metadata import encode_datetime from calibre.srv.metadata import encode_datetime
from calibre.srv.opts import grouper
from calibre.utils.date import EPOCH from calibre.utils.date import EPOCH
from calibre.utils.ipc.simple_worker import start_pipe_worker
from calibre.utils.iso8601 import parse_iso8601 from calibre.utils.iso8601 import parse_iso8601
from calibre.utils.logging import default_log from calibre.utils.logging import default_log
from calibre.utils.serialize import json_loads from calibre.utils.serialize import json_loads
@ -42,7 +49,9 @@ from polyglot.binary import (
as_base64_unicode as encode_component, from_base64_bytes, as_base64_unicode as encode_component, from_base64_bytes,
from_base64_unicode as decode_component from_base64_unicode as decode_component
) )
from polyglot.builtins import is_py3, iteritems, map, unicode_type from polyglot.builtins import (
as_bytes, is_py3, iteritems, itervalues, map, unicode_type
)
from polyglot.urllib import quote, urlparse from polyglot.urllib import quote, urlparse
RENDER_VERSION = 1 RENDER_VERSION = 1
@ -220,95 +229,16 @@ def toc_anchor_map(toc):
return dict(ans) return dict(ans)
class Container(ContainerBase): def serialize_parsed_html(root):
return as_bytes(json.dumps(html_as_dict(root), ensure_ascii=False, separators=(',', ':')))
class SimpleContainer(ContainerBase):
tweak_mode = True tweak_mode = True
def __init__(
self, book_fmt, opfpath, input_fmt, tdir, log=None, book_hash=None, save_bookmark_data=False,
book_metadata=None, allow_no_cover=True, virtualize_resources=True
):
log = log or default_log
self.allow_no_cover = allow_no_cover
ContainerBase.__init__(self, tdir, opfpath, log)
self.book_metadata = book_metadata
input_plugin = plugin_for_input_format(input_fmt)
self.is_comic = bool(getattr(input_plugin, 'is_image_collection', False))
if save_bookmark_data:
bm_file = 'META-INF/calibre_bookmarks.txt'
self.bookmark_data = None
if self.exists(bm_file):
with self.open(bm_file, 'rb') as f:
self.bookmark_data = f.read()
# We do not add zero byte sized files as the IndexedDB API in the
# browser has no good way to distinguish between zero byte files and
# load failures.
excluded_names = {
name for name, mt in iteritems(self.mime_map) if
name == self.opf_name or mt == guess_type('a.ncx') or name.startswith('META-INF/') or
name == 'mimetype' or not self.has_name_and_is_not_empty(name)}
raster_cover_name, titlepage_name = self.create_cover_page(input_fmt.lower())
toc = get_toc(self).to_dict(count()) def create_cover_page(container, input_fmt, allow_no_cover, book_metadata=None):
if not toc or not toc.get('children'):
toc = from_xpaths(self, ['//h:h1', '//h:h2', '//h:h3']).to_dict(count())
spine = [name for name, is_linear in self.spine_names]
spineq = frozenset(spine)
landmarks = [l for l in get_landmarks(self) if l['dest'] in spineq]
self.book_render_data = data = {
'version': RENDER_VERSION,
'toc':toc,
'book_format': book_fmt,
'spine':spine,
'link_uid': uuid4(),
'book_hash': book_hash,
'is_comic': self.is_comic,
'raster_cover_name': raster_cover_name,
'title_page_name': titlepage_name,
'has_maths': False,
'total_length': 0,
'spine_length': 0,
'toc_anchor_map': toc_anchor_map(toc),
'landmarks': landmarks,
'link_to_map': {},
}
# Mark the spine as dirty since we have to ensure it is normalized
for name in data['spine']:
self.parsed(name), self.dirty(name)
self.virtualized_names = set()
self.transform_all(virtualize_resources)
def manifest_data(name):
mt = (self.mime_map.get(name) or 'application/octet-stream').lower()
ans = {
'size':os.path.getsize(self.name_path_map[name]),
'is_virtualized': name in self.virtualized_names,
'mimetype':mt,
'is_html': mt in OEB_DOCS,
}
if ans['is_html']:
root = self.parsed(name)
ans['length'] = l = get_length(root)
self.book_render_data['total_length'] += l
if name in data['spine']:
self.book_render_data['spine_length'] += l
ans['has_maths'] = hm = check_for_maths(root)
if hm:
self.book_render_data['has_maths'] = True
ans['anchor_map'] = anchor_map(root)
return ans
data['files'] = {name:manifest_data(name) for name in set(self.name_path_map) - excluded_names}
self.commit()
for name in excluded_names:
os.remove(self.name_path_map[name])
data = json.dumps(self.book_render_data, ensure_ascii=False)
if not isinstance(data, bytes):
data = data.encode('utf-8')
with lopen(os.path.join(self.root, 'calibre-book-manifest.json'), 'wb') as f:
f.write(data)
def create_cover_page(self, input_fmt):
templ = ''' templ = '''
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en"> <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
<head><style> <head><style>
@ -326,9 +256,9 @@ class Container(ContainerBase):
''' '''
def generic_cover(): def generic_cover():
if self.book_metadata is not None: if book_metadata is not None:
from calibre.ebooks.covers import create_cover from calibre.ebooks.covers import create_cover
mi = self.book_metadata mi = book_metadata
return create_cover(mi.title, mi.authors, mi.series, mi.series_index) return create_cover(mi.title, mi.authors, mi.series, mi.series_index)
return BLANK_JPEG return BLANK_JPEG
@ -336,144 +266,230 @@ class Container(ContainerBase):
def image_callback(cover_image, wrapped_image): def image_callback(cover_image, wrapped_image):
if cover_image: if cover_image:
image_callback.cover_data = self.raw_data(cover_image, decode=False) image_callback.cover_data = container.raw_data(cover_image, decode=False)
if wrapped_image and not getattr(image_callback, 'cover_data', None): if wrapped_image and not getattr(image_callback, 'cover_data', None):
image_callback.cover_data = self.raw_data(wrapped_image, decode=False) image_callback.cover_data = container.raw_data(wrapped_image, decode=False)
def cover_path(action, data): def cover_path(action, data):
if action == 'write_image': if action == 'write_image':
cdata = getattr(image_callback, 'cover_data', None) or generic_cover() cdata = getattr(image_callback, 'cover_data', None) or generic_cover()
data.write(cdata) data.write(cdata)
if self.allow_no_cover and not has_epub_cover(self): if allow_no_cover and not has_epub_cover(container):
return None, None return None, None
raster_cover_name, titlepage_name = set_epub_cover( raster_cover_name, titlepage_name = set_epub_cover(
self, cover_path, (lambda *a: None), options={'template':templ}, container, cover_path, (lambda *a: None), options={'template':templ},
image_callback=image_callback) image_callback=image_callback)
else: else:
raster_cover_name = find_cover_image(self, strict=True) raster_cover_name = find_cover_image(container, strict=True)
if raster_cover_name is None: if raster_cover_name is None:
if self.allow_no_cover: if allow_no_cover:
return None, None return None, None
item = self.generate_item(name='cover.jpeg', id_prefix='cover') item = container.generate_item(name='cover.jpeg', id_prefix='cover')
raster_cover_name = self.href_to_name(item.get('href'), self.opf_name) raster_cover_name = container.href_to_name(item.get('href'), container.opf_name)
with self.open(raster_cover_name, 'wb') as dest: with container.open(raster_cover_name, 'wb') as dest:
dest.write(generic_cover()) dest.write(generic_cover())
if self.is_comic: if container.is_comic:
return raster_cover_name, None return raster_cover_name, None
item = self.generate_item(name='titlepage.html', id_prefix='titlepage') item = container.generate_item(name='titlepage.html', id_prefix='titlepage')
titlepage_name = self.href_to_name(item.get('href'), self.opf_name) titlepage_name = container.href_to_name(item.get('href'), container.opf_name)
raw = templ % prepare_string_for_xml(self.name_to_href(raster_cover_name, titlepage_name), True) raw = templ % prepare_string_for_xml(container.name_to_href(raster_cover_name, titlepage_name), True)
with self.open(titlepage_name, 'wb') as f: with container.open(titlepage_name, 'wb') as f:
f.write(raw.encode('utf-8')) f.write(raw.encode('utf-8'))
spine = self.opf_xpath('//opf:spine')[0] spine = container.opf_xpath('//opf:spine')[0]
ref = spine.makeelement(OPF('itemref'), idref=item.get('id')) ref = spine.makeelement(OPF('itemref'), idref=item.get('id'))
self.insert_into_xml(spine, ref, index=0) container.insert_into_xml(spine, ref, index=0)
self.dirty(self.opf_name) container.dirty(container.opf_name)
return raster_cover_name, titlepage_name return raster_cover_name, titlepage_name
def transform_html(self, name, virtualize_resources):
style_xpath = XPath('//h:style') def transform_style_sheet(container, name, link_uid, virtualize_resources, virtualized_names):
changed = False
sheet = container.parsed(name)
if virtualize_resources:
changed_names = set()
link_replacer = create_link_replacer(container, link_uid, changed_names)
replaceUrls(sheet, partial(link_replacer, name))
if name in changed_names:
changed = True
virtualized_names.add(name)
if transform_sheet(sheet):
changed = True
if changed:
raw = container.serialize_item(name)
else:
raw = container.raw_data(name, decode=False)
raw = raw.lstrip()
if not raw.startswith(b'@charset'):
raw = b'@charset "UTF-8";\n' + raw
changed = True
if changed:
with container.open(name, 'wb') as f:
f.write(raw)
def transform_svg_image(container, name, link_uid, virtualize_resources, virtualized_names):
if not virtualize_resources:
return
link_replacer = create_link_replacer(container, link_uid, set())
xlink = XLINK('href')
altered = False
xlink_xpath = XPath('//*[@xl:href]')
for elem in xlink_xpath(container.parsed(name)):
href = elem.get(xlink)
if not href.startswith('#'):
elem.set(xlink, link_replacer(name, href))
altered = True
if altered:
virtualized_names.add(name)
container.dirty(name)
container.commit_item(name)
def transform_html(container, name, virtualize_resources, link_uid, link_to_map, virtualized_names):
link_xpath = XPath('//h:a[@href]') link_xpath = XPath('//h:a[@href]')
img_xpath = XPath('//h:img[@src]') img_xpath = XPath('//h:img[@src]')
res_link_xpath = XPath('//h:link[@href]') res_link_xpath = XPath('//h:link[@href]')
root = self.parsed(name) root = container.parsed(name)
head = ensure_head(root) changed_names = set()
changed = False link_replacer = create_link_replacer(container, link_uid, changed_names)
for style in style_xpath(root):
# Firefox flakes out sometimes when dynamically creating <style> tags,
# so convert them to external stylesheets to ensure they never fail
if style.text and (style.get('type') or 'text/css').lower() == 'text/css':
in_head = has_ancestor(style, head)
if not in_head:
extract(style)
head.append(style)
css = style.text
style.clear()
style.tag = XHTML('link')
style.set('type', 'text/css')
style.set('rel', 'stylesheet')
sname = self.add_file(name + '.css', css.encode('utf-8'), modify_name_if_needed=True)
style.set('href', self.name_to_href(sname, name))
changed = True
# Used for viewing images # Used for viewing images
for img in img_xpath(root): for img in img_xpath(root):
img_name = self.href_to_name(img.get('src'), name) img_name = container.href_to_name(img.get('src'), name)
if img_name: if img_name:
img.set('data-calibre-src', img_name) img.set('data-calibre-src', img_name)
changed = True
# Disable non stylsheet link tags. This link will not be loaded by the # Disable non-stylesheet link tags. This link will not be loaded by the
# browser anyway and will cause the resource load check to hang # browser anyway and will cause the resource load check to hang
for link in res_link_xpath(root): for link in res_link_xpath(root):
ltype = (link.get('type') or 'text/css').lower() ltype = (link.get('type') or 'text/css').lower()
rel = (link.get('rel') or 'stylesheet').lower() rel = (link.get('rel') or 'stylesheet').lower()
if ltype != 'text/css' or rel != 'stylesheet': if ltype != 'text/css' or rel != 'stylesheet':
link.attrib.clear() link.attrib.clear()
def transform_and_virtualize_sheet(sheet):
changed = transform_sheet(sheet)
if virtualize_resources:
replaceUrls(sheet, partial(link_replacer, name))
if name in changed_names:
virtualized_names.add(name)
changed = True changed = True
return changed
# Transform <style> and style="" # Transform <style> and style=""
if transform_inline_styles(self, name, transform_sheet=transform_sheet, transform_style=transform_declaration): transform_inline_styles(container, name, transform_sheet=transform_and_virtualize_sheet, transform_style=transform_declaration)
changed = True
if not virtualize_resources: if virtualize_resources:
link_uid = self.book_render_data['link_uid'] virtualize_html(container, name, link_uid, link_to_map, virtualized_names)
link_replacer = create_link_replacer(self, link_uid, set()) else:
ltm = self.book_render_data['link_to_map']
for a in link_xpath(root): for a in link_xpath(root):
href = link_replacer(name, a.get('href')) href = link_replacer(name, a.get('href'))
if href and href.startswith(link_uid): if href and href.startswith(link_uid):
a.set('href', 'javascript:void(0)') a.set('href', 'javascript:void(0)')
parts = decode_url(href.split('|')[1]) parts = decode_url(href.split('|')[1])
lname, lfrag = parts[0], parts[1] lname, lfrag = parts[0], parts[1]
ltm.setdefault(lname, {}).setdefault(lfrag or '', set()).add(name) link_to_map.setdefault(lname, {}).setdefault(lfrag or '', set()).add(name)
a.set('data-' + link_uid, json.dumps({'name':lname, 'frag':lfrag}, ensure_ascii=False)) a.set('data-' + link_uid, json.dumps({'name':lname, 'frag':lfrag}, ensure_ascii=False))
changed = True
if changed: shtml = serialize_parsed_html(root)
self.dirty(name) with container.open(name, 'wb') as f:
f.write(shtml)
def transform_css(self, name):
sheet = self.parsed(name)
if transform_sheet(sheet):
self.dirty(name)
def transform_all(self, virtualize_resources): class RenderManager(object):
for name, mt in tuple(iteritems(self.mime_map)):
mt = mt.lower()
if mt in OEB_DOCS:
self.transform_html(name, virtualize_resources)
for name, mt in tuple(iteritems(self.mime_map)):
mt = mt.lower()
if mt in OEB_STYLES:
self.transform_css(name)
if virtualize_resources:
self.virtualize_resources()
ltm = self.book_render_data['link_to_map'] def launch_worker(self):
for name, amap in iteritems(ltm): with lopen(os.path.join(self.tdir, '{}.json'.format(len(self.workers))), 'wb') as output:
for k, v in tuple(iteritems(amap)): error = lopen(os.path.join(self.tdir, '{}.error'.format(len(self.workers))), 'wb')
amap[k] = tuple(v) # needed for JSON serialization p = start_pipe_worker('from calibre.srv.render_book import worker_main; worker_main()', stdout=error, stderr=error)
p.output_path = output.name
p.error_path = error.name
self.workers.append(p)
def virtualize_resources(self): def __enter__(self):
self.workers = []
self.tdir = PersistentTemporaryDirectory()
self.launch_worker(), self.launch_worker()
return self
def __exit__(self, *a):
while self.workers:
p = self.workers.pop()
if p.returncode is None:
p.terminate()
if not iswindows and p.poll() is None:
time.sleep(0.02)
if p.poll() is None:
p.kill()
del self.workers
try:
shutil.rmtree(self.tdir)
except EnvironmentError:
time.sleep(0.1)
try:
shutil.rmtree(self.tdir)
except EnvironmentError:
pass
del self.tdir
def __call__(self, names, args, in_process_container):
num_workers = min(detect_ncpus(), len(names))
if num_workers > 1:
total_sz = sum(os.path.getsize(in_process_container.name_path_map[n]) for n in names)
if total_sz < 128 * 1024:
num_workers = 1
if num_workers == 1:
return [process_book_files(names, *args, container=in_process_container)]
while len(self.workers) < num_workers:
self.launch_worker()
group_sz = int(ceil(len(names) / num_workers))
for group, worker in zip(grouper(group_sz, names), self.workers):
worker.stdin.write(as_bytes(json.dumps((worker.output_path, group,) + args)))
worker.stdin.flush(), worker.stdin.close()
worker.job_sent = True
for worker in self.workers:
if not hasattr(worker, 'job_sent'):
worker.stdin.write(b'_'), worker.stdin.flush(), worker.stdin.close()
error = None
results = []
for worker in self.workers:
if not hasattr(worker, 'job_sent'):
worker.wait()
continue
if worker.wait() != 0:
with lopen(worker.error_path, 'rb') as f:
error = f.read().decode('utf-8', 'replace')
else:
with lopen(worker.output_path, 'rb') as f:
results.append(json.loads(f.read()))
if error is not None:
raise Exception('Render worker failed with error:\n' + error)
return results
def worker_main():
stdin = getattr(sys.stdin, 'buffer', sys.stdin)
raw = stdin.read()
if raw == b'_':
return
args = json.loads(raw)
result = process_book_files(*args[1:])
with open(args[0], 'wb') as f:
f.write(as_bytes(json.dumps(result)))
def virtualize_html(container, name, link_uid, link_to_map, virtualized_names):
changed = set() changed = set()
link_uid = self.book_render_data['link_uid']
xlink_xpath = XPath('//*[@xl:href]')
link_xpath = XPath('//h:a[@href]') link_xpath = XPath('//h:a[@href]')
link_replacer = create_link_replacer(self, link_uid, changed) link_replacer = create_link_replacer(container, link_uid, changed)
ltm = self.book_render_data['link_to_map'] virtualized_names.add(name)
root = container.parsed(name)
for name, mt in iteritems(self.mime_map):
mt = mt.lower()
if mt in OEB_STYLES:
replaceUrls(self.parsed(name), partial(link_replacer, name))
self.virtualized_names.add(name)
elif mt in OEB_DOCS:
self.virtualized_names.add(name)
root = self.parsed(name)
rewrite_links(root, partial(link_replacer, name)) rewrite_links(root, partial(link_replacer, name))
for a in link_xpath(root): for a in link_xpath(root):
href = a.get('href') href = a.get('href')
@ -481,36 +497,161 @@ class Container(ContainerBase):
a.set('href', 'javascript:void(0)') a.set('href', 'javascript:void(0)')
parts = decode_url(href.split('|')[1]) parts = decode_url(href.split('|')[1])
lname, lfrag = parts[0], parts[1] lname, lfrag = parts[0], parts[1]
ltm.setdefault(lname, {}).setdefault(lfrag or '', set()).add(name) link_to_map.setdefault(lname, {}).setdefault(lfrag or '', set()).add(name)
a.set('data-' + link_uid, json.dumps({'name':lname, 'frag':lfrag}, ensure_ascii=False)) a.set('data-' + link_uid, json.dumps({'name':lname, 'frag':lfrag}, ensure_ascii=False))
else: else:
a.set('target', '_blank') a.set('target', '_blank')
a.set('rel', 'noopener noreferrer') a.set('rel', 'noopener noreferrer')
return name in changed
def process_book_files(names, container_dir, opfpath, virtualize_resources, link_uid, container=None):
container = container or SimpleContainer(container_dir, opfpath, default_log)
link_to_map = {}
html_data = {}
virtualized_names = set()
for name in names:
if name is None:
continue
mt = container.mime_map[name].lower()
if mt in OEB_DOCS:
root = container.parsed(name)
html_data[name] = {
'length': get_length(root),
'has_maths': check_for_maths(root),
'anchor_map': anchor_map(root)
}
transform_html(container, name, virtualize_resources, link_uid, link_to_map, virtualized_names)
elif mt in OEB_STYLES:
transform_style_sheet(container, name, link_uid, virtualize_resources, virtualized_names)
elif mt == 'image/svg+xml': elif mt == 'image/svg+xml':
self.virtualized_names.add(name) transform_svg_image(container, name, link_uid, virtualize_resources, virtualized_names)
xlink = XLINK('href') for v in itervalues(link_to_map):
altered = False for k in v:
for elem in xlink_xpath(self.parsed(name)): v[k] = tuple(v[k])
href = elem.get(xlink) return link_to_map, html_data, tuple(virtualized_names)
if not href.startswith('#'):
elem.set(xlink, link_replacer(name, href))
altered = True
if altered:
changed.add(name)
tuple(map(self.dirty, changed))
def serialize_item(self, name): def process_exploded_book(
mt = (self.mime_map[name] or '').lower() book_fmt, opfpath, input_fmt, tdir, render_manager, log=None, book_hash=None, save_bookmark_data=False,
if mt in OEB_STYLES: book_metadata=None, allow_no_cover=True, virtualize_resources=True
ans = ContainerBase.serialize_item(self, name).lstrip() ):
if not ans.startswith(b'@charset'): log = log or default_log
ans = b'@charset "UTF-8";\n' + ans container = SimpleContainer(tdir, opfpath, log)
input_plugin = plugin_for_input_format(input_fmt)
is_comic = bool(getattr(input_plugin, 'is_image_collection', False))
bookmark_data = None
if save_bookmark_data:
bm_file = 'META-INF/calibre_bookmarks.txt'
if container.exists(bm_file):
with container.open(bm_file, 'rb') as f:
bookmark_data = f.read()
# We do not add zero byte sized files as the IndexedDB API in the
# browser has no good way to distinguish between zero byte files and
# load failures.
excluded_names = {
name for name, mt in iteritems(container.mime_map) if
name == container.opf_name or mt == guess_type('a.ncx') or name.startswith('META-INF/') or
name == 'mimetype' or not container.has_name_and_is_not_empty(name)}
raster_cover_name, titlepage_name = create_cover_page(container, input_fmt.lower(), allow_no_cover, book_metadata)
toc = get_toc(container, verify_destinations=False).to_dict(count())
if not toc or not toc.get('children'):
toc = from_xpaths(container, ['//h:h1', '//h:h2', '//h:h3']).to_dict(count())
spine = [name for name, is_linear in container.spine_names]
spineq = frozenset(spine)
landmarks = [l for l in get_landmarks(container) if l['dest'] in spineq]
book_render_data = {
'version': RENDER_VERSION,
'toc':toc,
'book_format': book_fmt,
'spine':spine,
'link_uid': uuid4(),
'book_hash': book_hash,
'is_comic': is_comic,
'raster_cover_name': raster_cover_name,
'title_page_name': titlepage_name,
'has_maths': False,
'total_length': 0,
'spine_length': 0,
'toc_anchor_map': toc_anchor_map(toc),
'landmarks': landmarks,
'link_to_map': {},
}
def work_priority(name):
# ensure workers with large files or stylesheets
# have fewer names
size = os.path.getsize(container.name_path_map[name])
is_html = container.mime_map.get(name) in OEB_DOCS
return (0 if is_html else 1), size
names = sorted(
(n for n, mt in iteritems(container.mime_map) if mt in OEB_STYLES or mt in OEB_DOCS or mt == 'image/svg+xml'),
key=work_priority)
results = render_manager(names, (tdir, opfpath, virtualize_resources, book_render_data['link_uid']), container)
ltm = book_render_data['link_to_map']
html_data = {}
virtualized_names = set()
def merge_ltm(dest, src):
for k, v in iteritems(src):
if k in dest:
dest[k] |= v
else:
dest[k] = v
for link_to_map, hdata, vnames in results:
html_data.update(hdata)
virtualized_names |= set(vnames)
for k, v in iteritems(link_to_map):
for x in v:
v[x] = set(v[x])
if k in ltm:
merge_ltm(ltm[k], v)
else:
ltm[k] = v
def manifest_data(name):
mt = (container.mime_map.get(name) or 'application/octet-stream').lower()
ans = {
'size':os.path.getsize(container.name_path_map[name]),
'is_virtualized': name in virtualized_names,
'mimetype':mt,
'is_html': mt in OEB_DOCS,
}
if ans['is_html']:
data = html_data[name]
ans['length'] = l = data['length']
book_render_data['total_length'] += l
if name in book_render_data['spine']:
book_render_data['spine_length'] += l
ans['has_maths'] = hm = data['has_maths']
if hm:
book_render_data['has_maths'] = True
ans['anchor_map'] = data['anchor_map']
return ans return ans
if mt not in OEB_DOCS:
return ContainerBase.serialize_item(self, name) book_render_data['files'] = {name:manifest_data(name) for name in set(container.name_path_map) - excluded_names}
root = self.parsed(name) container.commit()
return json.dumps(html_as_dict(root), ensure_ascii=False, separators=(',', ':')).encode('utf-8')
for name in excluded_names:
os.remove(container.name_path_map[name])
ltm = book_render_data['link_to_map']
for name, amap in iteritems(ltm):
for k, v in tuple(iteritems(amap)):
amap[k] = tuple(v) # needed for JSON serialization
data = as_bytes(json.dumps(book_render_data, ensure_ascii=False))
with lopen(os.path.join(container.root, 'calibre-book-manifest.json'), 'wb') as f:
f.write(data)
return container, bookmark_data
def split_name(name): def split_name(name):
@ -566,22 +707,6 @@ def serialize_elem(elem, nsmap):
return ans return ans
def ensure_head(root):
# Make sure we have only a single <head>
heads = list(root.iterchildren(XHTML('head')))
if len(heads) != 1:
if not heads:
root.insert(0, root.makeelement(XHTML('head')))
return root[0]
head = heads[0]
for eh in heads[1:]:
for child in eh.iterchildren('*'):
head.append(child)
extract(eh)
return head
return heads[0]
def ensure_body(root): def ensure_body(root):
# Make sure we have only a single <body> # Make sure we have only a single <body>
bodies = list(root.iterchildren(XHTML('body'))) bodies = list(root.iterchildren(XHTML('body')))
@ -685,6 +810,7 @@ def get_stored_annotations(container):
def render(pathtoebook, output_dir, book_hash=None, serialize_metadata=False, extract_annotations=False, virtualize_resources=True): def render(pathtoebook, output_dir, book_hash=None, serialize_metadata=False, extract_annotations=False, virtualize_resources=True):
with RenderManager() as render_manager:
mi = None mi = None
if serialize_metadata: if serialize_metadata:
from calibre.ebooks.metadata.meta import get_metadata from calibre.ebooks.metadata.meta import get_metadata
@ -692,9 +818,9 @@ def render(pathtoebook, output_dir, book_hash=None, serialize_metadata=False, ex
with lopen(pathtoebook, 'rb') as f, quick_metadata: with lopen(pathtoebook, 'rb') as f, quick_metadata:
mi = get_metadata(f, os.path.splitext(pathtoebook)[1][1:].lower()) mi = get_metadata(f, os.path.splitext(pathtoebook)[1][1:].lower())
book_fmt, opfpath, input_fmt = extract_book(pathtoebook, output_dir, log=default_log) book_fmt, opfpath, input_fmt = extract_book(pathtoebook, output_dir, log=default_log)
container = Container( container, bookmark_data = process_exploded_book(
book_fmt, opfpath, input_fmt, output_dir, book_hash=book_hash, book_fmt, opfpath, input_fmt, output_dir, render_manager,
save_bookmark_data=extract_annotations, book_hash=book_hash, save_bookmark_data=extract_annotations,
book_metadata=mi, virtualize_resources=virtualize_resources book_metadata=mi, virtualize_resources=virtualize_resources
) )
if serialize_metadata: if serialize_metadata:
@ -707,7 +833,7 @@ def render(pathtoebook, output_dir, book_hash=None, serialize_metadata=False, ex
f.write(json_dumps(d)) f.write(json_dumps(d))
if extract_annotations: if extract_annotations:
annotations = None annotations = None
if container.bookmark_data: if bookmark_data:
annotations = json_dumps(tuple(get_stored_annotations(container))) annotations = json_dumps(tuple(get_stored_annotations(container)))
if annotations: if annotations:
with lopen(os.path.join(output_dir, 'calibre-book-annotations.json'), 'wb') as f: with lopen(os.path.join(output_dir, 'calibre-book-annotations.json'), 'wb') as f: