mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Viewer: Run the first read book preparation in parallel
This commit is contained in:
parent
243144f3b9
commit
75dd89722a
@ -7,22 +7,26 @@ from __future__ import absolute_import, division, print_function, unicode_litera
|
|||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
|
import shutil
|
||||||
import sys
|
import sys
|
||||||
|
import time
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from functools import partial
|
from functools import partial
|
||||||
from itertools import count
|
from itertools import count
|
||||||
|
from math import ceil
|
||||||
|
|
||||||
from css_parser import replaceUrls
|
from css_parser import replaceUrls
|
||||||
from css_parser.css import CSSRule
|
from css_parser.css import CSSRule
|
||||||
|
|
||||||
from calibre import force_unicode, prepare_string_for_xml
|
from calibre import detect_ncpus, force_unicode, prepare_string_for_xml
|
||||||
|
from calibre.constants import iswindows
|
||||||
from calibre.customize.ui import plugin_for_input_format
|
from calibre.customize.ui import plugin_for_input_format
|
||||||
from calibre.ebooks import parse_css_length
|
from calibre.ebooks import parse_css_length
|
||||||
from calibre.ebooks.css_transform_rules import StyleDeclaration
|
from calibre.ebooks.css_transform_rules import StyleDeclaration
|
||||||
from calibre.ebooks.oeb.base import (
|
from calibre.ebooks.oeb.base import (
|
||||||
EPUB_NS, OEB_DOCS, OEB_STYLES, OPF, XHTML, XHTML_NS, XLINK, XPath as _XPath, rewrite_links,
|
EPUB_NS, OEB_DOCS, OEB_STYLES, OPF, XHTML, XHTML_NS, XLINK, XPath as _XPath,
|
||||||
urlunquote
|
rewrite_links, urlunquote
|
||||||
)
|
)
|
||||||
from calibre.ebooks.oeb.iterator.book import extract_book
|
from calibre.ebooks.oeb.iterator.book import extract_book
|
||||||
from calibre.ebooks.oeb.polish.container import Container as ContainerBase
|
from calibre.ebooks.oeb.polish.container import Container as ContainerBase
|
||||||
@ -31,9 +35,12 @@ from calibre.ebooks.oeb.polish.cover import (
|
|||||||
)
|
)
|
||||||
from calibre.ebooks.oeb.polish.css import transform_inline_styles
|
from calibre.ebooks.oeb.polish.css import transform_inline_styles
|
||||||
from calibre.ebooks.oeb.polish.toc import from_xpaths, get_landmarks, get_toc
|
from calibre.ebooks.oeb.polish.toc import from_xpaths, get_landmarks, get_toc
|
||||||
from calibre.ebooks.oeb.polish.utils import extract, guess_type
|
from calibre.ebooks.oeb.polish.utils import guess_type
|
||||||
|
from calibre.ptempfile import PersistentTemporaryDirectory
|
||||||
from calibre.srv.metadata import encode_datetime
|
from calibre.srv.metadata import encode_datetime
|
||||||
|
from calibre.srv.opts import grouper
|
||||||
from calibre.utils.date import EPOCH
|
from calibre.utils.date import EPOCH
|
||||||
|
from calibre.utils.ipc.simple_worker import start_pipe_worker
|
||||||
from calibre.utils.iso8601 import parse_iso8601
|
from calibre.utils.iso8601 import parse_iso8601
|
||||||
from calibre.utils.logging import default_log
|
from calibre.utils.logging import default_log
|
||||||
from calibre.utils.serialize import json_loads
|
from calibre.utils.serialize import json_loads
|
||||||
@ -42,7 +49,9 @@ from polyglot.binary import (
|
|||||||
as_base64_unicode as encode_component, from_base64_bytes,
|
as_base64_unicode as encode_component, from_base64_bytes,
|
||||||
from_base64_unicode as decode_component
|
from_base64_unicode as decode_component
|
||||||
)
|
)
|
||||||
from polyglot.builtins import is_py3, iteritems, map, unicode_type
|
from polyglot.builtins import (
|
||||||
|
as_bytes, is_py3, iteritems, itervalues, map, unicode_type
|
||||||
|
)
|
||||||
from polyglot.urllib import quote, urlparse
|
from polyglot.urllib import quote, urlparse
|
||||||
|
|
||||||
RENDER_VERSION = 1
|
RENDER_VERSION = 1
|
||||||
@ -220,95 +229,16 @@ def toc_anchor_map(toc):
|
|||||||
return dict(ans)
|
return dict(ans)
|
||||||
|
|
||||||
|
|
||||||
class Container(ContainerBase):
|
def serialize_parsed_html(root):
    """Return the parsed HTML tree ``root`` as compact JSON bytes.

    The tree is first converted with html_as_dict(); the JSON is written
    without ASCII escaping and without whitespace after separators.
    """
    as_text = json.dumps(
        html_as_dict(root), ensure_ascii=False, separators=(',', ':'))
    return as_bytes(as_text)
|
||||||
|
|
||||||
|
|
||||||
|
class SimpleContainer(ContainerBase):
|
||||||
|
|
||||||
tweak_mode = True
|
tweak_mode = True
|
||||||
|
|
||||||
def __init__(
|
|
||||||
self, book_fmt, opfpath, input_fmt, tdir, log=None, book_hash=None, save_bookmark_data=False,
|
|
||||||
book_metadata=None, allow_no_cover=True, virtualize_resources=True
|
|
||||||
):
|
|
||||||
log = log or default_log
|
|
||||||
self.allow_no_cover = allow_no_cover
|
|
||||||
ContainerBase.__init__(self, tdir, opfpath, log)
|
|
||||||
self.book_metadata = book_metadata
|
|
||||||
input_plugin = plugin_for_input_format(input_fmt)
|
|
||||||
self.is_comic = bool(getattr(input_plugin, 'is_image_collection', False))
|
|
||||||
if save_bookmark_data:
|
|
||||||
bm_file = 'META-INF/calibre_bookmarks.txt'
|
|
||||||
self.bookmark_data = None
|
|
||||||
if self.exists(bm_file):
|
|
||||||
with self.open(bm_file, 'rb') as f:
|
|
||||||
self.bookmark_data = f.read()
|
|
||||||
# We do not add zero byte sized files as the IndexedDB API in the
|
|
||||||
# browser has no good way to distinguish between zero byte files and
|
|
||||||
# load failures.
|
|
||||||
excluded_names = {
|
|
||||||
name for name, mt in iteritems(self.mime_map) if
|
|
||||||
name == self.opf_name or mt == guess_type('a.ncx') or name.startswith('META-INF/') or
|
|
||||||
name == 'mimetype' or not self.has_name_and_is_not_empty(name)}
|
|
||||||
raster_cover_name, titlepage_name = self.create_cover_page(input_fmt.lower())
|
|
||||||
|
|
||||||
toc = get_toc(self).to_dict(count())
|
def create_cover_page(container, input_fmt, allow_no_cover, book_metadata=None):
|
||||||
if not toc or not toc.get('children'):
|
|
||||||
toc = from_xpaths(self, ['//h:h1', '//h:h2', '//h:h3']).to_dict(count())
|
|
||||||
spine = [name for name, is_linear in self.spine_names]
|
|
||||||
spineq = frozenset(spine)
|
|
||||||
landmarks = [l for l in get_landmarks(self) if l['dest'] in spineq]
|
|
||||||
|
|
||||||
self.book_render_data = data = {
|
|
||||||
'version': RENDER_VERSION,
|
|
||||||
'toc':toc,
|
|
||||||
'book_format': book_fmt,
|
|
||||||
'spine':spine,
|
|
||||||
'link_uid': uuid4(),
|
|
||||||
'book_hash': book_hash,
|
|
||||||
'is_comic': self.is_comic,
|
|
||||||
'raster_cover_name': raster_cover_name,
|
|
||||||
'title_page_name': titlepage_name,
|
|
||||||
'has_maths': False,
|
|
||||||
'total_length': 0,
|
|
||||||
'spine_length': 0,
|
|
||||||
'toc_anchor_map': toc_anchor_map(toc),
|
|
||||||
'landmarks': landmarks,
|
|
||||||
'link_to_map': {},
|
|
||||||
}
|
|
||||||
# Mark the spine as dirty since we have to ensure it is normalized
|
|
||||||
for name in data['spine']:
|
|
||||||
self.parsed(name), self.dirty(name)
|
|
||||||
self.virtualized_names = set()
|
|
||||||
self.transform_all(virtualize_resources)
|
|
||||||
|
|
||||||
def manifest_data(name):
|
|
||||||
mt = (self.mime_map.get(name) or 'application/octet-stream').lower()
|
|
||||||
ans = {
|
|
||||||
'size':os.path.getsize(self.name_path_map[name]),
|
|
||||||
'is_virtualized': name in self.virtualized_names,
|
|
||||||
'mimetype':mt,
|
|
||||||
'is_html': mt in OEB_DOCS,
|
|
||||||
}
|
|
||||||
if ans['is_html']:
|
|
||||||
root = self.parsed(name)
|
|
||||||
ans['length'] = l = get_length(root)
|
|
||||||
self.book_render_data['total_length'] += l
|
|
||||||
if name in data['spine']:
|
|
||||||
self.book_render_data['spine_length'] += l
|
|
||||||
ans['has_maths'] = hm = check_for_maths(root)
|
|
||||||
if hm:
|
|
||||||
self.book_render_data['has_maths'] = True
|
|
||||||
ans['anchor_map'] = anchor_map(root)
|
|
||||||
return ans
|
|
||||||
data['files'] = {name:manifest_data(name) for name in set(self.name_path_map) - excluded_names}
|
|
||||||
self.commit()
|
|
||||||
for name in excluded_names:
|
|
||||||
os.remove(self.name_path_map[name])
|
|
||||||
data = json.dumps(self.book_render_data, ensure_ascii=False)
|
|
||||||
if not isinstance(data, bytes):
|
|
||||||
data = data.encode('utf-8')
|
|
||||||
with lopen(os.path.join(self.root, 'calibre-book-manifest.json'), 'wb') as f:
|
|
||||||
f.write(data)
|
|
||||||
|
|
||||||
def create_cover_page(self, input_fmt):
|
|
||||||
templ = '''
|
templ = '''
|
||||||
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
|
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
|
||||||
<head><style>
|
<head><style>
|
||||||
@ -326,9 +256,9 @@ class Container(ContainerBase):
|
|||||||
'''
|
'''
|
||||||
|
|
||||||
def generic_cover():
|
def generic_cover():
|
||||||
if self.book_metadata is not None:
|
if book_metadata is not None:
|
||||||
from calibre.ebooks.covers import create_cover
|
from calibre.ebooks.covers import create_cover
|
||||||
mi = self.book_metadata
|
mi = book_metadata
|
||||||
return create_cover(mi.title, mi.authors, mi.series, mi.series_index)
|
return create_cover(mi.title, mi.authors, mi.series, mi.series_index)
|
||||||
return BLANK_JPEG
|
return BLANK_JPEG
|
||||||
|
|
||||||
@ -336,144 +266,230 @@ class Container(ContainerBase):
|
|||||||
|
|
||||||
def image_callback(cover_image, wrapped_image):
|
def image_callback(cover_image, wrapped_image):
|
||||||
if cover_image:
|
if cover_image:
|
||||||
image_callback.cover_data = self.raw_data(cover_image, decode=False)
|
image_callback.cover_data = container.raw_data(cover_image, decode=False)
|
||||||
if wrapped_image and not getattr(image_callback, 'cover_data', None):
|
if wrapped_image and not getattr(image_callback, 'cover_data', None):
|
||||||
image_callback.cover_data = self.raw_data(wrapped_image, decode=False)
|
image_callback.cover_data = container.raw_data(wrapped_image, decode=False)
|
||||||
|
|
||||||
def cover_path(action, data):
|
def cover_path(action, data):
|
||||||
if action == 'write_image':
|
if action == 'write_image':
|
||||||
cdata = getattr(image_callback, 'cover_data', None) or generic_cover()
|
cdata = getattr(image_callback, 'cover_data', None) or generic_cover()
|
||||||
data.write(cdata)
|
data.write(cdata)
|
||||||
|
|
||||||
if self.allow_no_cover and not has_epub_cover(self):
|
if allow_no_cover and not has_epub_cover(container):
|
||||||
return None, None
|
return None, None
|
||||||
raster_cover_name, titlepage_name = set_epub_cover(
|
raster_cover_name, titlepage_name = set_epub_cover(
|
||||||
self, cover_path, (lambda *a: None), options={'template':templ},
|
container, cover_path, (lambda *a: None), options={'template':templ},
|
||||||
image_callback=image_callback)
|
image_callback=image_callback)
|
||||||
else:
|
else:
|
||||||
raster_cover_name = find_cover_image(self, strict=True)
|
raster_cover_name = find_cover_image(container, strict=True)
|
||||||
if raster_cover_name is None:
|
if raster_cover_name is None:
|
||||||
if self.allow_no_cover:
|
if allow_no_cover:
|
||||||
return None, None
|
return None, None
|
||||||
item = self.generate_item(name='cover.jpeg', id_prefix='cover')
|
item = container.generate_item(name='cover.jpeg', id_prefix='cover')
|
||||||
raster_cover_name = self.href_to_name(item.get('href'), self.opf_name)
|
raster_cover_name = container.href_to_name(item.get('href'), container.opf_name)
|
||||||
with self.open(raster_cover_name, 'wb') as dest:
|
with container.open(raster_cover_name, 'wb') as dest:
|
||||||
dest.write(generic_cover())
|
dest.write(generic_cover())
|
||||||
if self.is_comic:
|
if container.is_comic:
|
||||||
return raster_cover_name, None
|
return raster_cover_name, None
|
||||||
item = self.generate_item(name='titlepage.html', id_prefix='titlepage')
|
item = container.generate_item(name='titlepage.html', id_prefix='titlepage')
|
||||||
titlepage_name = self.href_to_name(item.get('href'), self.opf_name)
|
titlepage_name = container.href_to_name(item.get('href'), container.opf_name)
|
||||||
raw = templ % prepare_string_for_xml(self.name_to_href(raster_cover_name, titlepage_name), True)
|
raw = templ % prepare_string_for_xml(container.name_to_href(raster_cover_name, titlepage_name), True)
|
||||||
with self.open(titlepage_name, 'wb') as f:
|
with container.open(titlepage_name, 'wb') as f:
|
||||||
f.write(raw.encode('utf-8'))
|
f.write(raw.encode('utf-8'))
|
||||||
spine = self.opf_xpath('//opf:spine')[0]
|
spine = container.opf_xpath('//opf:spine')[0]
|
||||||
ref = spine.makeelement(OPF('itemref'), idref=item.get('id'))
|
ref = spine.makeelement(OPF('itemref'), idref=item.get('id'))
|
||||||
self.insert_into_xml(spine, ref, index=0)
|
container.insert_into_xml(spine, ref, index=0)
|
||||||
self.dirty(self.opf_name)
|
container.dirty(container.opf_name)
|
||||||
return raster_cover_name, titlepage_name
|
return raster_cover_name, titlepage_name
|
||||||
|
|
||||||
def transform_html(self, name, virtualize_resources):
|
|
||||||
style_xpath = XPath('//h:style')
|
def transform_style_sheet(container, name, link_uid, virtualize_resources, virtualized_names):
    """Transform the stylesheet ``name`` and rewrite it on disk if needed.

    When ``virtualize_resources`` is true the URLs in the sheet are passed
    through the link replacer; if the replacer recorded ``name`` in its
    changed set, ``name`` is also added to ``virtualized_names``.
    transform_sheet() is always applied. The file is rewritten whenever the
    sheet was modified or it did not already start with an ``@charset`` rule.
    """
    sheet = container.parsed(name)
    needs_write = False

    if virtualize_resources:
        touched = set()
        replace_link = create_link_replacer(container, link_uid, touched)
        replaceUrls(sheet, partial(replace_link, name))
        if name in touched:
            needs_write = True
            virtualized_names.add(name)

    if transform_sheet(sheet):
        needs_write = True

    # Only re-serialize when the parsed sheet was modified, otherwise reuse
    # the bytes already on disk
    data = container.serialize_item(name) if needs_write else container.raw_data(name, decode=False)
    data = data.lstrip()
    if not data.startswith(b'@charset'):
        data = b'@charset "UTF-8";\n' + data
        needs_write = True

    if not needs_write:
        return
    with container.open(name, 'wb') as f:
        f.write(data)
|
||||||
|
|
||||||
|
|
||||||
|
def transform_svg_image(container, name, link_uid, virtualize_resources, virtualized_names):
    """Rewrite the xlink:href attributes of the SVG file ``name``.

    Nothing is done unless ``virtualize_resources`` is true. Every element
    carrying an xlink:href that does not start with ``#`` has that attribute
    replaced by the result of the link replacer. If at least one attribute
    was rewritten, ``name`` is added to ``virtualized_names`` and the item is
    marked dirty and committed.
    """
    if virtualize_resources:
        replace_link = create_link_replacer(container, link_uid, set())
        href_attr = XLINK('href')
        rewrote_any = False
        find_xlinks = XPath('//*[@xl:href]')
        for node in find_xlinks(container.parsed(name)):
            target = node.get(href_attr)
            if target.startswith('#'):
                # Pure fragment references stay as they are
                continue
            node.set(href_attr, replace_link(name, target))
            rewrote_any = True
        if rewrote_any:
            virtualized_names.add(name)
            container.dirty(name)
            container.commit_item(name)
|
||||||
|
|
||||||
|
|
||||||
|
def transform_html(container, name, virtualize_resources, link_uid, link_to_map, virtualized_names):
|
||||||
link_xpath = XPath('//h:a[@href]')
|
link_xpath = XPath('//h:a[@href]')
|
||||||
img_xpath = XPath('//h:img[@src]')
|
img_xpath = XPath('//h:img[@src]')
|
||||||
res_link_xpath = XPath('//h:link[@href]')
|
res_link_xpath = XPath('//h:link[@href]')
|
||||||
root = self.parsed(name)
|
root = container.parsed(name)
|
||||||
head = ensure_head(root)
|
changed_names = set()
|
||||||
changed = False
|
link_replacer = create_link_replacer(container, link_uid, changed_names)
|
||||||
for style in style_xpath(root):
|
|
||||||
# Firefox flakes out sometimes when dynamically creating <style> tags,
|
|
||||||
# so convert them to external stylesheets to ensure they never fail
|
|
||||||
if style.text and (style.get('type') or 'text/css').lower() == 'text/css':
|
|
||||||
in_head = has_ancestor(style, head)
|
|
||||||
if not in_head:
|
|
||||||
extract(style)
|
|
||||||
head.append(style)
|
|
||||||
css = style.text
|
|
||||||
style.clear()
|
|
||||||
style.tag = XHTML('link')
|
|
||||||
style.set('type', 'text/css')
|
|
||||||
style.set('rel', 'stylesheet')
|
|
||||||
sname = self.add_file(name + '.css', css.encode('utf-8'), modify_name_if_needed=True)
|
|
||||||
style.set('href', self.name_to_href(sname, name))
|
|
||||||
changed = True
|
|
||||||
|
|
||||||
# Used for viewing images
|
# Used for viewing images
|
||||||
for img in img_xpath(root):
|
for img in img_xpath(root):
|
||||||
img_name = self.href_to_name(img.get('src'), name)
|
img_name = container.href_to_name(img.get('src'), name)
|
||||||
if img_name:
|
if img_name:
|
||||||
img.set('data-calibre-src', img_name)
|
img.set('data-calibre-src', img_name)
|
||||||
changed = True
|
|
||||||
|
|
||||||
# Disable non stylsheet link tags. This link will not be loaded by the
|
# Disable non-stylesheet link tags. This link will not be loaded by the
|
||||||
# browser anyway and will cause the resource load check to hang
|
# browser anyway and will cause the resource load check to hang
|
||||||
for link in res_link_xpath(root):
|
for link in res_link_xpath(root):
|
||||||
ltype = (link.get('type') or 'text/css').lower()
|
ltype = (link.get('type') or 'text/css').lower()
|
||||||
rel = (link.get('rel') or 'stylesheet').lower()
|
rel = (link.get('rel') or 'stylesheet').lower()
|
||||||
if ltype != 'text/css' or rel != 'stylesheet':
|
if ltype != 'text/css' or rel != 'stylesheet':
|
||||||
link.attrib.clear()
|
link.attrib.clear()
|
||||||
|
|
||||||
|
def transform_and_virtualize_sheet(sheet):
|
||||||
|
changed = transform_sheet(sheet)
|
||||||
|
if virtualize_resources:
|
||||||
|
replaceUrls(sheet, partial(link_replacer, name))
|
||||||
|
if name in changed_names:
|
||||||
|
virtualized_names.add(name)
|
||||||
changed = True
|
changed = True
|
||||||
|
return changed
|
||||||
|
|
||||||
# Transform <style> and style=""
|
# Transform <style> and style=""
|
||||||
if transform_inline_styles(self, name, transform_sheet=transform_sheet, transform_style=transform_declaration):
|
transform_inline_styles(container, name, transform_sheet=transform_and_virtualize_sheet, transform_style=transform_declaration)
|
||||||
changed = True
|
|
||||||
|
|
||||||
if not virtualize_resources:
|
if virtualize_resources:
|
||||||
link_uid = self.book_render_data['link_uid']
|
virtualize_html(container, name, link_uid, link_to_map, virtualized_names)
|
||||||
link_replacer = create_link_replacer(self, link_uid, set())
|
else:
|
||||||
ltm = self.book_render_data['link_to_map']
|
|
||||||
for a in link_xpath(root):
|
for a in link_xpath(root):
|
||||||
href = link_replacer(name, a.get('href'))
|
href = link_replacer(name, a.get('href'))
|
||||||
if href and href.startswith(link_uid):
|
if href and href.startswith(link_uid):
|
||||||
a.set('href', 'javascript:void(0)')
|
a.set('href', 'javascript:void(0)')
|
||||||
parts = decode_url(href.split('|')[1])
|
parts = decode_url(href.split('|')[1])
|
||||||
lname, lfrag = parts[0], parts[1]
|
lname, lfrag = parts[0], parts[1]
|
||||||
ltm.setdefault(lname, {}).setdefault(lfrag or '', set()).add(name)
|
link_to_map.setdefault(lname, {}).setdefault(lfrag or '', set()).add(name)
|
||||||
a.set('data-' + link_uid, json.dumps({'name':lname, 'frag':lfrag}, ensure_ascii=False))
|
a.set('data-' + link_uid, json.dumps({'name':lname, 'frag':lfrag}, ensure_ascii=False))
|
||||||
changed = True
|
|
||||||
|
|
||||||
if changed:
|
shtml = serialize_parsed_html(root)
|
||||||
self.dirty(name)
|
with container.open(name, 'wb') as f:
|
||||||
|
f.write(shtml)
|
||||||
|
|
||||||
def transform_css(self, name):
|
|
||||||
sheet = self.parsed(name)
|
|
||||||
if transform_sheet(sheet):
|
|
||||||
self.dirty(name)
|
|
||||||
|
|
||||||
def transform_all(self, virtualize_resources):
|
class RenderManager(object):
|
||||||
for name, mt in tuple(iteritems(self.mime_map)):
|
|
||||||
mt = mt.lower()
|
|
||||||
if mt in OEB_DOCS:
|
|
||||||
self.transform_html(name, virtualize_resources)
|
|
||||||
for name, mt in tuple(iteritems(self.mime_map)):
|
|
||||||
mt = mt.lower()
|
|
||||||
if mt in OEB_STYLES:
|
|
||||||
self.transform_css(name)
|
|
||||||
if virtualize_resources:
|
|
||||||
self.virtualize_resources()
|
|
||||||
|
|
||||||
ltm = self.book_render_data['link_to_map']
|
def launch_worker(self):
|
||||||
for name, amap in iteritems(ltm):
|
with lopen(os.path.join(self.tdir, '{}.json'.format(len(self.workers))), 'wb') as output:
|
||||||
for k, v in tuple(iteritems(amap)):
|
error = lopen(os.path.join(self.tdir, '{}.error'.format(len(self.workers))), 'wb')
|
||||||
amap[k] = tuple(v) # needed for JSON serialization
|
p = start_pipe_worker('from calibre.srv.render_book import worker_main; worker_main()', stdout=error, stderr=error)
|
||||||
|
p.output_path = output.name
|
||||||
|
p.error_path = error.name
|
||||||
|
self.workers.append(p)
|
||||||
|
|
||||||
def virtualize_resources(self):
|
def __enter__(self):
    """Create the scratch directory and start the first two workers."""
    self.workers = []
    self.tdir = PersistentTemporaryDirectory()
    for _ in range(2):
        self.launch_worker()
    return self
|
||||||
|
|
||||||
|
def __exit__(self, *a):
    """Stop any workers that are still running and remove the scratch directory.

    Workers without a return code are terminated; on non-Windows systems a
    worker that is still alive shortly afterwards is killed. Removal of the
    temporary directory is attempted twice, and a second failure is ignored.
    """
    while self.workers:
        proc = self.workers.pop()
        if proc.returncode is not None:
            continue
        proc.terminate()
        if iswindows or proc.poll() is not None:
            continue
        # Give the process a moment to exit before escalating to kill()
        time.sleep(0.02)
        if proc.poll() is None:
            proc.kill()
    del self.workers

    for final_attempt in (False, True):
        try:
            shutil.rmtree(self.tdir)
        except EnvironmentError:
            if not final_attempt:
                time.sleep(0.1)
        else:
            break
    del self.tdir
|
||||||
|
|
||||||
|
def __call__(self, names, args, in_process_container):
    """Process ``names``, either in this process or spread over the workers.

    :param names: names of the book files to process
    :param args: tuple of extra positional arguments forwarded to
        process_book_files() after the list of names
    :param in_process_container: container used to look up file sizes and
        to do the work when it is done in this process
    :return: list with one process_book_files() result per job that ran
    :raises Exception: if a worker that was given a job exits with a
        non-zero status; the message carries that worker's error output
    """
    num_workers = min(detect_ncpus(), len(names))
    if num_workers > 1:
        # Not worth the overhead of worker processes for small books
        total_sz = sum(os.path.getsize(in_process_container.name_path_map[n]) for n in names)
        if total_sz < 128 * 1024:
            num_workers = 1
    if num_workers <= 1:
        # Also covers an empty list of names (num_workers == 0), which
        # would otherwise cause a division by zero when computing group_sz
        return [process_book_files(names, *args, container=in_process_container)]
    while len(self.workers) < num_workers:
        self.launch_worker()

    group_sz = int(ceil(len(names) / num_workers))
    for group, worker in zip(grouper(group_sz, names), self.workers):
        # The job is sent as a single JSON document on stdin, closing
        # stdin tells the worker the job is complete
        worker.stdin.write(as_bytes(json.dumps((worker.output_path, group,) + args)))
        worker.stdin.flush(), worker.stdin.close()
        worker.job_sent = True

    for worker in self.workers:
        if not hasattr(worker, 'job_sent'):
            # b'_' tells a worker that got no job to exit immediately
            worker.stdin.write(b'_'), worker.stdin.flush(), worker.stdin.close()

    error = None
    results = []
    for worker in self.workers:
        if not hasattr(worker, 'job_sent'):
            worker.wait()
            continue
        if worker.wait() != 0:
            with lopen(worker.error_path, 'rb') as f:
                error = f.read().decode('utf-8', 'replace')
        else:
            with lopen(worker.output_path, 'rb') as f:
                results.append(json.loads(f.read()))
    if error is not None:
        raise Exception('Render worker failed with error:\n' + error)
    return results
|
||||||
|
|
||||||
|
|
||||||
|
def worker_main():
    """Entry point of a render worker process.

    Reads the whole of stdin as bytes. The single byte ``_`` means there is
    no job and the function returns at once. Otherwise the input is a JSON
    array whose first item is the output path and whose remaining items are
    the arguments for process_book_files(); the result is written to the
    output path as JSON.
    """
    source = sys.stdin.buffer if hasattr(sys.stdin, 'buffer') else sys.stdin
    payload = source.read()
    if payload == b'_':
        return
    job = json.loads(payload)
    output_path, work_args = job[0], job[1:]
    outcome = process_book_files(*work_args)
    with open(output_path, 'wb') as f:
        f.write(as_bytes(json.dumps(outcome)))
|
||||||
|
|
||||||
|
|
||||||
|
def virtualize_html(container, name, link_uid, link_to_map, virtualized_names):
|
||||||
|
|
||||||
changed = set()
|
changed = set()
|
||||||
link_uid = self.book_render_data['link_uid']
|
|
||||||
xlink_xpath = XPath('//*[@xl:href]')
|
|
||||||
link_xpath = XPath('//h:a[@href]')
|
link_xpath = XPath('//h:a[@href]')
|
||||||
link_replacer = create_link_replacer(self, link_uid, changed)
|
link_replacer = create_link_replacer(container, link_uid, changed)
|
||||||
|
|
||||||
ltm = self.book_render_data['link_to_map']
|
virtualized_names.add(name)
|
||||||
|
root = container.parsed(name)
|
||||||
for name, mt in iteritems(self.mime_map):
|
|
||||||
mt = mt.lower()
|
|
||||||
if mt in OEB_STYLES:
|
|
||||||
replaceUrls(self.parsed(name), partial(link_replacer, name))
|
|
||||||
self.virtualized_names.add(name)
|
|
||||||
elif mt in OEB_DOCS:
|
|
||||||
self.virtualized_names.add(name)
|
|
||||||
root = self.parsed(name)
|
|
||||||
rewrite_links(root, partial(link_replacer, name))
|
rewrite_links(root, partial(link_replacer, name))
|
||||||
for a in link_xpath(root):
|
for a in link_xpath(root):
|
||||||
href = a.get('href')
|
href = a.get('href')
|
||||||
@ -481,36 +497,161 @@ class Container(ContainerBase):
|
|||||||
a.set('href', 'javascript:void(0)')
|
a.set('href', 'javascript:void(0)')
|
||||||
parts = decode_url(href.split('|')[1])
|
parts = decode_url(href.split('|')[1])
|
||||||
lname, lfrag = parts[0], parts[1]
|
lname, lfrag = parts[0], parts[1]
|
||||||
ltm.setdefault(lname, {}).setdefault(lfrag or '', set()).add(name)
|
link_to_map.setdefault(lname, {}).setdefault(lfrag or '', set()).add(name)
|
||||||
a.set('data-' + link_uid, json.dumps({'name':lname, 'frag':lfrag}, ensure_ascii=False))
|
a.set('data-' + link_uid, json.dumps({'name':lname, 'frag':lfrag}, ensure_ascii=False))
|
||||||
else:
|
else:
|
||||||
a.set('target', '_blank')
|
a.set('target', '_blank')
|
||||||
a.set('rel', 'noopener noreferrer')
|
a.set('rel', 'noopener noreferrer')
|
||||||
|
|
||||||
|
return name in changed
|
||||||
|
|
||||||
|
|
||||||
|
def process_book_files(names, container_dir, opfpath, virtualize_resources, link_uid, container=None):
|
||||||
|
container = container or SimpleContainer(container_dir, opfpath, default_log)
|
||||||
|
link_to_map = {}
|
||||||
|
html_data = {}
|
||||||
|
virtualized_names = set()
|
||||||
|
for name in names:
|
||||||
|
if name is None:
|
||||||
|
continue
|
||||||
|
mt = container.mime_map[name].lower()
|
||||||
|
if mt in OEB_DOCS:
|
||||||
|
root = container.parsed(name)
|
||||||
|
html_data[name] = {
|
||||||
|
'length': get_length(root),
|
||||||
|
'has_maths': check_for_maths(root),
|
||||||
|
'anchor_map': anchor_map(root)
|
||||||
|
}
|
||||||
|
transform_html(container, name, virtualize_resources, link_uid, link_to_map, virtualized_names)
|
||||||
|
elif mt in OEB_STYLES:
|
||||||
|
transform_style_sheet(container, name, link_uid, virtualize_resources, virtualized_names)
|
||||||
elif mt == 'image/svg+xml':
|
elif mt == 'image/svg+xml':
|
||||||
self.virtualized_names.add(name)
|
transform_svg_image(container, name, link_uid, virtualize_resources, virtualized_names)
|
||||||
xlink = XLINK('href')
|
for v in itervalues(link_to_map):
|
||||||
altered = False
|
for k in v:
|
||||||
for elem in xlink_xpath(self.parsed(name)):
|
v[k] = tuple(v[k])
|
||||||
href = elem.get(xlink)
|
return link_to_map, html_data, tuple(virtualized_names)
|
||||||
if not href.startswith('#'):
|
|
||||||
elem.set(xlink, link_replacer(name, href))
|
|
||||||
altered = True
|
|
||||||
if altered:
|
|
||||||
changed.add(name)
|
|
||||||
|
|
||||||
tuple(map(self.dirty, changed))
|
|
||||||
|
|
||||||
def serialize_item(self, name):
|
def process_exploded_book(
|
||||||
mt = (self.mime_map[name] or '').lower()
|
book_fmt, opfpath, input_fmt, tdir, render_manager, log=None, book_hash=None, save_bookmark_data=False,
|
||||||
if mt in OEB_STYLES:
|
book_metadata=None, allow_no_cover=True, virtualize_resources=True
|
||||||
ans = ContainerBase.serialize_item(self, name).lstrip()
|
):
|
||||||
if not ans.startswith(b'@charset'):
|
log = log or default_log
|
||||||
ans = b'@charset "UTF-8";\n' + ans
|
container = SimpleContainer(tdir, opfpath, log)
|
||||||
|
input_plugin = plugin_for_input_format(input_fmt)
|
||||||
|
is_comic = bool(getattr(input_plugin, 'is_image_collection', False))
|
||||||
|
bookmark_data = None
|
||||||
|
if save_bookmark_data:
|
||||||
|
bm_file = 'META-INF/calibre_bookmarks.txt'
|
||||||
|
if container.exists(bm_file):
|
||||||
|
with container.open(bm_file, 'rb') as f:
|
||||||
|
bookmark_data = f.read()
|
||||||
|
|
||||||
|
# We do not add zero byte sized files as the IndexedDB API in the
|
||||||
|
# browser has no good way to distinguish between zero byte files and
|
||||||
|
# load failures.
|
||||||
|
excluded_names = {
|
||||||
|
name for name, mt in iteritems(container.mime_map) if
|
||||||
|
name == container.opf_name or mt == guess_type('a.ncx') or name.startswith('META-INF/') or
|
||||||
|
name == 'mimetype' or not container.has_name_and_is_not_empty(name)}
|
||||||
|
raster_cover_name, titlepage_name = create_cover_page(container, input_fmt.lower(), allow_no_cover, book_metadata)
|
||||||
|
|
||||||
|
toc = get_toc(container, verify_destinations=False).to_dict(count())
|
||||||
|
if not toc or not toc.get('children'):
|
||||||
|
toc = from_xpaths(container, ['//h:h1', '//h:h2', '//h:h3']).to_dict(count())
|
||||||
|
spine = [name for name, is_linear in container.spine_names]
|
||||||
|
spineq = frozenset(spine)
|
||||||
|
landmarks = [l for l in get_landmarks(container) if l['dest'] in spineq]
|
||||||
|
|
||||||
|
book_render_data = {
|
||||||
|
'version': RENDER_VERSION,
|
||||||
|
'toc':toc,
|
||||||
|
'book_format': book_fmt,
|
||||||
|
'spine':spine,
|
||||||
|
'link_uid': uuid4(),
|
||||||
|
'book_hash': book_hash,
|
||||||
|
'is_comic': is_comic,
|
||||||
|
'raster_cover_name': raster_cover_name,
|
||||||
|
'title_page_name': titlepage_name,
|
||||||
|
'has_maths': False,
|
||||||
|
'total_length': 0,
|
||||||
|
'spine_length': 0,
|
||||||
|
'toc_anchor_map': toc_anchor_map(toc),
|
||||||
|
'landmarks': landmarks,
|
||||||
|
'link_to_map': {},
|
||||||
|
}
|
||||||
|
|
||||||
|
def work_priority(name):
    """Sort key for the files handed to the render workers.

    HTML files (mime type in OEB_DOCS) sort before all other files; within
    each of the two groups files are ordered by size on disk, smallest first.
    """
    # NOTE(review): the intent appears to be that workers which receive
    # large files or stylesheets end up with fewer names -- confirm
    size = os.path.getsize(container.name_path_map[name])
    is_html = container.mime_map.get(name) in OEB_DOCS
    return (0 if is_html else 1), size
|
||||||
|
|
||||||
|
names = sorted(
|
||||||
|
(n for n, mt in iteritems(container.mime_map) if mt in OEB_STYLES or mt in OEB_DOCS or mt == 'image/svg+xml'),
|
||||||
|
key=work_priority)
|
||||||
|
|
||||||
|
results = render_manager(names, (tdir, opfpath, virtualize_resources, book_render_data['link_uid']), container)
|
||||||
|
ltm = book_render_data['link_to_map']
|
||||||
|
html_data = {}
|
||||||
|
virtualized_names = set()
|
||||||
|
|
||||||
|
def merge_ltm(dest, src):
    """Merge the mapping ``src`` into ``dest`` in place.

    Keys missing from ``dest`` are copied over as is; for keys present in
    both, the value in ``dest`` is updated with ``|=``.
    """
    for key, incoming in iteritems(src):
        if key not in dest:
            dest[key] = incoming
            continue
        dest[key] |= incoming
|
||||||
|
|
||||||
|
for link_to_map, hdata, vnames in results:
|
||||||
|
html_data.update(hdata)
|
||||||
|
virtualized_names |= set(vnames)
|
||||||
|
for k, v in iteritems(link_to_map):
|
||||||
|
for x in v:
|
||||||
|
v[x] = set(v[x])
|
||||||
|
if k in ltm:
|
||||||
|
merge_ltm(ltm[k], v)
|
||||||
|
else:
|
||||||
|
ltm[k] = v
|
||||||
|
|
||||||
|
def manifest_data(name):
|
||||||
|
mt = (container.mime_map.get(name) or 'application/octet-stream').lower()
|
||||||
|
ans = {
|
||||||
|
'size':os.path.getsize(container.name_path_map[name]),
|
||||||
|
'is_virtualized': name in virtualized_names,
|
||||||
|
'mimetype':mt,
|
||||||
|
'is_html': mt in OEB_DOCS,
|
||||||
|
}
|
||||||
|
if ans['is_html']:
|
||||||
|
data = html_data[name]
|
||||||
|
ans['length'] = l = data['length']
|
||||||
|
book_render_data['total_length'] += l
|
||||||
|
if name in book_render_data['spine']:
|
||||||
|
book_render_data['spine_length'] += l
|
||||||
|
ans['has_maths'] = hm = data['has_maths']
|
||||||
|
if hm:
|
||||||
|
book_render_data['has_maths'] = True
|
||||||
|
ans['anchor_map'] = data['anchor_map']
|
||||||
return ans
|
return ans
|
||||||
if mt not in OEB_DOCS:
|
|
||||||
return ContainerBase.serialize_item(self, name)
|
book_render_data['files'] = {name:manifest_data(name) for name in set(container.name_path_map) - excluded_names}
|
||||||
root = self.parsed(name)
|
container.commit()
|
||||||
return json.dumps(html_as_dict(root), ensure_ascii=False, separators=(',', ':')).encode('utf-8')
|
|
||||||
|
for name in excluded_names:
|
||||||
|
os.remove(container.name_path_map[name])
|
||||||
|
|
||||||
|
ltm = book_render_data['link_to_map']
|
||||||
|
for name, amap in iteritems(ltm):
|
||||||
|
for k, v in tuple(iteritems(amap)):
|
||||||
|
amap[k] = tuple(v) # needed for JSON serialization
|
||||||
|
|
||||||
|
data = as_bytes(json.dumps(book_render_data, ensure_ascii=False))
|
||||||
|
with lopen(os.path.join(container.root, 'calibre-book-manifest.json'), 'wb') as f:
|
||||||
|
f.write(data)
|
||||||
|
|
||||||
|
return container, bookmark_data
|
||||||
|
|
||||||
|
|
||||||
def split_name(name):
|
def split_name(name):
|
||||||
@ -566,22 +707,6 @@ def serialize_elem(elem, nsmap):
|
|||||||
return ans
|
return ans
|
||||||
|
|
||||||
|
|
||||||
def ensure_head(root):
|
|
||||||
# Make sure we have only a single <head>
|
|
||||||
heads = list(root.iterchildren(XHTML('head')))
|
|
||||||
if len(heads) != 1:
|
|
||||||
if not heads:
|
|
||||||
root.insert(0, root.makeelement(XHTML('head')))
|
|
||||||
return root[0]
|
|
||||||
head = heads[0]
|
|
||||||
for eh in heads[1:]:
|
|
||||||
for child in eh.iterchildren('*'):
|
|
||||||
head.append(child)
|
|
||||||
extract(eh)
|
|
||||||
return head
|
|
||||||
return heads[0]
|
|
||||||
|
|
||||||
|
|
||||||
def ensure_body(root):
|
def ensure_body(root):
|
||||||
# Make sure we have only a single <body>
|
# Make sure we have only a single <body>
|
||||||
bodies = list(root.iterchildren(XHTML('body')))
|
bodies = list(root.iterchildren(XHTML('body')))
|
||||||
@ -685,6 +810,7 @@ def get_stored_annotations(container):
|
|||||||
|
|
||||||
|
|
||||||
def render(pathtoebook, output_dir, book_hash=None, serialize_metadata=False, extract_annotations=False, virtualize_resources=True):
|
def render(pathtoebook, output_dir, book_hash=None, serialize_metadata=False, extract_annotations=False, virtualize_resources=True):
|
||||||
|
with RenderManager() as render_manager:
|
||||||
mi = None
|
mi = None
|
||||||
if serialize_metadata:
|
if serialize_metadata:
|
||||||
from calibre.ebooks.metadata.meta import get_metadata
|
from calibre.ebooks.metadata.meta import get_metadata
|
||||||
@ -692,9 +818,9 @@ def render(pathtoebook, output_dir, book_hash=None, serialize_metadata=False, ex
|
|||||||
with lopen(pathtoebook, 'rb') as f, quick_metadata:
|
with lopen(pathtoebook, 'rb') as f, quick_metadata:
|
||||||
mi = get_metadata(f, os.path.splitext(pathtoebook)[1][1:].lower())
|
mi = get_metadata(f, os.path.splitext(pathtoebook)[1][1:].lower())
|
||||||
book_fmt, opfpath, input_fmt = extract_book(pathtoebook, output_dir, log=default_log)
|
book_fmt, opfpath, input_fmt = extract_book(pathtoebook, output_dir, log=default_log)
|
||||||
container = Container(
|
container, bookmark_data = process_exploded_book(
|
||||||
book_fmt, opfpath, input_fmt, output_dir, book_hash=book_hash,
|
book_fmt, opfpath, input_fmt, output_dir, render_manager,
|
||||||
save_bookmark_data=extract_annotations,
|
book_hash=book_hash, save_bookmark_data=extract_annotations,
|
||||||
book_metadata=mi, virtualize_resources=virtualize_resources
|
book_metadata=mi, virtualize_resources=virtualize_resources
|
||||||
)
|
)
|
||||||
if serialize_metadata:
|
if serialize_metadata:
|
||||||
@ -707,7 +833,7 @@ def render(pathtoebook, output_dir, book_hash=None, serialize_metadata=False, ex
|
|||||||
f.write(json_dumps(d))
|
f.write(json_dumps(d))
|
||||||
if extract_annotations:
|
if extract_annotations:
|
||||||
annotations = None
|
annotations = None
|
||||||
if container.bookmark_data:
|
if bookmark_data:
|
||||||
annotations = json_dumps(tuple(get_stored_annotations(container)))
|
annotations = json_dumps(tuple(get_stored_annotations(container)))
|
||||||
if annotations:
|
if annotations:
|
||||||
with lopen(os.path.join(output_dir, 'calibre-book-annotations.json'), 'wb') as f:
|
with lopen(os.path.join(output_dir, 'calibre-book-annotations.json'), 'wb') as f:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user