mirror of
https://github.com/kovidgoyal/calibre.git
synced 2026-03-10 11:53:41 -04:00
Also fix ignore_text not recursing and add some performance improvements to the python function for extracting searchable text
456 lines
16 KiB
Python
456 lines
16 KiB
Python
# vim:fileencoding=utf-8
|
|
# License: GPL v3 Copyright: 2016, Kovid Goyal <kovid at kovidgoyal.net>
|
|
from __python__ import hash_literals
|
|
|
|
from elementmaker import E
|
|
from encodings import base64decode, utf8_decode
|
|
|
|
from dom import clear, remove_all_attributes
|
|
from read_book.globals import runtime, ui_operations
|
|
from read_book.settings import opts
|
|
|
|
# MIME type string; judging by its name it labels XHTML that has been
# serialized to JSON. It is not referenced by any other code visible here.
JSON_XHTML_MIMETYPE = 'application/calibre+xhtml+json'
|
|
|
|
def decode_component(x):
    # Undo the component encoding: run x through base64decode() and feed the
    # result to utf8_decode(), returning whatever that produces.
    raw = base64decode(x)
    return utf8_decode(raw)
|
|
|
|
def decode_url(x):
    # Split an encoded link of the form '<encoded name>#<fragment>' and return
    # the pair (decoded name, fragment). The fragment is '' when absent.
    #
    # NOTE(review): with the native JavaScript String.prototype.split a limit
    # of 1 yields only the first piece, in which case the fragment would
    # always be ''. Confirm which split implementation is active here.
    pieces = x.split('#', 1)
    decoded_name = decode_component(pieces[0])
    fragment = pieces[1] or ''
    return decoded_name, fragment
|
|
|
|
def create_link_pat(book):
    # Build a global regular expression matching '<link_uid>|<payload>|',
    # where link_uid comes from the book manifest and the payload (capture
    # group 1) is any run of characters other than '|'.
    source = book.manifest.link_uid + r'\|([^|]+)\|'
    return RegExp(source, 'g')
|
|
|
|
def load_resources(book, root_name, previous_resources, proceed):
    # Asynchronously gather root_name and every resource reachable from it
    # through virtualized links, then call proceed() with a map of
    # name -> [data, mimetype].
    #
    # book: object whose manifest supplies link_uid and the per-file records
    # root_name: name of the first resource to load
    # previous_resources: map of already loaded resources; matching entries
    #     are reused instead of being fetched again, and the map is emptied
    #     once loading finishes
    # proceed: callback that receives the completed map (called through
    #     load_mathjax() when the root file record has has_maths set)
    ans = Object.create(None)
    pending_resources = v'[root_name]'
    link_pat = create_link_pat(book)

    def do_one():
        # Process the next queued name; an empty queue means we are done
        name = pending_resources.shift()
        if not name:
            for k in previous_resources:
                v'delete previous_resources[k]'
            if book.manifest.files[root_name].has_maths:
                return load_mathjax(book, ans, proceed)
            return proceed(ans)
        if ans[name]:
            # Already loaded during this run
            return setTimeout(do_one, 0)
        if previous_resources[name]:
            ans[name] = data = previous_resources[name]
            if jstype(data[0]) is 'string':
                find_virtualized_resources(data[0])
            return setTimeout(do_one, 0)
        ui_operations.get_file(book, name, got_one)

    def got_one(data, name, mimetype):
        # Callback for ui_operations.get_file(): store the result and queue
        # any links found in virtualized text resources
        ans[name] = v'[data, mimetype]'
        if jstype(data) is 'string' and book.manifest.files[name]?.is_virtualized:
            find_virtualized_resources(data)
        return setTimeout(do_one, 0)

    def find_virtualized_resources(text):
        # Queue every resource linked from text that is not queued already
        seen = set()
        # pending_resources holds plain name strings, so the set has to be
        # built from the entries themselves. Reading a .name property off
        # them yields undefined and the duplicate check below never matches.
        already_pending = {x for x in pending_resources}
        link_pat.lastIndex = 0
        while True:
            m = link_pat.exec(text)
            if not m:
                break
            name = decode_url(m[1])[0]
            if name in seen or name in already_pending:
                continue
            seen.add(name)
            pending_resources.push(name)

    do_one()
|
|
|
|
# Cache for the data handed to the ui_operations.get_mathjax_files() callback;
# stays None until load_mathjax() has fetched it once.
mathjax_data = None
|
|
|
|
def load_mathjax(book, resource_data, proceed):
    # Store the MathJax files in resource_data under the '..mathjax-files..'
    # key and then call proceed(resource_data). The files are requested from
    # ui_operations only while the module level cache is still None.
    # The book argument is not used.
    if mathjax_data is not None:
        resource_data['..mathjax-files..'] = mathjax_data
        proceed(resource_data)
        return

    def on_files_received(files):
        nonlocal mathjax_data
        # Remember the files so later calls can skip the request
        mathjax_data = files
        resource_data['..mathjax-files..'] = files
        proceed(resource_data)

    ui_operations.get_mathjax_files(on_files_received)
|
|
|
|
def finalize_resources(book, root_name, resource_data):
    # Turn the loaded resources into blob URLs, rewriting the placeholder
    # links inside text resources so they point at the blob URLs of their
    # dependencies.
    #
    # Returns (root_data, mathjax, blob_url_map): the JSON-parsed text of
    # root_name, the value stored under '..mathjax-files..' and a map of
    # resource name -> blob URL. resource_data is consumed along the way.
    link_pat = create_link_pat(book)
    blob_url_map = Object.create(None)
    root_data = None
    mathjax = resource_data['..mathjax-files..']
    v'delete resource_data["..mathjax-files.."]'

    # Anything that is not a string needs no link rewriting, so it gets its
    # blob URL straight away
    for name in resource_data:
        entry = resource_data[name]
        if jstype(entry[0]) is not 'string':
            blob_url_map[name] = window.URL.createObjectURL(entry[0])
    for name in blob_url_map:
        v'delete resource_data[name]'

    def add_virtualized_resource(name, text, mimetype):
        # The root resource is parsed as JSON; everything else becomes a blob
        nonlocal root_data
        if name is not root_name:
            blob_url_map[name] = window.URL.createObjectURL(Blob([text], {'type': mimetype}))
            return
        root_data = JSON.parse(text)

    def replace_deps(text):
        # Swap every link whose target already has a blob URL and report the
        # names of the targets that do not have one yet
        unresolved_deps = set()
        replacements = v'[]'
        link_pat.lastIndex = 0
        m = link_pat.exec(text)
        while m:
            dname, frag = decode_url(m[1])
            rtext = blob_url_map[dname]
            if rtext:
                if frag:
                    rtext += '#' + frag
                replacements.push(v'[m.index, m[0].length, rtext]')
            else:
                unresolved_deps.add(dname)
            m = link_pat.exec(text)
        # Apply back to front so the earlier match offsets stay valid
        for start, length, url in reversed(replacements):
            text = text[:start] + url + text[start + length:]
        return unresolved_deps, text

    unresolved_deps_map = {}

    def has_unresolvable_deps(name):
        # True when an earlier pass recorded a dependency of name that still
        # has no blob URL
        deps = unresolved_deps_map[name]
        if deps and deps.length:
            for dep in deps:
                if not blob_url_map[dep]:
                    return True
        return False

    # Resolve in passes: each pass finalizes the resources whose dependencies
    # are all available, until nothing is left or no progress is made
    while True:
        resolved = v'[]'
        num = 0
        for name in resource_data:
            if blob_url_map[name]:
                continue
            num += 1
            if has_unresolvable_deps(name):
                continue
            text, mimetype = resource_data[name]
            unresolved_deps, text = replace_deps(text)
            unresolved_deps_map[name] = unresolved_deps
            if not unresolved_deps.length:
                add_virtualized_resource(name, text, mimetype)
                resolved.push(name)
        if not num:
            break
        if not resolved.length:
            unresolved = [name for name in resource_data if not blob_url_map[name]]
            print('ERROR: Could not resolve all dependencies of {} because of a cyclic dependency. Remaining deps: {}'.format(root_name, unresolved))
            # Add the items anyway, without resolving remaining deps
            for name in resource_data:
                if blob_url_map[name]:
                    continue
                text, mimetype = resource_data[name]
                text = replace_deps(text)[1]
                add_virtualized_resource(name, text, mimetype)
            break
        for name in resolved:
            v'delete resource_data[name]'

    return root_data, mathjax, blob_url_map
|
|
|
|
# Lookup table of script MIME types; process_stack() consults it to decide
# whether the text of a <script> is assigned through elem.text.
js_types = {k: True for k in 'text/javascript text/ecmascript application/javascript application/ecmascript'.split(' ')}
|
|
# Tag name -> attribute carrying the resource reference; process_stack() waits
# for the load/error event of elements that have the listed attribute.
resource_tag_names = {'script':'src', 'link':'href', 'img':'src', 'image':'xlink:href'}
|
|
# Namespace URI -> prefix (without the trailing ':'); get_prefix() adds a
# generated 'ns<N>' entry for every URI that is not listed here.
ns_rmap = {'http://www.w3.org/2000/svg':'svg', 'http://www.w3.org/1999/xlink':'xlink', 'http://www.w3.org/1998/Math/MathML':'math', 'http://www.w3.org/XML/1998/namespace': 'xml', 'http://www.idpf.org/2007/ops': 'epub'}
|
|
# Running number used by get_prefix() when it generates a new 'ns<N>' prefix
ns_count = 0
|
|
# When true, apply_attributes() skips non-namespaced 'title' and 'alt'
# attributes; unserialize_html() refreshes it from opts.hide_tooltips.
hide_tooltips = False
|
|
|
|
def get_prefix(ns):
    # Return the prefix registered for the namespace URI ns, including the
    # trailing ':'. Unknown namespaces are given the next 'ns<N>' prefix,
    # which is recorded in ns_rmap for later calls.
    nonlocal ns_count
    prefix = ns_rmap[ns]
    if prefix:
        return prefix + ':'
    prefix = 'ns' + ns_count
    ns_rmap[ns] = prefix
    ns_count += 1
    return prefix + ':'
|
|
|
|
def apply_attributes(src, elem, ns_map):
    # Copy the serialized attributes in src.a onto the DOM element elem.
    # Each entry is indexed as [name, value, namespace key]; a truthy
    # namespace key is looked up in ns_map and the attribute is set with the
    # matching prefix. While hide_tooltips is on, plain 'title' and 'alt'
    # attributes are left out.
    if not src.a:
        return
    for attr in src.a:
        ns_key = attr[2]
        if ns_key:
            namespace = ns_map[ns_key]
            elem.setAttributeNS(namespace, get_prefix(namespace) + attr[0], attr[1])
            continue
        name = attr[0]
        is_tooltip = name is 'title' or name is 'alt'
        if not (hide_tooltips and is_tooltip):
            elem.setAttribute(name, attr[1])
|
|
|
|
|
|
def is_loadable_link(attributes):
    # Report whether the serialized attributes contain a rel attribute with
    # 'stylesheet' among its space separated values (compared case
    # insensitively).
    for attr in attributes:
        if attr[0].toLowerCase() is not 'rel' or not attr[1]:
            continue
        for token in attr[1].split(' '):
            if token.toLowerCase() is 'stylesheet':
                return True
    return False
|
|
|
|
|
|
def process_stack(stack, tag_map, ns_map, load_required, onload):
    # Build DOM nodes for the [node, parent] pairs on stack until it is empty.
    #
    # stack: array of [serialized node, DOM parent] pairs, consumed from the end
    # tag_map: for the legacy format, maps the tag id in node[0] to the
    #     serialized element while node[1:] are its children; falsy for the
    #     current format, where node is the serialized element itself and
    #     ids are taken from the process_stack.tag_id counter
    # ns_map: maps the namespace key stored in an element to a namespace URI
    # load_required: set that collects the ids of elements whose load/error
    #     event has to be awaited
    # onload: listener attached to those elements, bound with the id as this
    while stack.length:
        node, parent = stack.pop()
        if tag_map:
            tag_id = node[0]
            src = tag_map[tag_id]
        else:
            src = node
            tag_id = v'process_stack.tag_id++'

        if not src.s:
            elem = document.createElement(src.n)
        elif src.n:
            elem = document.createElementNS(ns_map[src.s], src.n)
        else:
            # Namespaced entry without a name: no element is created, only
            # its trailing text is kept
            if src.l:
                parent.appendChild(document.createTextNode(src.l))
            continue

        # An element is loadable when it carries the resource attribute
        # listed for its tag; <link> must additionally be a stylesheet
        loadable = False
        url_attr = resource_tag_names[src.n]
        if url_attr:
            if url_attr.indexOf(':') != -1:
                url_attr = url_attr.replace('xlink:', '')
            if src.a:
                for a in src.a:
                    if a[0] is not url_attr:
                        continue
                    if src.n is 'link':
                        loadable = is_loadable_link(src.a)
                    else:
                        loadable = True
                    break
        if loadable:
            load_required.add(tag_id)
            listener = onload.bind(tag_id)
            elem.addEventListener('load', listener)
            elem.addEventListener('error', listener)

        apply_attributes(src, elem, ns_map)
        parent.appendChild(elem)

        if src.x:
            # Leading text of the element
            is_js = False
            if src.n is 'script':
                script_type = (elem.getAttribute('type') or 'text/javascript').toLowerCase()
                is_js = js_types[script_type] is True
            if is_js:
                elem.text = src.x
            else:
                elem.appendChild(document.createTextNode(src.x))
        if src.l:
            # Text that follows the element inside its parent
            parent.appendChild(document.createTextNode(src.l))

        # Children are pushed last to first so that they are popped in
        # document order
        if tag_map:
            for v'var i = node.length - 1; i >= 1; i--':  # noqa: unused-local
                stack.push(v'[node[i], elem]')
        elif node.c:
            for v'var i = node.c.length; i-- > 0;':  # noqa: unused-local
                stack.push(v'[node.c[i], elem]')
|
|
|
|
|
|
def unserialize_html(serialized_data, proceed, postprocess_dom, root_name):
    # Rebuild the current document from serialized_data and call proceed()
    # once every loadable resource has fired load/error, or after
    # hang_timeout seconds at the latest.
    #
    # serialized_data: object with tree and ns_map; data that has a tag_map
    #     is handed to unserialize_html_legacy() instead
    # proceed: callback without arguments, called exactly once
    # postprocess_dom: optional callback run after the DOM has been built
    # root_name: name of the file being shown, used for the <base> href in
    #     the standalone viewer
    nonlocal hide_tooltips
    hide_tooltips = opts.hide_tooltips
    if serialized_data.tag_map:
        return unserialize_html_legacy(serialized_data, proceed, postprocess_dom, root_name)
    ns_map = serialized_data.ns_map
    html = serialized_data.tree

    # Start from an empty document
    remove_all_attributes(document.documentElement, document.head, document.body)
    clear(document.head, document.body)
    apply_attributes(html, document.documentElement, ns_map)

    # The browser scrollbars get hidden after loading anyway; hiding them
    # up front avoids extra layouts
    scrollbar_style = E.style(type='text/css', 'html::-webkit-scrollbar, body::-webkit-scrollbar { display: none !important }')
    document.head.appendChild(scrollbar_style)

    if runtime.is_standalone_viewer and root_name and root_name.indexOf('/') > -1:
        base = window.location.pathname.rpartition('/')[0]
        base = f'{window.location.protocol}//{window.location.hostname}{base}/' + root_name
        document.head.appendChild(E.base(href=base))

    # Default stylesheet. Not added in the standalone viewer, where the
    # default font family comes from the viewer settings
    if not runtime.is_standalone_viewer:
        font_style = E.style(type='text/css', 'html {{ font-family: {} }}'.format(window.default_font_family or "sans-serif"))
        document.head.appendChild(font_style)

    load_required = set()
    proceeded = False
    hang_timeout = 5

    def hangcheck():
        # Timer callback: stop waiting for resources that never finished
        nonlocal proceeded
        if proceeded:
            return
        proceeded = True
        print(f'WARNING: All resources did not load in {hang_timeout} seconds, proceeding anyway ({load_required.length} resources left)')
        proceed()

    def onload():
        # load/error listener; this is the id of the element that finished
        nonlocal proceeded
        load_required.discard(this)
        if load_required.length or proceeded:
            return
        proceeded = True
        proceed()

    def process_children(node, parent):
        # Build the children of node under parent, leaving out meta and base
        if not node.c:
            return
        stack = v'[]'
        for v'var i = node.c.length; i-- > 0;':  # noqa: unused-local
            child = v'node.c[i]'
            if child.n is 'meta' or child.n is 'base':
                continue
            stack.push(v'[child, parent]')
        process_stack(stack, None, ns_map, load_required, onload)

    body_done = False
    process_stack.tag_id = 1
    for child in html.c:
        if child.n is 'head':
            process_children(child, document.head)
            continue
        if child.n is not 'body':
            continue
        if not document.body:
            document.documentElement.appendChild(document.createElement('body'))
        if not body_done:
            # Only the first body contributes attributes
            body_done = True
            apply_attributes(child, document.body, ns_map)
        if child.x:
            document.body.appendChild(document.createTextNode(child.x))
        process_children(child, document.body)

    if postprocess_dom:
        postprocess_dom()
    ev = document.createEvent('Event')
    ev.initEvent('DOMContentLoaded', True, True)
    document.dispatchEvent(ev)
    if not load_required.length:
        proceeded = True
        proceed()
    else:
        setTimeout(hangcheck, hang_timeout * 1000)
|
|
|
|
|
|
|
|
def unserialize_html_legacy(serialized_data, proceed, postprocess_dom, root_name):
    # Rebuild the current document from the legacy serialization, in which
    # tree nodes are arrays of the form [tag id, child, child, ...] and
    # tag_map maps a tag id to the serialized element. proceed() is called
    # once every loadable resource has fired load/error, or after
    # hang_timeout seconds at the latest.
    #
    # postprocess_dom: optional callback run after the DOM has been built
    # root_name: name of the file being shown, used for the <base> href in
    #     the standalone viewer
    ns_map = serialized_data.ns_map
    tag_map = serialized_data.tag_map
    tree = serialized_data.tree
    remove_all_attributes(document.documentElement)
    apply_attributes(tag_map[0], document.documentElement, ns_map)
    head, body = tree[1], tree[2]  # noqa: unused-local
    clear(document.head, document.body)
    remove_all_attributes(document.head, document.body)

    # The browser scrollbars get hidden after loading anyway; hiding them
    # up front avoids extra layouts
    scrollbar_style = E.style(type='text/css', 'html::-webkit-scrollbar, body::-webkit-scrollbar { display: none !important }')
    document.head.appendChild(scrollbar_style)

    if runtime.is_standalone_viewer and root_name and root_name.indexOf('/') > -1:
        base = window.location.pathname.rpartition('/')[0]
        base = f'{window.location.protocol}//{window.location.hostname}{base}/' + root_name
        document.head.appendChild(E.base(href=base))

    # Default stylesheet. Not added in the standalone viewer, where the
    # default font family comes from the viewer settings
    if not runtime.is_standalone_viewer:
        font_style = E.style(type='text/css', 'html {{ font-family: {} }}'.format(window.default_font_family or "sans-serif"))
        document.head.appendChild(font_style)

    load_required = set()
    proceeded = False
    hang_timeout = 5

    def hangcheck():
        # Timer callback: stop waiting for resources that never finished
        nonlocal proceeded
        if proceeded:
            return
        proceeded = True
        print(f'WARNING: All resources did not load in {hang_timeout} seconds, proceeding anyway ({load_required.length} resources left)')
        proceed()

    def onload():
        # load/error listener; this is the id of the element that finished
        nonlocal proceeded
        load_required.discard(this)
        if load_required.length or proceeded:
            return
        proceeded = True
        proceed()

    # Head children first, then the body and its children
    stack = v'[]'
    for v'var i = head.length - 1; i >= 1; i--':
        stack.push(v'[head[i], document.head]')
    process_stack(stack, tag_map, ns_map, load_required, onload)

    bnode = tag_map[body[0]]
    apply_attributes(bnode, document.body, ns_map)
    if bnode.x:
        document.body.appendChild(document.createTextNode(bnode.x))
    for v'var i = body.length - 1; i >= 1; i--':  # noqa: unused-local
        stack.push(v'[body[i], document.body]')
    process_stack(stack, tag_map, ns_map, load_required, onload)

    if postprocess_dom:
        postprocess_dom()
    ev = document.createEvent('Event')
    ev.initEvent('DOMContentLoaded', True, True)
    document.dispatchEvent(ev)
    if not load_required.length:
        proceeded = True
        proceed()
    else:
        setTimeout(hangcheck, hang_timeout * 1000)
|
|
|
|
|
|
def text_from_serialized_html(data, get_anchor_offset_map):
    # Extract the text of the <body> from serialized HTML given as a JSON
    # string (both the legacy tag_map format and the current tree format).
    #
    # Returns the text, or (text, anchor_offset_map) when
    # get_anchor_offset_map is truthy. The map records, for each id
    # attribute, the text offset at which its element was reached; the first
    # occurrence of an id wins.
    serialized_data = JSON.parse(data)
    tag_map = serialized_data.tag_map
    # Elements that are skipped together with everything inside them
    no_visit = {'script': True, 'style': True, 'title': True, 'head': True}
    # Elements whose own text and the text of all descendants are left out
    ignore_text = {'img': True, 'math': True, 'rt': True, 'rp': True, 'rtc': True}
    # Bare reference: ignore_text is otherwise only used from verbatim JS
    # (presumably this keeps it from being reported as unused)
    ignore_text

    # Stack entries are [node or text, text ignored in parent]
    if tag_map:
        stack = v'[[serialized_data.tree[2], false]]'
    else:
        stack = v'[]'
        for child in serialized_data.tree.c:
            if child.n is 'body':
                stack.push(v'[child, false]')

    chunks = v'[]'
    anchor_offset_map = {}
    offset = 0
    while stack.length:
        node, text_ignored_in_parent = stack.pop()
        if jstype(node) is 'string':
            # Tail text queued by an earlier element
            chunks.push(node)
            offset += node.length
            continue
        src = tag_map[node[0]] if tag_map else node
        if get_anchor_offset_map and src.a:
            for v'var i = 0; i < src.a.length; i++':
                attr = src.a[i]
                if attr[0] is not 'id':
                    continue
                anchor_id = attr[1]
                if jstype(anchor_offset_map[anchor_id]) is not 'number':
                    anchor_offset_map[anchor_id] = offset
        if no_visit[src.n]:
            # NOTE(review): the tail text (src.l) of a skipped element is
            # dropped here as well; confirm that this is intended.
            continue
        ignore_text_in_node_and_children = text_ignored_in_parent or v'!!ignore_text[src.n]'
        if src.x and not ignore_text_in_node_and_children:
            chunks.push(src.x)
            offset += src.x.length
        # The tail is pushed before the children so that it is popped after
        # them; it is kept unless the parent's text is being ignored
        if src.l and not text_ignored_in_parent:
            stack.push(v'[src.l, ignore_text_in_node_and_children]')
        if tag_map:
            for v'var i = node.length - 1; i >= 1; i--':
                stack.push(v'[node[i], ignore_text_in_node_and_children]')
        elif src.c:
            for v'var i = src.c.length; i-- > 0;':
                stack.push(v'[src.c[i], ignore_text_in_node_and_children]')

    text = chunks.join('')
    if get_anchor_offset_map:
        return text, anchor_offset_map
    return text
|