calibre/src/pyj/read_book/resources.pyj
Kovid Goyal 69cf7e684b
Forgot to exclude ruby tags from searching in the native viewer
Also fix ignore_text not recursing and add some performance improvements
to the python function for extracting searchable text
2024-05-25 10:23:44 +05:30

456 lines
16 KiB
Python

# vim:fileencoding=utf-8
# License: GPL v3 Copyright: 2016, Kovid Goyal <kovid at kovidgoyal.net>
from __python__ import hash_literals
from elementmaker import E
from encodings import base64decode, utf8_decode
from dom import clear, remove_all_attributes
from read_book.globals import runtime, ui_operations
from read_book.settings import opts
# MIME type string naming calibre's JSON serialization of XHTML content.
# NOTE(review): not referenced in the part of this module visible here;
# presumably imported by other modules -- confirm before changing it.
JSON_XHTML_MIMETYPE = 'application/calibre+xhtml+json'
def decode_component(x):
    # Undo the encoding of a single link component: x is base64 text whose
    # decoded bytes are UTF-8, the result is the decoded string.
    raw_bytes = base64decode(x)
    return utf8_decode(raw_bytes)
def decode_url(x):
    # Split an encoded link target of the form "<encoded name>[#<fragment>]"
    # into the pair (decoded name, fragment). The fragment is returned as is
    # (it is not decoded) and is the empty string when there is none.
    #
    # The first '#' is located explicitly rather than via x.split('#', 1):
    # when split() is the native JavaScript method its second argument caps
    # the number of *returned pieces*, so only the name would come back and
    # the fragment would be silently lost. indexOf() gives the intended
    # "split once" result regardless of which split() is installed on strings.
    hash_at = x.indexOf('#')
    if hash_at < 0:
        return decode_component(x), ''
    return decode_component(x[:hash_at]), x[hash_at + 1:]
def create_link_pat(book):
    # Build the global regular expression that finds resource placeholders
    # in text: the book's link_uid, a literal '|', a run of non-'|'
    # characters (captured as group 1) and a closing '|'.
    # A fresh RegExp is returned on every call; because it carries the 'g'
    # flag, callers reset lastIndex before scanning a new text.
    uid = book.manifest.link_uid
    pattern_source = uid + r'\|([^|]+)\|'
    return RegExp(pattern_source, 'g')
def load_resources(book, root_name, previous_resources, proceed):
    # Load root_name and, transitively, every resource referenced from the
    # text resources that are scanned for placeholders, then call
    # proceed(ans).
    #
    # ans maps resource name -> [data, mimetype]. Entries already present in
    # previous_resources are reused instead of being fetched again; once
    # loading is finished previous_resources is emptied in place. If the
    # manifest marks the root file as has_maths, the MathJax files are added
    # via load_mathjax() before proceed is called.
    #
    # Resources are processed strictly one at a time from a FIFO queue.
    ans = Object.create(None)
    pending_resources = v'[root_name]'
    link_pat = create_link_pat(book)

    def do_one():
        # Process the next queued name, or finish when the queue is empty.
        name = pending_resources.shift()
        if not name:
            for k in previous_resources:
                v'delete previous_resources[k]'
            if book.manifest.files[root_name].has_maths:
                return load_mathjax(book, ans, proceed)
            return proceed(ans)
        if ans[name]:
            # Already loaded, move on to the next name
            return setTimeout(do_one, 0)
        if previous_resources[name]:
            ans[name] = data = previous_resources[name]
            if jstype(data[0]) is 'string':
                find_virtualized_resources(data[0])
            return setTimeout(do_one, 0)
        ui_operations.get_file(book, name, got_one)

    def got_one(data, name, mimetype):
        # Callback passed to ui_operations.get_file() with the fetched data
        ans[name] = v'[data, mimetype]'
        if jstype(data) is 'string' and book.manifest.files[name]?.is_virtualized:
            find_virtualized_resources(data)
        return setTimeout(do_one, 0)

    def find_virtualized_resources(text):
        # Queue every resource referenced by a placeholder in text that is
        # not already waiting in the queue.
        # pending_resources holds plain name strings, so the set of queued
        # names is built from the entries themselves. (Reading a .name
        # attribute off each string yields undefined, which made this
        # duplicate check ineffective.)
        queued = set(pending_resources)
        link_pat.lastIndex = 0
        while True:
            m = link_pat.exec(text)
            if not m:
                break
            name = decode_url(m[1])[0]
            if name in queued:
                continue
            queued.add(name)
            pending_resources.push(name)

    do_one()
# Module level cache of the MathJax files: None until load_mathjax() has
# received them once, after which the same data is reused for every call.
mathjax_data = None
def load_mathjax(book, resource_data, proceed):
    # Store the MathJax files in resource_data under the key
    # '..mathjax-files..' and then call proceed(resource_data). The files
    # are requested from ui_operations only while the module level cache
    # (mathjax_data) is still None. The book argument is not used.
    if mathjax_data is not None:
        resource_data['..mathjax-files..'] = mathjax_data
        proceed(resource_data)
        return

    def files_received(data):
        nonlocal mathjax_data
        mathjax_data = data
        resource_data['..mathjax-files..'] = data
        proceed(resource_data)

    ui_operations.get_mathjax_files(files_received)
def finalize_resources(book, root_name, resource_data):
    # Convert loaded resources into object URLs and resolve the placeholder
    # links between them.
    #
    # resource_data maps name -> [data, mimetype] (plus an optional
    # '..mathjax-files..' entry). It is consumed: entries are deleted from it
    # as they are resolved.
    #
    # Returns (root_data, mathjax, blob_url_map) where root_data is the
    # JSON-parsed text of root_name (None if it was never added), mathjax is
    # whatever was stored under '..mathjax-files..' and blob_url_map maps
    # every other resource name to its object URL.
    blob_url_map = Object.create(None)
    root_data = None
    link_pat = create_link_pat(book)
    mathjax = resource_data['..mathjax-files..']
    v'delete resource_data["..mathjax-files.."]'

    # Resolve the non virtualized resources immediately
    # (anything whose data is not a string is handed to createObjectURL
    # unchanged -- presumably it is already a Blob; confirm with the loader)
    for name in resource_data:
        data, mimetype = resource_data[name]
        if jstype(data) is not 'string':
            blob_url_map[name] = window.URL.createObjectURL(data)
    for name in blob_url_map:
        v'delete resource_data[name]'

    def add_virtualized_resource(name, text, mimetype):
        # The root document is parsed as JSON and returned to the caller,
        # every other text resource is wrapped in a Blob and given a URL
        nonlocal root_data
        if name is root_name:
            root_data = JSON.parse(text)
        else:
            blob_url_map[name] = window.URL.createObjectURL(Blob([text], {'type': mimetype}))

    def replace_deps(text):
        # Replace every placeholder whose target already has an object URL
        # by that URL (plus '#fragment' if any). Returns the set of target
        # names that could not be replaced yet and the updated text.
        replacements = v'[]'
        unresolved_deps = set()
        link_pat.lastIndex = 0
        while True:
            m = link_pat.exec(text)
            if not m:
                break
            dname, frag = decode_url(m[1])
            if blob_url_map[dname]:
                rtext = blob_url_map[dname]
                if frag:
                    rtext += '#' + frag
                replacements.push(v'[m.index, m[0].length, rtext]')
            else:
                unresolved_deps.add(dname)
        # Apply from the end of the text backwards so that the recorded
        # match offsets of earlier replacements stay valid
        for index, sz, repl in reversed(replacements):
            text = text[:index] + repl + text[index + sz:]
        return unresolved_deps, text

    # name -> set of dependencies that were unresolved the last time
    # replace_deps() was run on that resource
    unresolved_deps_map = {}

    def has_unresolvable_deps(name):
        # True if any dependency recorded for name still has no object URL.
        # Lets the loop below skip re-scanning a text that cannot be
        # completed yet.
        deps = unresolved_deps_map[name]
        if not deps or not deps.length:
            return False
        for x in deps:
            if not blob_url_map[x]:
                return True
        return False

    # Repeat passes over the remaining text resources; each pass finalizes
    # those whose dependencies all have URLs, which may unblock others.
    while True:
        resolved = v'[]'
        num = 0
        for name in resource_data:
            if not blob_url_map[name]:
                num += 1
                text, mimetype = resource_data[name]
                if not has_unresolvable_deps(name):
                    unresolved_deps, text = replace_deps(text)
                    unresolved_deps_map[name] = unresolved_deps
                    if not unresolved_deps.length:
                        add_virtualized_resource(name, text, mimetype)
                        resolved.push(name)
        if not num:
            # Nothing left to process
            break
        if not resolved.length:
            # A full pass made no progress
            unresolved = [name for name in resource_data if not blob_url_map[name]]
            print('ERROR: Could not resolve all dependencies of {} because of a cyclic dependency. Remaining deps: {}'.format(root_name, unresolved))
            # Add the items anyway, without resolving remaining deps
            for name in resource_data:
                if not blob_url_map[name]:
                    text, mimetype = resource_data[name]
                    text = replace_deps(text)[1]
                    add_virtualized_resource(name, text, mimetype)
            break
        for name in resolved:
            v'delete resource_data[name]'

    return root_data, mathjax, blob_url_map
# MIME types for which the text of a <script> tag is assigned via elem.text
# in process_stack()
js_types = {k: True for k in 'text/javascript text/ecmascript application/javascript application/ecmascript'.split(' ')}
# Tag name -> attribute holding the URL of a resource; process_stack() waits
# for the load/error event of elements that carry this attribute
resource_tag_names = {'script':'src', 'link':'href', 'img':'src', 'image':'xlink:href'}
# Namespace URI -> prefix (without the trailing colon). get_prefix() adds
# generated 'ns<N>' entries for URIs that are not listed here.
ns_rmap = {'http://www.w3.org/2000/svg':'svg', 'http://www.w3.org/1999/xlink':'xlink', 'http://www.w3.org/1998/Math/MathML':'math', 'http://www.w3.org/XML/1998/namespace': 'xml', 'http://www.idpf.org/2007/ops': 'epub'}
# Counter used by get_prefix() to number generated prefixes
ns_count = 0
# When true apply_attributes() drops un-namespaced title and alt attributes;
# set from opts.hide_tooltips by unserialize_html()
hide_tooltips = False
def get_prefix(ns):
    # Return the attribute name prefix, including the trailing ':', for the
    # namespace URI ns. URIs missing from ns_rmap get a freshly generated
    # 'ns<counter>' prefix which is remembered for subsequent calls.
    nonlocal ns_count
    known = ns_rmap[ns]
    if known:
        return known + ':'
    generated = 'ns' + ns_count
    ns_rmap[ns] = generated
    ns_count += 1
    return generated + ':'
def apply_attributes(src, elem, ns_map):
    # Copy the serialized attributes of src onto the DOM element elem.
    # Every entry of src.a is [name, value, namespace key]; a truthy
    # namespace key is looked up in ns_map and the attribute is set with a
    # prefixed, namespaced name. Plain title/alt attributes are left out
    # while hide_tooltips is on.
    attr_list = src.a
    if not attr_list:
        return
    for entry in attr_list:
        ns_key = entry[2]
        if ns_key:
            uri = ns_map[ns_key]
            elem.setAttributeNS(uri, get_prefix(uri) + entry[0], entry[1])
            continue
        attr_name = entry[0]
        if not hide_tooltips or (attr_name is not 'title' and attr_name is not 'alt'):
            elem.setAttribute(attr_name, entry[1])
def is_loadable_link(attributes):
    # True when the serialized attribute list ([name, value, ...] entries)
    # has a rel attribute, matched case-insensitively, one of whose space
    # separated values is "stylesheet" in any letter case.
    for entry in attributes:
        if entry[0].toLowerCase() is not 'rel' or not entry[1]:
            continue
        for rel_value in entry[1].split(' '):
            if rel_value.toLowerCase() is 'stylesheet':
                return True
    return False
def process_stack(stack, tag_map, ns_map, load_required, onload):
    # Build DOM nodes from serialized nodes without recursion.
    #
    # stack holds [node, parent DOM element] pairs and is consumed from its
    # end. Two serializations are supported:
    #   * tag_map truthy: node is an array whose first item is an index
    #     into tag_map and whose remaining items are the child nodes
    #   * tag_map falsy: node is the tag object itself with its children in
    #     node.c; ids are then taken from the process_stack.tag_id counter
    #     which the caller must initialize
    # Fields read from a tag object: n (name), s (key into ns_map),
    # a (attributes), x (text inside the tag), l (text following the tag).
    #
    # The id of every element that loads an external resource is added to
    # load_required and onload, bound to that id as `this`, is registered
    # for its load and error events.
    while stack.length:
        node, parent = stack.pop()
        if tag_map:
            tag_id = node[0]
            src = tag_map[tag_id]
        else:
            src = node
            tag_id = v'process_stack.tag_id++'
        if src.s:
            if src.n:
                elem = document.createElementNS(ns_map[src.s], src.n)
            else:
                # A namespaced entry without a name creates no element,
                # only its trailing text is kept
                if src.l:
                    parent.appendChild(document.createTextNode(src.l))
                continue
        else:
            elem = document.createElement(src.n)
        loadable = False
        attr = resource_tag_names[src.n]
        if attr:
            if attr.indexOf(':') != -1:
                # Serialized attribute names carry no prefix, so compare
                # against the local name only
                attr = attr.replace('xlink:', '')
            if src.a:
                for a in src.a:
                    if a[0] is attr:
                        # <link> counts only when it is a stylesheet
                        loadable = is_loadable_link(src.a) if src.n is 'link' else True
                        break
        if loadable:
            # The listeners are registered before the attributes are applied
            load_required.add(tag_id)
            load_callback = onload.bind(tag_id)
            elem.addEventListener('load', load_callback)
            elem.addEventListener('error', load_callback)
        apply_attributes(src, elem, ns_map)
        parent.appendChild(elem)
        if src.x:
            if src.n is 'script' and js_types[(elem.getAttribute('type') or 'text/javascript').toLowerCase()] is True:
                elem.text = src.x
            else:
                elem.appendChild(document.createTextNode(src.x))
        if src.l:
            parent.appendChild(document.createTextNode(src.l))
        # Children are pushed last to first so that they are popped, and
        # therefore appended, in document order
        if tag_map:
            for v'var i = node.length - 1; i >= 1; i--': # noqa: unused-local
                stack.push(v'[node[i], elem]')
        elif node.c:
            for v'var i = node.c.length; i-- > 0;': # noqa: unused-local
                stack.push(v'[node.c[i], elem]')
def unserialize_html(serialized_data, proceed, postprocess_dom, root_name):
    # Replace the contents of the current document with the serialized HTML
    # in serialized_data and call proceed() exactly once: either when every
    # element registered as loading an external resource has fired load or
    # error, or after hang_timeout seconds, whichever happens first.
    #
    # Data carrying a tag_map is delegated to unserialize_html_legacy().
    # postprocess_dom, if given, is called after the DOM has been built and
    # before the synthetic DOMContentLoaded event is dispatched.
    nonlocal hide_tooltips
    hide_tooltips = opts.hide_tooltips
    if serialized_data.tag_map:
        return unserialize_html_legacy(serialized_data, proceed, postprocess_dom, root_name)
    html = serialized_data.tree
    ns_map = serialized_data.ns_map
    remove_all_attributes(document.documentElement, document.head, document.body)
    clear(document.head, document.body)
    apply_attributes(html, document.documentElement, ns_map)
    # hide browser scrollbars while loading since they will anyway be hidden
    # after loading and this prevents extra layouts
    document.head.appendChild(
        E.style(type='text/css', 'html::-webkit-scrollbar, body::-webkit-scrollbar { display: none !important }')
    )
    if runtime.is_standalone_viewer and root_name:
        # A <base> is only added when root_name contains a directory part
        if root_name.indexOf('/') > -1:
            base = window.location.pathname.rpartition('/')[0]
            base = f'{window.location.protocol}//{window.location.hostname}{base}/' + root_name
            document.head.appendChild(E.base(href=base))

    # Default stylesheet
    if not runtime.is_standalone_viewer:
        # for the standalone viewer the default font family is set
        # in the viewer settings
        document.head.appendChild(E.style(type='text/css', 'html {{ font-family: {} }}'.format(window.default_font_family or "sans-serif")))

    # ids of elements whose load/error event has not fired yet
    load_required = set()
    # guards against calling proceed() more than once
    proceeded = False
    hang_timeout = 5

    def hangcheck():
        # Timer callback: give up waiting for outstanding resources
        nonlocal proceeded
        if not proceeded:
            proceeded = True
            print(f'WARNING: All resources did not load in {hang_timeout} seconds, proceeding anyway ({load_required.length} resources left)')
            proceed()

    def onload():
        # load/error listener; process_stack() binds `this` to the tag id
        nonlocal proceeded
        load_required.discard(this)
        if not load_required.length and not proceeded:
            proceeded = True
            proceed()

    def process_children(node, parent):
        # Build the children of node under parent, leaving out any direct
        # child that is a <meta> or <base> tag
        if not node.c:
            return
        stack = v'[]'
        for v'var i = node.c.length; i-- > 0;': # noqa: unused-local
            child = v'node.c[i]'
            if child.n is not 'meta' and child.n is not 'base':
                stack.push(v'[child, parent]')
        process_stack(stack, None, ns_map, load_required, onload)

    body_done = False
    # Tag ids in this format are generated by process_stack() from this counter
    process_stack.tag_id = 1
    for child in html.c:
        if child.n is 'head':
            process_children(child, document.head)
        elif child.n is 'body':
            if not document.body:
                document.documentElement.appendChild(document.createElement('body'))
            if not body_done:
                # Only the first <body> contributes attributes
                body_done = True
                apply_attributes(child, document.body, ns_map)
            # NOTE(review): the nesting of this check was reconstructed;
            # confirm it is meant to run for every <body>, not just the first
            if child.x:
                document.body.appendChild(document.createTextNode(child.x))
            process_children(child, document.body)

    if postprocess_dom:
        postprocess_dom()
    ev = document.createEvent('Event')
    ev.initEvent('DOMContentLoaded', True, True)
    document.dispatchEvent(ev)
    if load_required.length:
        setTimeout(hangcheck, hang_timeout * 1000)
    else:
        proceeded = True
        proceed()
def unserialize_html_legacy(serialized_data, proceed, postprocess_dom, root_name):
    # Variant of unserialize_html() for serialized data that carries a
    # tag_map: tags live in the tag_map array and the tree is made of nested
    # arrays whose first item is an index into tag_map followed by the
    # children. tag_map[0] is used as <html>, tree[1] as <head> and tree[2]
    # as <body>.
    #
    # proceed() is called exactly once: when all elements that load an
    # external resource have fired load or error, or after hang_timeout
    # seconds. postprocess_dom, if given, runs after the DOM has been built
    # and before the synthetic DOMContentLoaded event is dispatched.
    tag_map = serialized_data.tag_map
    tree = serialized_data.tree
    ns_map = serialized_data.ns_map
    html = tag_map[0]
    remove_all_attributes(document.documentElement)
    apply_attributes(html, document.documentElement, ns_map)
    head, body = tree[1], tree[2] # noqa: unused-local
    clear(document.head, document.body)
    remove_all_attributes(document.head, document.body)
    # hide browser scrollbars while loading since they will anyway be hidden
    # after loading and this prevents extra layouts
    document.head.appendChild(
        E.style(type='text/css', 'html::-webkit-scrollbar, body::-webkit-scrollbar { display: none !important }')
    )
    if runtime.is_standalone_viewer and root_name:
        # A <base> is only added when root_name contains a directory part
        if root_name.indexOf('/') > -1:
            base = window.location.pathname.rpartition('/')[0]
            base = f'{window.location.protocol}//{window.location.hostname}{base}/' + root_name
            document.head.appendChild(E.base(href=base))

    # Default stylesheet
    if not runtime.is_standalone_viewer:
        # for the standalone viewer the default font family is set
        # in the viewer settings
        document.head.appendChild(E.style(type='text/css', 'html {{ font-family: {} }}'.format(window.default_font_family or "sans-serif")))

    # tag ids of elements whose load/error event has not fired yet
    load_required = set()
    # guards against calling proceed() more than once
    proceeded = False
    hang_timeout = 5

    def hangcheck():
        # Timer callback: give up waiting for outstanding resources
        nonlocal proceeded
        if not proceeded:
            proceeded = True
            print(f'WARNING: All resources did not load in {hang_timeout} seconds, proceeding anyway ({load_required.length} resources left)')
            proceed()

    def onload():
        # load/error listener; process_stack() binds `this` to the tag id
        nonlocal proceeded
        load_required.discard(this)
        if not load_required.length and not proceeded:
            proceeded = True
            proceed()

    # Children are pushed last to first so that process_stack() pops them
    # in document order. Index 0 of head/body is the tag itself.
    stack = v'[]'
    for v'var i = head.length - 1; i >= 1; i--':
        stack.push(v'[head[i], document.head]')
    process_stack(stack, tag_map, ns_map, load_required, onload)
    bnode = tag_map[body[0]]
    apply_attributes(bnode, document.body, ns_map)
    if bnode.x:
        document.body.appendChild(document.createTextNode(bnode.x))
    # process_stack() has drained the stack, so it is reused for the body
    for v'var i = body.length - 1; i >= 1; i--': # noqa: unused-local
        stack.push(v'[body[i], document.body]')
    process_stack(stack, tag_map, ns_map, load_required, onload)

    if postprocess_dom:
        postprocess_dom()
    ev = document.createEvent('Event')
    ev.initEvent('DOMContentLoaded', True, True)
    document.dispatchEvent(ev)
    if load_required.length:
        setTimeout(hangcheck, hang_timeout * 1000)
    else:
        proceeded = True
        proceed()
def text_from_serialized_html(data, get_anchor_offset_map):
    # Flatten the <body> of a serialized HTML document into plain text.
    #
    # data is the JSON text of the serialized document, in either the
    # tag_map format or the nested tree format. Returns the concatenated
    # text. When get_anchor_offset_map is truthy, returns the pair
    # (text, offsets) instead, where offsets maps each id attribute value
    # to the length of the text gathered before its tag was reached (the
    # first occurrence of an id wins).
    doc = JSON.parse(data)
    tag_map = doc.tag_map
    # These tags are skipped together with everything below them
    skipped_tags = {'script': True, 'style': True, 'title': True, 'head': True}
    # The text inside these tags, at any depth, is left out
    textless_tags = {'img': True, 'math': True, 'rt': True, 'rp': True, 'rtc': True}
    textless_tags  # bare reference: the only real use is in verbatim JS below
    pieces = v'[]'
    id_offsets = {}
    offset = 0

    # Work list of [item, flag] pairs. item is either a serialized node or
    # a string of tail text waiting to be emitted; for nodes the flag says
    # whether text is being ignored in the parent.
    todo = v'[]'
    if tag_map:
        todo.push(v'[doc.tree[2], false]')
    else:
        for top in doc.tree.c:
            if top.n is 'body':
                todo.push(v'[top, false]')

    while todo.length:
        item, muted_by_parent = todo.pop()
        if jstype(item) is 'string':
            pieces.push(item)
            offset += item.length
            continue
        src = tag_map[item[0]] if tag_map else item
        if get_anchor_offset_map and src.a:
            for v'var i = 0; i < src.a.length; i++':
                attr = src.a[i]
                if attr[0] is 'id' and jstype(id_offsets[attr[1]]) is not 'number':
                    id_offsets[attr[1]] = offset
        # NOTE(review): skipping here also drops the text that follows a
        # skipped tag (src.l), confirm that is intended
        if skipped_tags[src.n]:
            continue
        muted = muted_by_parent or v'!!textless_tags[src.n]'
        if src.x and not muted:
            pieces.push(src.x)
            offset += src.x.length
        # The tail text belongs to the parent, so only the parent's state
        # decides whether it is kept. It is pushed before the children and
        # is therefore emitted after all of them.
        if src.l and not muted_by_parent:
            todo.push(v'[src.l, muted]')
        # Push children last to first so they are popped in document order
        if tag_map:
            for v'var i = item.length - 1; i >= 1; i--':
                todo.push(v'[item[i], muted]')
        elif src.c:
            for v'var i = src.c.length; i-- > 0;':
                todo.push(v'[src.c[i], muted]')

    flat_text = pieces.join('')
    if not get_anchor_offset_map:
        return flat_text
    return flat_text, id_offsets