mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Forgot to exclude ruby tags from searching in the native viewer
Also fix ignore_text not recursing and add some performance improvements to the python function for extracting searchable text
This commit is contained in:
parent
f3b35c318f
commit
69cf7e684b
@ -224,21 +224,24 @@ class SearchResult:
|
||||
@lru_cache(maxsize=None)
|
||||
def searchable_text_for_name(name):
|
||||
ans = []
|
||||
add_text = ans.append
|
||||
serialized_data = json.loads(get_data(name)[0])
|
||||
stack = []
|
||||
a = stack.append
|
||||
removed_tails = []
|
||||
no_visit = frozenset({'script', 'style', 'title', 'head'})
|
||||
ignore_text = frozenset({'img', 'math', 'rt', 'rp', 'rtc'})
|
||||
for child in serialized_data['tree']['c']:
|
||||
if child.get('n') == 'body':
|
||||
stack.append(child)
|
||||
a((child, False))
|
||||
# the JS code does not add the tail of body tags to flat text
|
||||
removed_tails.append((child.pop('l', None), child))
|
||||
ignore_text = {'script', 'style', 'title'}
|
||||
text_pos = 0
|
||||
anchor_offset_map = OrderedDict()
|
||||
while stack:
|
||||
node = stack.pop()
|
||||
node, text_ignored_in_parent = stack.pop()
|
||||
if isinstance(node, str):
|
||||
ans.append(node)
|
||||
add_text(node)
|
||||
text_pos += len(node)
|
||||
continue
|
||||
g = node.get
|
||||
@ -253,13 +256,18 @@ def searchable_text_for_name(name):
|
||||
aid = x[1]
|
||||
if aid not in anchor_offset_map:
|
||||
anchor_offset_map[aid] = text_pos
|
||||
if name and text and name not in ignore_text:
|
||||
ans.append(text)
|
||||
if name in no_visit:
|
||||
continue
|
||||
ignore_text_in_node_and_children = text_ignored_in_parent or name in ignore_text
|
||||
|
||||
if text and not ignore_text_in_node_and_children:
|
||||
add_text(text)
|
||||
text_pos += len(text)
|
||||
if tail:
|
||||
stack.append(tail)
|
||||
if tail and not text_ignored_in_parent:
|
||||
a((tail, ignore_text_in_node_and_children))
|
||||
if children:
|
||||
stack.extend(reversed(children))
|
||||
for child in reversed(children):
|
||||
a((child, ignore_text_in_node_and_children))
|
||||
for (tail, body) in removed_tails:
|
||||
if tail is not None:
|
||||
body['l'] = tail
|
||||
|
@ -436,7 +436,7 @@ def text_from_serialized_html(data, get_anchor_offset_map):
|
||||
anchor_offset_map[aid] = text_pos
|
||||
if no_visit[src.n]:
|
||||
continue
|
||||
ignore_text_in_node_and_children = v'!!ignore_text[src.n]'
|
||||
ignore_text_in_node_and_children = text_ignored_in_parent or v'!!ignore_text[src.n]'
|
||||
if not ignore_text_in_node_and_children and src.x:
|
||||
ans.push(src.x)
|
||||
text_pos += src.x.length
|
||||
|
Loading…
x
Reference in New Issue
Block a user