Forgot to exclude ruby tags from searching in the native viewer

Also fix ignore_text not being applied recursively to descendant nodes, and add
some performance improvements to the Python function for extracting searchable text.
This commit is contained in:
Kovid Goyal 2024-05-25 10:23:44 +05:30
parent f3b35c318f
commit 69cf7e684b
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 18 additions and 10 deletions

View File

@ -224,21 +224,24 @@ class SearchResult:
@lru_cache(maxsize=None) @lru_cache(maxsize=None)
def searchable_text_for_name(name): def searchable_text_for_name(name):
ans = [] ans = []
add_text = ans.append
serialized_data = json.loads(get_data(name)[0]) serialized_data = json.loads(get_data(name)[0])
stack = [] stack = []
a = stack.append
removed_tails = [] removed_tails = []
no_visit = frozenset({'script', 'style', 'title', 'head'})
ignore_text = frozenset({'img', 'math', 'rt', 'rp', 'rtc'})
for child in serialized_data['tree']['c']: for child in serialized_data['tree']['c']:
if child.get('n') == 'body': if child.get('n') == 'body':
stack.append(child) a((child, False))
# the JS code does not add the tail of body tags to flat text # the JS code does not add the tail of body tags to flat text
removed_tails.append((child.pop('l', None), child)) removed_tails.append((child.pop('l', None), child))
ignore_text = {'script', 'style', 'title'}
text_pos = 0 text_pos = 0
anchor_offset_map = OrderedDict() anchor_offset_map = OrderedDict()
while stack: while stack:
node = stack.pop() node, text_ignored_in_parent = stack.pop()
if isinstance(node, str): if isinstance(node, str):
ans.append(node) add_text(node)
text_pos += len(node) text_pos += len(node)
continue continue
g = node.get g = node.get
@ -253,13 +256,18 @@ def searchable_text_for_name(name):
aid = x[1] aid = x[1]
if aid not in anchor_offset_map: if aid not in anchor_offset_map:
anchor_offset_map[aid] = text_pos anchor_offset_map[aid] = text_pos
if name and text and name not in ignore_text: if name in no_visit:
ans.append(text) continue
ignore_text_in_node_and_children = text_ignored_in_parent or name in ignore_text
if text and not ignore_text_in_node_and_children:
add_text(text)
text_pos += len(text) text_pos += len(text)
if tail: if tail and not text_ignored_in_parent:
stack.append(tail) a((tail, ignore_text_in_node_and_children))
if children: if children:
stack.extend(reversed(children)) for child in reversed(children):
a((child, ignore_text_in_node_and_children))
for (tail, body) in removed_tails: for (tail, body) in removed_tails:
if tail is not None: if tail is not None:
body['l'] = tail body['l'] = tail

View File

@ -436,7 +436,7 @@ def text_from_serialized_html(data, get_anchor_offset_map):
anchor_offset_map[aid] = text_pos anchor_offset_map[aid] = text_pos
if no_visit[src.n]: if no_visit[src.n]:
continue continue
ignore_text_in_node_and_children = v'!!ignore_text[src.n]' ignore_text_in_node_and_children = text_ignored_in_parent or v'!!ignore_text[src.n]'
if not ignore_text_in_node_and_children and src.x: if not ignore_text_in_node_and_children and src.x:
ans.push(src.x) ans.push(src.x)
text_pos += src.x.length text_pos += src.x.length