Forgot to exclude ruby tags from searching in the native viewer

Also fix ignore_text not recursing and add some performance improvements
to the python function for extracting searchable text
This commit is contained in:
Kovid Goyal 2024-05-25 10:23:44 +05:30
parent f3b35c318f
commit 69cf7e684b
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 18 additions and 10 deletions

View File

@ -224,21 +224,24 @@ class SearchResult:
@lru_cache(maxsize=None)
def searchable_text_for_name(name):
ans = []
add_text = ans.append
serialized_data = json.loads(get_data(name)[0])
stack = []
a = stack.append
removed_tails = []
no_visit = frozenset({'script', 'style', 'title', 'head'})
ignore_text = frozenset({'img', 'math', 'rt', 'rp', 'rtc'})
for child in serialized_data['tree']['c']:
if child.get('n') == 'body':
stack.append(child)
a((child, False))
# the JS code does not add the tail of body tags to flat text
removed_tails.append((child.pop('l', None), child))
ignore_text = {'script', 'style', 'title'}
text_pos = 0
anchor_offset_map = OrderedDict()
while stack:
node = stack.pop()
node, text_ignored_in_parent = stack.pop()
if isinstance(node, str):
ans.append(node)
add_text(node)
text_pos += len(node)
continue
g = node.get
@ -253,13 +256,18 @@ def searchable_text_for_name(name):
aid = x[1]
if aid not in anchor_offset_map:
anchor_offset_map[aid] = text_pos
if name and text and name not in ignore_text:
ans.append(text)
if name in no_visit:
continue
ignore_text_in_node_and_children = text_ignored_in_parent or name in ignore_text
if text and not ignore_text_in_node_and_children:
add_text(text)
text_pos += len(text)
if tail:
stack.append(tail)
if tail and not text_ignored_in_parent:
a((tail, ignore_text_in_node_and_children))
if children:
stack.extend(reversed(children))
for child in reversed(children):
a((child, ignore_text_in_node_and_children))
for (tail, body) in removed_tails:
if tail is not None:
body['l'] = tail

View File

@ -436,7 +436,7 @@ def text_from_serialized_html(data, get_anchor_offset_map):
anchor_offset_map[aid] = text_pos
if no_visit[src.n]:
continue
ignore_text_in_node_and_children = v'!!ignore_text[src.n]'
ignore_text_in_node_and_children = text_ignored_in_parent or v'!!ignore_text[src.n]'
if not ignore_text_in_node_and_children and src.x:
ans.push(src.x)
text_pos += src.x.length