From 69cf7e684b196379499c5ac5ef171f719860b325 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 25 May 2024 10:23:44 +0530 Subject: [PATCH] Forgot to exclude ruby tags from searching in the native viewer Also fix ignore_text not recursing and add some performance improvements to the python function for extracting searchable text --- src/calibre/gui2/viewer/search.py | 26 +++++++++++++++++--------- src/pyj/read_book/resources.pyj | 2 +- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/src/calibre/gui2/viewer/search.py b/src/calibre/gui2/viewer/search.py index a6bc791c69..91bac79a3a 100644 --- a/src/calibre/gui2/viewer/search.py +++ b/src/calibre/gui2/viewer/search.py @@ -224,21 +224,24 @@ class SearchResult: @lru_cache(maxsize=None) def searchable_text_for_name(name): ans = [] + add_text = ans.append serialized_data = json.loads(get_data(name)[0]) stack = [] + a = stack.append removed_tails = [] + no_visit = frozenset({'script', 'style', 'title', 'head'}) + ignore_text = frozenset({'img', 'math', 'rt', 'rp', 'rtc'}) for child in serialized_data['tree']['c']: if child.get('n') == 'body': - stack.append(child) + a((child, False)) # the JS code does not add the tail of body tags to flat text removed_tails.append((child.pop('l', None), child)) - ignore_text = {'script', 'style', 'title'} text_pos = 0 anchor_offset_map = OrderedDict() while stack: - node = stack.pop() + node, text_ignored_in_parent = stack.pop() if isinstance(node, str): - ans.append(node) + add_text(node) text_pos += len(node) continue g = node.get @@ -253,13 +256,18 @@ def searchable_text_for_name(name): aid = x[1] if aid not in anchor_offset_map: anchor_offset_map[aid] = text_pos - if name and text and name not in ignore_text: - ans.append(text) + if name in no_visit: + continue + ignore_text_in_node_and_children = text_ignored_in_parent or name in ignore_text + + if text and not ignore_text_in_node_and_children: + add_text(text) text_pos += len(text) - if tail: - stack.append(tail) + if tail and not text_ignored_in_parent: + a((tail, ignore_text_in_node_and_children)) if children: - stack.extend(reversed(children)) + for child in reversed(children): + a((child, ignore_text_in_node_and_children)) for (tail, body) in removed_tails: if tail is not None: body['l'] = tail diff --git a/src/pyj/read_book/resources.pyj b/src/pyj/read_book/resources.pyj index 8d173948f5..cbde88a6f3 100644 --- a/src/pyj/read_book/resources.pyj +++ b/src/pyj/read_book/resources.pyj @@ -436,7 +436,7 @@ def text_from_serialized_html(data, get_anchor_offset_map): anchor_offset_map[aid] = text_pos if no_visit[src.n]: continue - ignore_text_in_node_and_children = v'!!ignore_text[src.n]' + ignore_text_in_node_and_children = text_ignored_in_parent or v'!!ignore_text[src.n]' if not ignore_text_in_node_and_children and src.x: ans.push(src.x) text_pos += src.x.length