Forgot to exclude ruby tags from searching in the native viewer

Also fix ignore_text not recursing and add some performance improvements to the python function for extracting searchable text
2025-07-09 03:04:10 -04:00 · 2024-05-25 10:23:44 +05:30 · 2024-05-25 10:23:44 +05:30 · 69cf7e684b
commit 69cf7e684b
parent f3b35c318f
2 changed files with 18 additions and 10 deletions
--- a/src/calibre/gui2/viewer/search.py
+++ b/src/calibre/gui2/viewer/search.py
@ -224,21 +224,24 @@ class SearchResult:
@lru_cache(maxsize=None)
 def searchable_text_for_name(name):
    ans = []
+    add_text = ans.append
    serialized_data = json.loads(get_data(name)[0])
    stack = []
+    a = stack.append
    removed_tails = []
+    no_visit = frozenset({'script', 'style', 'title', 'head'})
+    ignore_text = frozenset({'img', 'math', 'rt', 'rp', 'rtc'})
    for child in serialized_data['tree']['c']:
        if child.get('n') == 'body':
-            stack.append(child)
+            a((child, False))
            # the JS code does not add the tail of body tags to flat text
            removed_tails.append((child.pop('l', None), child))
-    ignore_text = {'script', 'style', 'title'}
    text_pos = 0
    anchor_offset_map = OrderedDict()
    while stack:
-        node = stack.pop()
+        node, text_ignored_in_parent = stack.pop()
        if isinstance(node, str):
-            ans.append(node)
+            add_text(node)
            text_pos += len(node)
            continue
        g = node.get
@ -253,13 +256,18 @@ def searchable_text_for_name(name):
                    aid = x[1]
                    if aid not in anchor_offset_map:
                        anchor_offset_map[aid] = text_pos
-        if name and text and name not in ignore_text:
-            ans.append(text)
+        if name in no_visit:
+            continue
+        ignore_text_in_node_and_children = text_ignored_in_parent or name in ignore_text
+
+        if text and not ignore_text_in_node_and_children:
+            add_text(text)
            text_pos += len(text)
-        if tail:
-            stack.append(tail)
+        if tail and not text_ignored_in_parent:
+            a((tail, ignore_text_in_node_and_children))
        if children:
-            stack.extend(reversed(children))
+            for child in reversed(children):
+                a((child, ignore_text_in_node_and_children))
    for (tail, body) in removed_tails:
        if tail is not None:
            body['l'] = tail
--- a/src/pyj/read_book/resources.pyj
+++ b/src/pyj/read_book/resources.pyj
@ -436,7 +436,7 @@ def text_from_serialized_html(data, get_anchor_offset_map):
                        anchor_offset_map[aid] = text_pos
        if no_visit[src.n]:
            continue
-        ignore_text_in_node_and_children = v'!!ignore_text[src.n]'
+        ignore_text_in_node_and_children = text_ignored_in_parent or v'!!ignore_text[src.n]'
        if not ignore_text_in_node_and_children and src.x:
            ans.push(src.x)
            text_pos += src.x.length