From 69cf7e684b196379499c5ac5ef171f719860b325 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 25 May 2024 10:23:44 +0530
Subject: [PATCH] Forgot to exclude ruby tags from searching in the native
 viewer

Also fix ignore_text not being applied recursively to child nodes, and
add some performance improvements to the Python function for extracting
searchable text
---
 src/calibre/gui2/viewer/search.py | 26 +++++++++++++++++---------
 src/pyj/read_book/resources.pyj   |  2 +-
 2 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/src/calibre/gui2/viewer/search.py b/src/calibre/gui2/viewer/search.py
index a6bc791c69..91bac79a3a 100644
--- a/src/calibre/gui2/viewer/search.py
+++ b/src/calibre/gui2/viewer/search.py
@@ -224,21 +224,24 @@ class SearchResult:
 @lru_cache(maxsize=None)
 def searchable_text_for_name(name):
     ans = []
+    add_text = ans.append
     serialized_data = json.loads(get_data(name)[0])
     stack = []
+    a = stack.append
     removed_tails = []
+    no_visit = frozenset({'script', 'style', 'title', 'head'})
+    ignore_text = frozenset({'img', 'math', 'rt', 'rp', 'rtc'})
     for child in serialized_data['tree']['c']:
         if child.get('n') == 'body':
-            stack.append(child)
+            a((child, False))
             # the JS code does not add the tail of body tags to flat text
             removed_tails.append((child.pop('l', None), child))
-    ignore_text = {'script', 'style', 'title'}
     text_pos = 0
     anchor_offset_map = OrderedDict()
     while stack:
-        node = stack.pop()
+        node, text_ignored_in_parent = stack.pop()
         if isinstance(node, str):
-            ans.append(node)
+            add_text(node)
             text_pos += len(node)
             continue
         g = node.get
@@ -253,13 +256,18 @@ def searchable_text_for_name(name):
                     aid = x[1]
                     if aid not in anchor_offset_map:
                         anchor_offset_map[aid] = text_pos
-        if name and text and name not in ignore_text:
-            ans.append(text)
+        if name in no_visit:
+            continue
+        ignore_text_in_node_and_children = text_ignored_in_parent or name in ignore_text
+
+        if text and not ignore_text_in_node_and_children:
+            add_text(text)
             text_pos += len(text)
-        if tail:
-            stack.append(tail)
+        if tail and not text_ignored_in_parent:
+            a((tail, ignore_text_in_node_and_children))
         if children:
-            stack.extend(reversed(children))
+            for child in reversed(children):
+                a((child, ignore_text_in_node_and_children))
     for (tail, body) in removed_tails:
         if tail is not None:
             body['l'] = tail
diff --git a/src/pyj/read_book/resources.pyj b/src/pyj/read_book/resources.pyj
index 8d173948f5..cbde88a6f3 100644
--- a/src/pyj/read_book/resources.pyj
+++ b/src/pyj/read_book/resources.pyj
@@ -436,7 +436,7 @@ def text_from_serialized_html(data, get_anchor_offset_map):
                         anchor_offset_map[aid] = text_pos
         if no_visit[src.n]:
             continue
-        ignore_text_in_node_and_children = v'!!ignore_text[src.n]'
+        ignore_text_in_node_and_children = text_ignored_in_parent or v'!!ignore_text[src.n]'
         if not ignore_text_in_node_and_children and src.x:
             ans.push(src.x)
             text_pos += src.x.length