From 6170e8c56089ee10fa1db1ea11c5eba6f5af18ea Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Wed, 18 Sep 2024 10:41:05 +0530
Subject: [PATCH] E-book viewer: Fix a regression that broke searching in
 Japanese books that use <ruby> text. Fixes #2071348 [Error when trying to
 jump to in-book result location if the displayed result sentence has
 counterpart in book that has ruby
 text](https://bugs.launchpad.net/calibre/+bug/2071348)

---
 src/calibre/gui2/viewer/search.py | 15 +++++++++++----
 src/pyj/read_book/find.pyj        | 22 +++++++++++++++++-----
 2 files changed, 28 insertions(+), 9 deletions(-)
diff --git a/src/calibre/gui2/viewer/search.py b/src/calibre/gui2/viewer/search.py
index 91bac79a3a..4963726096 100644
--- a/src/calibre/gui2/viewer/search.py
+++ b/src/calibre/gui2/viewer/search.py
@@ -233,14 +233,16 @@ def searchable_text_for_name(name):
     ignore_text = frozenset({'img', 'math', 'rt', 'rp', 'rtc'})
     for child in serialized_data['tree']['c']:
         if child.get('n') == 'body':
-            a((child, False))
+            a((child, False, False))
             # the JS code does not add the tail of body tags to flat text
             removed_tails.append((child.pop('l', None), child))
     text_pos = 0
     anchor_offset_map = OrderedDict()
     while stack:
-        node, text_ignored_in_parent = stack.pop()
+        node, text_ignored_in_parent, in_ruby = stack.pop()
         if isinstance(node, str):
+            if in_ruby:
+                node = node.strip()
             add_text(node)
             text_pos += len(node)
             continue
@@ -258,16 +260,21 @@ def searchable_text_for_name(name):
                         anchor_offset_map[aid] = text_pos
         if name in no_visit:
             continue
+        node_in_ruby = in_ruby
+        if not in_ruby and name == 'ruby':
+            in_ruby = True
         ignore_text_in_node_and_children = text_ignored_in_parent or name in ignore_text
 
         if text and not ignore_text_in_node_and_children:
+            if in_ruby:
+                text = text.strip()
             add_text(text)
             text_pos += len(text)
         if tail and not text_ignored_in_parent:
-            a((tail, ignore_text_in_node_and_children))
+            a((tail, ignore_text_in_node_and_children, node_in_ruby))
         if children:
             for child in reversed(children):
-                a((child, ignore_text_in_node_and_children))
+                a((child, ignore_text_in_node_and_children, in_ruby))
     for (tail, body) in removed_tails:
         if tail is not None:
             body['l'] = tail
diff --git a/src/pyj/read_book/find.pyj b/src/pyj/read_book/find.pyj
index 2861079e72..a1bfb1eb84 100644
--- a/src/pyj/read_book/find.pyj
+++ b/src/pyj/read_book/find.pyj
@@ -4,7 +4,7 @@ from __python__ import bound_methods, hash_literals
 
 ignored_tags = {
     'style': True, 'script': True, 'noscript': True, 'title': True, 'meta': True, 'head': True, 'link': True, 'html': True,
-    'img': True
+    'img': True, 'rt': True, 'rp': True, 'rtc': True,
 }
 
 block_tags_for_tts = {
@@ -16,24 +16,36 @@ def build_text_map(for_tts):
     flat_text = ''
     text_node_type = Node.TEXT_NODE
     element_node_type = Node.ELEMENT_NODE
+    in_ruby = 0
 
     def process_node(node):
-        nonlocal flat_text
+        nonlocal flat_text, in_ruby
         nt = node.nodeType
         if nt is text_node_type:
             text = node.nodeValue
             if text and text.length:
-                node_list.push(v"{node: node, offset: flat_text.length, length: text.length}")
-                flat_text += text
+                if in_ruby:
+                    rtext = text.trim()
+                    if rtext.length:
+                        node_list.push(v"{node: node, offset: flat_text.length, length: rtext.length, offset_in_node: text.length - text.trimStart().length}")
+                        flat_text += rtext
+                else:
+                    node_list.push(v"{node: node, offset: flat_text.length, length: text.length}")
+                    flat_text += text
         elif nt is element_node_type:
             if not node.hasChildNodes():
                 return
             tag = node.tagName.toLowerCase()
             if ignored_tags[tag]:
                 return
+            is_ruby_tag = tag is 'ruby'
+            if is_ruby_tag:
+                in_ruby += 1
             children = node.childNodes
             for i in range(children.length):
                 process_node(v'children[i]')
+            if is_ruby_tag:
+                in_ruby -= 1
             if for_tts and block_tags_for_tts[tag]:
                 # add a paragraph separator after block tags so that sentence splitting works
                 if flat_text.length and ' \n\t\r'.indexOf(flat_text[-1]) > -1:
@@ -106,7 +118,7 @@ def find_node_for_index_binary(node_list, idx_in_flat_text, start):
         if q.offset <= idx_in_flat_text and limit > idx_in_flat_text:
             start_node = q.node
             start_offset = idx_in_flat_text - q.offset
-            return start_node, start_offset, mid
+            return start_node, start_offset + (q.offset_in_node or 0), mid
         if limit <= idx_in_flat_text:
             start = mid + 1
         else: