E-book viewer: Fix a regression that broke searching in Japanese books that use <ruby> text. Fixes #2071348 [Error when trying to jump to in-book result location if the displayed result sentence has counterpart in book that has ruby text](https://bugs.launchpad.net/calibre/+bug/2071348)

2025-08-11 09:13:57 -04:00 · 2024-09-18 10:41:05 +05:30 · 2024-09-18 10:41:05 +05:30 · 6170e8c560
commit 6170e8c560
parent 7597538345
2 changed files with 28 additions and 9 deletions
--- a/src/calibre/gui2/viewer/search.py
+++ b/src/calibre/gui2/viewer/search.py
@ -233,14 +233,16 @@ def searchable_text_for_name(name):
    ignore_text = frozenset({'img', 'math', 'rt', 'rp', 'rtc'})
    for child in serialized_data['tree']['c']:
        if child.get('n') == 'body':
-            a((child, False))
+            a((child, False, False))
            # the JS code does not add the tail of body tags to flat text
            removed_tails.append((child.pop('l', None), child))
    text_pos = 0
    anchor_offset_map = OrderedDict()
    while stack:
-        node, text_ignored_in_parent = stack.pop()
+        node, text_ignored_in_parent, in_ruby = stack.pop()
        if isinstance(node, str):
+            if in_ruby:
+                node = node.strip()
            add_text(node)
            text_pos += len(node)
            continue
@ -258,16 +260,21 @@ def searchable_text_for_name(name):
                        anchor_offset_map[aid] = text_pos
        if name in no_visit:
            continue
+        node_in_ruby = in_ruby
+        if not in_ruby and name == 'ruby':
+            in_ruby = True
        ignore_text_in_node_and_children = text_ignored_in_parent or name in ignore_text

        if text and not ignore_text_in_node_and_children:
+            if in_ruby:
+                text = text.strip()
            add_text(text)
            text_pos += len(text)
        if tail and not text_ignored_in_parent:
-            a((tail, ignore_text_in_node_and_children))
+            a((tail, ignore_text_in_node_and_children, node_in_ruby))
        if children:
            for child in reversed(children):
-                a((child, ignore_text_in_node_and_children))
+                a((child, ignore_text_in_node_and_children, in_ruby))
    for (tail, body) in removed_tails:
        if tail is not None:
            body['l'] = tail
--- a/src/pyj/read_book/find.pyj
+++ b/src/pyj/read_book/find.pyj
@ -4,7 +4,7 @@ from __python__ import bound_methods, hash_literals

 ignored_tags = {
    'style': True, 'script': True, 'noscript': True, 'title': True, 'meta': True, 'head': True, 'link': True, 'html': True,
-    'img': True
+    'img': True, 'rt': True, 'rp': True, 'rtc': True,
 }

 block_tags_for_tts = {
@ -16,13 +16,20 @@ def build_text_map(for_tts):
    flat_text = ''
    text_node_type = Node.TEXT_NODE
    element_node_type = Node.ELEMENT_NODE
+    in_ruby = 0

    def process_node(node):
-        nonlocal flat_text
+        nonlocal flat_text, in_ruby
        nt = node.nodeType
        if nt is text_node_type:
            text = node.nodeValue
            if text and text.length:
+                if in_ruby:
+                    rtext = text.trim()
+                    if rtext.length:
+                        node_list.push(v"{node: node, offset: flat_text.length, length: rtext.length, offset_in_node: text.length - text.trimStart().length}")
+                        flat_text += rtext
+                else:
                    node_list.push(v"{node: node, offset: flat_text.length, length: text.length}")
                    flat_text += text
        elif nt is element_node_type:
@ -31,9 +38,14 @@ def build_text_map(for_tts):
            tag = node.tagName.toLowerCase()
            if ignored_tags[tag]:
                return
+            is_ruby_tag = tag is 'ruby'
+            if is_ruby_tag:
+                in_ruby += 1
            children = node.childNodes
            for i in range(children.length):
                process_node(v'children[i]')
+            if is_ruby_tag:
+                in_ruby -= 1
            if for_tts and block_tags_for_tts[tag]:
                # add a paragraph separator after block tags so that sentence splitting works
                if flat_text.length and ' \n\t\r'.indexOf(flat_text[-1]) > -1:
@ -106,7 +118,7 @@ def find_node_for_index_binary(node_list, idx_in_flat_text, start):
        if q.offset <= idx_in_flat_text and limit > idx_in_flat_text:
            start_node = q.node
            start_offset = idx_in_flat_text - q.offset
-            return start_node, start_offset, mid
+            return start_node, start_offset + (q.offset_in_node or 0), mid
        if limit <= idx_in_flat_text:
            start = mid + 1
        else: