From 6170e8c56089ee10fa1db1ea11c5eba6f5af18ea Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 18 Sep 2024 10:41:05 +0530 Subject: [PATCH] E-book viewer: Fix a regression that broke searching in Japanese books that use ruby text. Fixes #2071348 [Error when trying to jump to in-book result location if the displayed result sentence has conterpart in book that has ruby text](https://bugs.launchpad.net/calibre/+bug/2071348) --- src/calibre/gui2/viewer/search.py | 15 +++++++++++---- src/pyj/read_book/find.pyj | 22 +++++++++++++++++----- 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/src/calibre/gui2/viewer/search.py b/src/calibre/gui2/viewer/search.py index 91bac79a3a..4963726096 100644 --- a/src/calibre/gui2/viewer/search.py +++ b/src/calibre/gui2/viewer/search.py @@ -233,14 +233,16 @@ def searchable_text_for_name(name): ignore_text = frozenset({'img', 'math', 'rt', 'rp', 'rtc'}) for child in serialized_data['tree']['c']: if child.get('n') == 'body': - a((child, False)) + a((child, False, False)) # the JS code does not add the tail of body tags to flat text removed_tails.append((child.pop('l', None), child)) text_pos = 0 anchor_offset_map = OrderedDict() while stack: - node, text_ignored_in_parent = stack.pop() + node, text_ignored_in_parent, in_ruby = stack.pop() if isinstance(node, str): + if in_ruby: + node = node.strip() add_text(node) text_pos += len(node) continue @@ -258,16 +260,21 @@ def searchable_text_for_name(name): anchor_offset_map[aid] = text_pos if name in no_visit: continue + node_in_ruby = in_ruby + if not in_ruby and name == 'ruby': + in_ruby = True ignore_text_in_node_and_children = text_ignored_in_parent or name in ignore_text if text and not ignore_text_in_node_and_children: + if in_ruby: + text = text.strip() add_text(text) text_pos += len(text) if tail and not text_ignored_in_parent: - a((tail, ignore_text_in_node_and_children)) + a((tail, ignore_text_in_node_and_children, node_in_ruby)) if children: for child in 
reversed(children): - a((child, ignore_text_in_node_and_children)) + a((child, ignore_text_in_node_and_children, in_ruby)) for (tail, body) in removed_tails: if tail is not None: body['l'] = tail diff --git a/src/pyj/read_book/find.pyj b/src/pyj/read_book/find.pyj index 2861079e72..a1bfb1eb84 100644 --- a/src/pyj/read_book/find.pyj +++ b/src/pyj/read_book/find.pyj @@ -4,7 +4,7 @@ from __python__ import bound_methods, hash_literals ignored_tags = { 'style': True, 'script': True, 'noscript': True, 'title': True, 'meta': True, 'head': True, 'link': True, 'html': True, - 'img': True + 'img': True, 'rt': True, 'rp': True, 'rtc': True, } block_tags_for_tts = { @@ -16,24 +16,36 @@ def build_text_map(for_tts): flat_text = '' text_node_type = Node.TEXT_NODE element_node_type = Node.ELEMENT_NODE + in_ruby = 0 def process_node(node): - nonlocal flat_text + nonlocal flat_text, in_ruby nt = node.nodeType if nt is text_node_type: text = node.nodeValue if text and text.length: - node_list.push(v"{node: node, offset: flat_text.length, length: text.length}") - flat_text += text + if in_ruby: + rtext = text.trim() + if rtext.length: + node_list.push(v"{node: node, offset: flat_text.length, length: rtext.length, offset_in_node: text.length - text.trimStart().length}") + flat_text += rtext + else: + node_list.push(v"{node: node, offset: flat_text.length, length: text.length}") + flat_text += text elif nt is element_node_type: if not node.hasChildNodes(): return tag = node.tagName.toLowerCase() if ignored_tags[tag]: return + is_ruby_tag = tag is 'ruby' + if is_ruby_tag: + in_ruby += 1 children = node.childNodes for i in range(children.length): process_node(v'children[i]') + if is_ruby_tag: + in_ruby -= 1 if for_tts and block_tags_for_tts[tag]: # add a paragraph separator after block tags so that sentence splitting works if flat_text.length and ' \n\t\r'.indexOf(flat_text[-1]) > -1: @@ -106,7 +118,7 @@ def find_node_for_index_binary(node_list, idx_in_flat_text, start): if q.offset <= 
idx_in_flat_text and limit > idx_in_flat_text: start_node = q.node start_offset = idx_in_flat_text - q.offset - return start_node, start_offset, mid + return start_node, start_offset + (q.offset_in_node or 0), mid if limit <= idx_in_flat_text: start = mid + 1 else: