E-book viewer: Fix a regression that broke searching in Japanese books that use <ruby> text. Fixes #2071348 [Error when trying to jump to in-book result location if the displayed result sentence has counterpart in book that has ruby text](https://bugs.launchpad.net/calibre/+bug/2071348)

This commit is contained in:
Kovid Goyal 2024-09-18 10:41:05 +05:30
parent 7597538345
commit 6170e8c560
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 28 additions and 9 deletions

View File

@ -233,14 +233,16 @@ def searchable_text_for_name(name):
ignore_text = frozenset({'img', 'math', 'rt', 'rp', 'rtc'}) ignore_text = frozenset({'img', 'math', 'rt', 'rp', 'rtc'})
for child in serialized_data['tree']['c']: for child in serialized_data['tree']['c']:
if child.get('n') == 'body': if child.get('n') == 'body':
a((child, False)) a((child, False, False))
# the JS code does not add the tail of body tags to flat text # the JS code does not add the tail of body tags to flat text
removed_tails.append((child.pop('l', None), child)) removed_tails.append((child.pop('l', None), child))
text_pos = 0 text_pos = 0
anchor_offset_map = OrderedDict() anchor_offset_map = OrderedDict()
while stack: while stack:
node, text_ignored_in_parent = stack.pop() node, text_ignored_in_parent, in_ruby = stack.pop()
if isinstance(node, str): if isinstance(node, str):
if in_ruby:
node = node.strip()
add_text(node) add_text(node)
text_pos += len(node) text_pos += len(node)
continue continue
@ -258,16 +260,21 @@ def searchable_text_for_name(name):
anchor_offset_map[aid] = text_pos anchor_offset_map[aid] = text_pos
if name in no_visit: if name in no_visit:
continue continue
node_in_ruby = in_ruby
if not in_ruby and name == 'ruby':
in_ruby = True
ignore_text_in_node_and_children = text_ignored_in_parent or name in ignore_text ignore_text_in_node_and_children = text_ignored_in_parent or name in ignore_text
if text and not ignore_text_in_node_and_children: if text and not ignore_text_in_node_and_children:
if in_ruby:
text = text.strip()
add_text(text) add_text(text)
text_pos += len(text) text_pos += len(text)
if tail and not text_ignored_in_parent: if tail and not text_ignored_in_parent:
a((tail, ignore_text_in_node_and_children)) a((tail, ignore_text_in_node_and_children, node_in_ruby))
if children: if children:
for child in reversed(children): for child in reversed(children):
a((child, ignore_text_in_node_and_children)) a((child, ignore_text_in_node_and_children, in_ruby))
for (tail, body) in removed_tails: for (tail, body) in removed_tails:
if tail is not None: if tail is not None:
body['l'] = tail body['l'] = tail

View File

@ -4,7 +4,7 @@ from __python__ import bound_methods, hash_literals
ignored_tags = { ignored_tags = {
'style': True, 'script': True, 'noscript': True, 'title': True, 'meta': True, 'head': True, 'link': True, 'html': True, 'style': True, 'script': True, 'noscript': True, 'title': True, 'meta': True, 'head': True, 'link': True, 'html': True,
'img': True 'img': True, 'rt': True, 'rp': True, 'rtc': True,
} }
block_tags_for_tts = { block_tags_for_tts = {
@ -16,24 +16,36 @@ def build_text_map(for_tts):
flat_text = '' flat_text = ''
text_node_type = Node.TEXT_NODE text_node_type = Node.TEXT_NODE
element_node_type = Node.ELEMENT_NODE element_node_type = Node.ELEMENT_NODE
in_ruby = 0
def process_node(node): def process_node(node):
nonlocal flat_text nonlocal flat_text, in_ruby
nt = node.nodeType nt = node.nodeType
if nt is text_node_type: if nt is text_node_type:
text = node.nodeValue text = node.nodeValue
if text and text.length: if text and text.length:
node_list.push(v"{node: node, offset: flat_text.length, length: text.length}") if in_ruby:
flat_text += text rtext = text.trim()
if rtext.length:
node_list.push(v"{node: node, offset: flat_text.length, length: rtext.length, offset_in_node: text.length - text.trimStart().length}")
flat_text += rtext
else:
node_list.push(v"{node: node, offset: flat_text.length, length: text.length}")
flat_text += text
elif nt is element_node_type: elif nt is element_node_type:
if not node.hasChildNodes(): if not node.hasChildNodes():
return return
tag = node.tagName.toLowerCase() tag = node.tagName.toLowerCase()
if ignored_tags[tag]: if ignored_tags[tag]:
return return
is_ruby_tag = tag is 'ruby'
if is_ruby_tag:
in_ruby += 1
children = node.childNodes children = node.childNodes
for i in range(children.length): for i in range(children.length):
process_node(v'children[i]') process_node(v'children[i]')
if is_ruby_tag:
in_ruby -= 1
if for_tts and block_tags_for_tts[tag]: if for_tts and block_tags_for_tts[tag]:
# add a paragraph separator after block tags so that sentence splitting works # add a paragraph separator after block tags so that sentence splitting works
if flat_text.length and ' \n\t\r'.indexOf(flat_text[-1]) > -1: if flat_text.length and ' \n\t\r'.indexOf(flat_text[-1]) > -1:
@ -106,7 +118,7 @@ def find_node_for_index_binary(node_list, idx_in_flat_text, start):
if q.offset <= idx_in_flat_text and limit > idx_in_flat_text: if q.offset <= idx_in_flat_text and limit > idx_in_flat_text:
start_node = q.node start_node = q.node
start_offset = idx_in_flat_text - q.offset start_offset = idx_in_flat_text - q.offset
return start_node, start_offset, mid return start_node, start_offset + (q.offset_in_node or 0), mid
if limit <= idx_in_flat_text: if limit <= idx_in_flat_text:
start = mid + 1 start = mid + 1
else: else: