Viewer: Fix searching in Regex and Whole words mode not working well.

Viewer: Fix searching for multiple words in fixed layout books not
working. Fixes #1863464 [Private bug](https://bugs.launchpad.net/calibre/+bug/1863464)
This commit is contained in:
Kovid Goyal 2020-02-28 14:45:07 +05:30
parent 90aba42b2a
commit 2f701318d2
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
4 changed files with 167 additions and 79 deletions

View File

@ -53,16 +53,21 @@ class BusySpinner(QWidget): # {{{
# Maps a straight quote character to the set of characters it should match.
quote_map = {'"': '"“”', "'": "'"}
# Splits on quote characters, keeping each quote as its own part.
qpat = regex.compile(r'''(['"])''')
# Splits on runs of whitespace, keeping each run as its own part.
spat = regex.compile(r'(\s+)')


def text_to_regex(text):
    """Convert literal search text into an equivalent regular expression.

    Every run of whitespace in ``text`` becomes ``\\s+``, every quote
    character listed in ``quote_map`` becomes a character class of the
    quotes it maps to, and everything else is escaped so it matches
    literally.

    :param text: the plain text typed by the user
    :return: a regular expression source string
    """
    ans = []
    for wpart in spat.split(text):
        if not wpart:
            # split() with a capturing group yields empty strings at the
            # edges when text starts/ends with whitespace (and for empty
            # text). They are not whitespace runs, so they must not add an
            # extra \s+ to the expression.
            continue
        if not wpart.strip():
            ans.append(r'\s+')
            continue
        for part in qpat.split(wpart):
            r = quote_map.get(part)
            if r is not None:
                ans.append('[' + r + ']')
            else:
                ans.append(regex.escape(part))
    return ''.join(ans)
@ -111,10 +116,11 @@ class SearchFinished(object):
class SearchResult(object):
__slots__ = ('search_query', 'before', 'text', 'after', 'spine_idx', 'index', 'file_name', '_static_text')
__slots__ = ('search_query', 'before', 'text', 'after', 'q', 'spine_idx', 'index', 'file_name', '_static_text')
def __init__(self, search_query, before, text, after, name, spine_idx, index):
def __init__(self, search_query, before, text, after, q, name, spine_idx, index):
self.search_query = search_query
self.q = q
self.before, self.text, self.after = before, text, after
self.spine_idx, self.index = spine_idx, index
self.file_name = name
@ -145,8 +151,8 @@ class SearchResult(object):
'before': self.before, 'after': self.after, 'mode': self.search_query.mode
}
def is_or_is_after(self, result_from_js):
return result_from_js['spine_idx'] == self.spine_idx and self.index >= result_from_js['index'] and result_from_js['text'] == self.text
def is_result(self, result_from_js):
return result_from_js['spine_idx'] == self.spine_idx and self.index == result_from_js['index'] and result_from_js['text'] == self.text
def __str__(self):
from collections import namedtuple
@ -179,10 +185,7 @@ def searchable_text_for_name(name):
stack.append(tail)
if children:
stack.extend(reversed(children))
# Normalize whitespace to a single space, this will cause failures
# when searching over spaces in pre nodes, but that is a lesser evil
# since the DOM converts \n, \t etc to a single space
return regex.sub(r'\s+', ' ', ''.join(ans))
return ''.join(ans)
def search_in_name(name, search_query, ctx_size=50):
@ -383,23 +386,24 @@ class Results(QListWidget): # {{{
self.item_activated()
def search_result_not_found(self, sr):
    """Drop the result that could not be located and renumber its successors.

    :param sr: the search result description received back for the result
        that was not found; it is matched against the stored results with
        ``is_result()`` and its ``'spine_idx'`` entry identifies the spine
        item it belongs to.
    """
    remove = None
    for i in range(self.count()):
        r = self.item(i).data(Qt.UserRole)
        if r.is_result(sr):
            remove = i
    if remove is None:
        return
    q = sr['spine_idx']
    # The removed result no longer counts as an occurrence, so the results
    # that follow it in the same spine item move down by one.
    # NOTE(review): this decrements every later result of the spine item,
    # while the producer numbers occurrences per query string -- confirm.
    for i in range(remove + 1, self.count()):
        r = self.item(i).data(Qt.UserRole)
        # The attribute is ``spine_idx``; ``spine_index`` is not one of the
        # declared slots of SearchResult and raised AttributeError here.
        if r.spine_idx != q:
            break
        r.index -= 1
    self.takeItem(remove)
    if remove < self.count():
        # Select whatever result now occupies the vacated row.
        self.setCurrentRow(remove)
        self.item_activated()
# }}}
@ -469,8 +473,9 @@ class SearchPanel(QWidget): # {{{
try:
for i, result in enumerate(search_in_name(name, search_query)):
before, text, after = result
self.results_found.emit(SearchResult(search_query, before, text, after, name, spine_idx, counter[text]))
counter[text] += 1
q = (before or '')[-5:] + text + (after or '')[:5]
self.results_found.emit(SearchResult(search_query, before, text, after, q, name, spine_idx, counter[q]))
counter[q] += 1
except Exception:
import traceback
traceback.print_exc()

128
src/pyj/read_book/find.pyj Normal file
View File

@ -0,0 +1,128 @@
# vim:fileencoding=utf-8
# License: GPL v3 Copyright: 2020, Kovid Goyal <kovid at kovidgoyal.net>
from __python__ import bound_methods, hash_literals
def build_text_map():
    # Walk document.body depth first and concatenate the value of every
    # text node that is reached into one flat string. For each such node
    # record where its text starts in that string and how long it is, so a
    # position in the flat string can be mapped back to (node, offset).
    # Returns an object with the flat text, the node list and a timestamp.
    text_nodes = v'[]'
    combined = ''
    skip_tags = {
        'style': True, 'script': True, 'noscript': True, 'title': True, 'meta': True, 'head': True, 'link': True, 'html': True,
        'img': True
    }

    def visit(node):
        nonlocal combined
        kind = node.nodeType
        if kind is Node.TEXT_NODE:
            value = node.nodeValue
            if value and value.length:
                text_nodes.push({'node': node, 'offset': combined.length, 'length': value.length})
                combined += value
            return
        if kind is not Node.ELEMENT_NODE or not node.hasChildNodes():
            return
        if skip_tags[node.tagName.toLowerCase()]:
            return
        # Subtrees that are not displayed or are hidden contribute no text
        computed = window.getComputedStyle(node)
        if computed.display is 'none' or computed.visibility is 'hidden':
            return
        kids = node.childNodes
        for i in range(kids.length):
            visit(v'kids[i]')

    visit(document.body)
    return {'timestamp': window.performance.now(), 'flat_text': combined, 'node_list': text_nodes}
def find_node_for_index_binary(node_list, idx_in_flat_text, start):
    # Binary search node_list (entries ordered by offset) for the entry whose
    # [offset, offset + length) range contains idx_in_flat_text, looking only
    # at positions >= start (0 when start is not given).
    # Returns (node, offset inside the node, position in node_list), or
    # (None, None, None) when no entry contains the index.
    lo = start or 0
    hi = node_list.length - 1
    while lo <= hi:
        mid = Math.floor((lo + hi) / 2)
        entry = node_list[mid]
        entry_end = entry.offset + entry.length
        if idx_in_flat_text >= entry.offset and idx_in_flat_text < entry_end:
            return entry.node, idx_in_flat_text - entry.offset, mid
        if entry_end <= idx_in_flat_text:
            lo = mid + 1
        else:
            hi = mid - 1
    return None, None, None
def find_node_for_index_linear(node_list, idx_in_flat_text, start):
    # Scan node_list forwards from position start (0 when not given) for the
    # first entry whose [offset, offset + length) range contains
    # idx_in_flat_text.
    # Returns (node, offset inside the node, position in node_list), or
    # (None, None, None) when no entry contains the index.
    i = start or 0
    while i < node_list.length:
        entry = node_list[i]
        if idx_in_flat_text >= entry.offset and idx_in_flat_text < entry.offset + entry.length:
            return entry.node, idx_in_flat_text - entry.offset, i
        i += 1
    return None, None, None
def find_specific_occurrence(q, num, before_len, after_len, text_map):
    # Locate occurrence number num (counting from zero, overlapping
    # occurrences included) of the string q in text_map.flat_text and map it
    # back to DOM positions.
    #
    # q is the wanted text with before_len characters of leading context and
    # after_len characters of trailing context; the returned range excludes
    # that context.
    #
    # Returns an object with start_node/start_offset/start_pos,
    # end_node/end_offset/end_pos and idx_in_flat_text, or nothing when q is
    # empty, occurs fewer than num + 1 times, or cannot be mapped to nodes.
    if not q or not q.length:
        return
    flat_text = text_map.flat_text
    node_list = text_map.node_list
    from_idx = 0
    match_num = -1
    while True:
        idx = flat_text.indexOf(q, from_idx)
        if idx < 0:
            break
        match_num += 1
        from_idx = idx + 1
        # Skip the occurrences that precede the requested one. (The test
        # used to be inverted, so the first occurrence was always used.)
        if match_num < num:
            continue
        start_node, start_offset, start_pos = find_node_for_index_binary(node_list, idx + before_len, 0)
        if start_node is None:
            break
        # The end is normally close to the start, so scan forwards from it
        end_idx = idx + q.length - after_len
        end_node, end_offset, end_pos = find_node_for_index_linear(node_list, end_idx, start_pos)
        if end_node is None and end_idx is flat_text.length and node_list.length:
            # The match runs to the very end of the text: no node contains
            # that index, so end the range at the end of the last node.
            end_pos = node_list.length - 1
            last = node_list[end_pos]
            end_node = last.node
            end_offset = last.length
        if end_node is not None:
            return {
                'start_node': start_node, 'start_offset': start_offset, 'start_pos': start_pos,
                'end_node': end_node, 'end_offset': end_offset, 'end_pos': end_pos,
                'idx_in_flat_text': idx
            }
        break
# Module level cache; select_search_result() stores the text map in it
cache = {}


def reset_find_caches():
    # Replace the cache with a fresh empty object so that the text map is
    # rebuilt the next time it is needed.
    nonlocal cache
    cache = {}
def select_find_result(match):
    # Make the window selection span from the start position of match to its
    # end position.
    window.getSelection().setBaseAndExtent(
        match.start_node, match.start_offset, match.end_node, match.end_offset)
def select_search_result(sr):
    # Select the text of the search result sr in the current document.
    # The result is identified by its text together with up to five
    # characters of context on either side and its occurrence number
    # sr.index. Returns True if it was found and selected, False otherwise.
    window.getSelection().removeAllRanges()
    if not cache.text_map:
        cache.text_map = build_text_map()
    prefix = sr.before[-5:] if sr.before else ''
    suffix = sr.after[:5] if sr.after else ''
    needle = prefix + sr.text + suffix
    found = find_specific_occurrence(needle, int(sr.index), prefix.length, suffix.length, cache.text_map)
    if not found:
        return False
    select_find_result(found)
    return True

View File

@ -9,6 +9,7 @@ from fs_images import fix_fullscreen_svg_images
from iframe_comm import IframeClient
from read_book.cfi import scroll_to as scroll_to_cfi
from read_book.extract import get_elements
from read_book.find import reset_find_caches, select_search_result
from read_book.flow_mode import (
anchor_funcs as flow_anchor_funcs, auto_scroll_action as flow_auto_scroll_action,
flow_onwheel, flow_to_scroll_fraction, handle_gesture as flow_handle_gesture,
@ -49,9 +50,7 @@ from read_book.touch import (
create_handlers as create_touch_handlers, reset_handlers as reset_touch_handlers
)
from read_book.viewport import scroll_viewport
from utils import (
apply_cloned_selection, clone_selection, debounce, html_escape, is_ios
)
from utils import debounce, html_escape, is_ios
FORCE_FLOW_MODE = False
CALIBRE_VERSION = '__CALIBRE_VERSION__'
@ -339,6 +338,7 @@ class IframeBoss:
self.content_loaded_stage2()
def content_loaded_stage2(self):
reset_find_caches()
self.connect_links()
self.content_ready = True
# this is the loading styles used to suppress scrollbars during load
@ -580,39 +580,9 @@ class IframeBoss:
self.send_message('find_in_spine', text=data.text, backwards=data.backwards, searched_in_spine=data.searched_in_spine)
def show_search_result(self, data, from_load):
    # Select the search result described by data.search_result. Outside of
    # flow mode the view is then snapped to the selection.
    # from_load is accepted but not used here.
    sr = data.search_result
    if select_search_result(sr):
        if current_layout_mode() is not 'flow':
            snap_to_selection()
    else:
        # Report the failure, as the window.find() based implementation
        # did, so that the receiver can drop the stale result.
        self.send_message('search_result_not_found', search_result=sr)
def reference_item_changed(self, ref_num_or_none):
self.send_message('reference_item_changed', refnum=ref_num_or_none, index=current_spine_item().index)

View File

@ -252,21 +252,6 @@ def sandboxed_html(html, style, sandbox):
return ans
def clone_selection(sel):
ans = v'[]'
for i in range(sel.rangeCount):
ans.push(sel.getRangeAt(i).cloneRange())
return ans
def apply_cloned_selection(ranges):
sel = window.getSelection()
sel.removeAllRanges()
for r in ranges:
sel.addRange(r)
return sel
if __name__ is '__main__':
from pythonize import strings
strings()