From 2f701318d253942039aefa53dc60bce759b7187a Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 28 Feb 2020 14:45:07 +0530 Subject: [PATCH] Viewer: Fix searching in Regex and Whole words mode not working well. Viewer: Fix searching for multiple words in fixed layout books not working. Fixes #1863464 [Private bug](https://bugs.launchpad.net/calibre/+bug/1863464) --- src/calibre/gui2/viewer/search.py | 61 +++++++------- src/pyj/read_book/find.pyj | 128 ++++++++++++++++++++++++++++++ src/pyj/read_book/iframe.pyj | 42 ++-------- src/pyj/utils.pyj | 15 ---- 4 files changed, 167 insertions(+), 79 deletions(-) create mode 100644 src/pyj/read_book/find.pyj diff --git a/src/calibre/gui2/viewer/search.py b/src/calibre/gui2/viewer/search.py index b712372968..50dcac5df3 100644 --- a/src/calibre/gui2/viewer/search.py +++ b/src/calibre/gui2/viewer/search.py @@ -53,16 +53,21 @@ class BusySpinner(QWidget): # {{{ quote_map= {'"':'"“”', "'": "'‘’"} qpat = regex.compile(r'''(['"])''') +spat = regex.compile(r'(\s+)') def text_to_regex(text): ans = [] - for part in qpat.split(text): - r = quote_map.get(part) - if r is not None: - ans.append('[' + r + ']') + for wpart in spat.split(text): + if not wpart.strip(): + ans.append(r'\s+') else: - ans.append(regex.escape(part)) + for part in qpat.split(wpart): + r = quote_map.get(part) + if r is not None: + ans.append('[' + r + ']') + else: + ans.append(regex.escape(part)) return ''.join(ans) @@ -111,10 +116,11 @@ class SearchFinished(object): class SearchResult(object): - __slots__ = ('search_query', 'before', 'text', 'after', 'spine_idx', 'index', 'file_name', '_static_text') + __slots__ = ('search_query', 'before', 'text', 'after', 'q', 'spine_idx', 'index', 'file_name', '_static_text') - def __init__(self, search_query, before, text, after, name, spine_idx, index): + def __init__(self, search_query, before, text, after, q, name, spine_idx, index): self.search_query = search_query + self.q = q self.before, self.text, self.after = 
before, text, after self.spine_idx, self.index = spine_idx, index self.file_name = name @@ -145,8 +151,8 @@ class SearchResult(object): 'before': self.before, 'after': self.after, 'mode': self.search_query.mode } - def is_or_is_after(self, result_from_js): - return result_from_js['spine_idx'] == self.spine_idx and self.index >= result_from_js['index'] and result_from_js['text'] == self.text + def is_result(self, result_from_js): + return result_from_js['spine_idx'] == self.spine_idx and self.index == result_from_js['index'] and result_from_js['text'] == self.text def __str__(self): from collections import namedtuple @@ -179,10 +185,7 @@ def searchable_text_for_name(name): stack.append(tail) if children: stack.extend(reversed(children)) - # Normalize whitespace to a single space, this will cause failures - # when searching over spaces in pre nodes, but that is a lesser evil - # since the DOM converts \n, \t etc to a single space - return regex.sub(r'\s+', ' ', ''.join(ans)) + return ''.join(ans) def search_in_name(name, search_query, ctx_size=50): @@ -383,23 +386,24 @@ class Results(QListWidget): # {{{ self.item_activated() def search_result_not_found(self, sr): - remove = [] + remove = None for i in range(self.count()): item = self.item(i) r = item.data(Qt.UserRole) - if r.is_or_is_after(sr): - remove.append(i) - if remove: - last_i = remove[-1] - if last_i < self.count() - 1: - self.setCurrentRow(last_i + 1) + if r.is_result(sr): + remove = i + if remove is not None: + q = sr['spine_idx'] + for i in range(remove + 1, self.count()): + item = self.item(i) + r = item.data(Qt.UserRole) + if r.spine_idx != q: + break + r.index -= 1 + self.takeItem(remove) + if remove < self.count(): + self.setCurrentRow(remove) self.item_activated() - elif remove[0] > 0: - self.setCurrentRow(remove[0] - 1) - self.item_activated() - for i in reversed(remove): - self.takeItem(i) - # }}} @@ -469,8 +473,9 @@ class SearchPanel(QWidget): # {{{ try: for i, result in
enumerate(search_in_name(name, search_query)): before, text, after = result - self.results_found.emit(SearchResult(search_query, before, text, after, name, spine_idx, counter[text])) - counter[text] += 1 + q = (before or '')[-5:] + text + (after or '')[:5] + self.results_found.emit(SearchResult(search_query, before, text, after, q, name, spine_idx, counter[q])) + counter[q] += 1 except Exception: import traceback traceback.print_exc() diff --git a/src/pyj/read_book/find.pyj b/src/pyj/read_book/find.pyj new file mode 100644 index 0000000000..03ea74f69d --- /dev/null +++ b/src/pyj/read_book/find.pyj @@ -0,0 +1,128 @@ +# vim:fileencoding=utf-8 +# License: GPL v3 Copyright: 2020, Kovid Goyal +from __python__ import bound_methods, hash_literals + + +def build_text_map(): + node_list = v'[]' + flat_text = '' + ignored_tags = { + 'style': True, 'script': True, 'noscript': True, 'title': True, 'meta': True, 'head': True, 'link': True, 'html': True, + 'img': True + } + + def process_node(node): + nonlocal flat_text + if node.nodeType is Node.TEXT_NODE: + text = node.nodeValue + if text and text.length: + node_list.push({'node': node, 'offset': flat_text.length, 'length': text.length}) + flat_text += text + elif node.nodeType is Node.ELEMENT_NODE: + if not node.hasChildNodes(): + return + tag = node.tagName.toLowerCase() + if ignored_tags[tag]: + return + style = window.getComputedStyle(node) + if style.display is 'none' or style.visibility is 'hidden': + return + children = node.childNodes + for i in range(children.length): + process_node(v'children[i]') + + process_node(document.body) + return {'timestamp': window.performance.now(), 'flat_text': flat_text, 'node_list': node_list} + + +def find_node_for_index_binary(node_list, idx_in_flat_text, start): + # Do a binary search for idx + start = start or 0 + end = node_list.length - 1 + while start <= end: + mid = Math.floor((start + end)/2) + q = node_list[mid] + limit = q.offset + q.length + if q.offset <= idx_in_flat_text 
and limit > idx_in_flat_text: + start_node = q.node + start_offset = idx_in_flat_text - q.offset + return start_node, start_offset, mid + if limit <= idx_in_flat_text: + start = mid + 1 + else: + end = mid - 1 + return None, None, None + + +def find_node_for_index_linear(node_list, idx_in_flat_text, start): + start = start or 0 + for i in range(start, node_list.length): + q = node_list[i] + limit = q.offset + q.length + if q.offset <= idx_in_flat_text and limit > idx_in_flat_text: + start_node = q.node + start_offset = idx_in_flat_text - q.offset + return start_node, start_offset, i + return None, None, None + + +def find_specific_occurrence(q, num, before_len, after_len, text_map): + if not q or not q.length: + return + from_idx = 0 + flat_text = text_map.flat_text + pos = 0 + match_num = -1 + while True: + idx = flat_text.indexOf(q, from_idx) + if idx < 0: + break + match_num += 1 + from_idx = idx + 1 + if match_num < num: + continue + start_node, start_offset, node_pos = find_node_for_index_binary(text_map.node_list, idx + before_len, pos) + if start_node is not None: + pos = node_pos + end_node, end_offset, node_pos = find_node_for_index_linear(text_map.node_list, idx + q.length - after_len, pos) + if end_node is not None: + return { + 'start_node': start_node, 'start_offset': start_offset, 'start_pos': pos, + 'end_node': end_node, 'end_offset': end_offset, 'end_pos': node_pos, + 'idx_in_flat_text': idx + } + break + + +cache = {} + + +def reset_find_caches(): + nonlocal cache + cache = {} + + +def select_find_result(match): + sel = window.getSelection() + sel.setBaseAndExtent(match.start_node, match.start_offset, match.end_node, match.end_offset) + + +def select_search_result(sr): + window.getSelection().removeAllRanges() + if not cache.text_map: + cache.text_map = build_text_map() + q = '' + before_len = after_len = 0 + if sr.before: + q = sr.before[-5:] + before_len = q.length + q += sr.text + if sr.after: + after = sr.after[:5] + after_len = after.length +
q += after + match = find_specific_occurrence(q, int(sr.index), before_len, after_len, cache.text_map) + if not match: + return False + select_find_result(match) + return True diff --git a/src/pyj/read_book/iframe.pyj b/src/pyj/read_book/iframe.pyj index 11a7e5ff90..40f55f0bcf 100644 --- a/src/pyj/read_book/iframe.pyj +++ b/src/pyj/read_book/iframe.pyj @@ -9,6 +9,7 @@ from fs_images import fix_fullscreen_svg_images from iframe_comm import IframeClient from read_book.cfi import scroll_to as scroll_to_cfi from read_book.extract import get_elements +from read_book.find import reset_find_caches, select_search_result from read_book.flow_mode import ( anchor_funcs as flow_anchor_funcs, auto_scroll_action as flow_auto_scroll_action, flow_onwheel, flow_to_scroll_fraction, handle_gesture as flow_handle_gesture, @@ -49,9 +50,7 @@ from read_book.touch import ( create_handlers as create_touch_handlers, reset_handlers as reset_touch_handlers ) from read_book.viewport import scroll_viewport -from utils import ( - apply_cloned_selection, clone_selection, debounce, html_escape, is_ios -) +from utils import debounce, html_escape, is_ios FORCE_FLOW_MODE = False CALIBRE_VERSION = '__CALIBRE_VERSION__' @@ -339,6 +338,7 @@ class IframeBoss: self.content_loaded_stage2() def content_loaded_stage2(self): + reset_find_caches() self.connect_links() self.content_ready = True # this is the loading styles used to suppress scrollbars during load @@ -580,39 +580,9 @@ class IframeBoss: self.send_message('find_in_spine', text=data.text, backwards=data.backwards, searched_in_spine=data.searched_in_spine) def show_search_result(self, data, from_load): - sr = data.search_result - idx = -1 - window.getSelection().removeAllRanges() - while idx < sr.index: - if not window.find(sr.text, True, False, False, False, False): - self.send_message('search_result_not_found', search_result=sr) - break - if sr.mode is not 'normal': - # verify we have the correct match since regexes can have - # boundary conditions 
- sel = window.getSelection() - ranges = clone_selection(sel) - r = ranges[0] - if sr.before: - p = r.cloneRange() - p.collapse(True) - sel = apply_cloned_selection(v'[p]') - sel.modify('extend', 'left', 'character') - if sel.toString() is not sr.before[-1]: - apply_cloned_selection(ranges) - continue - if sr.after: - p = r.cloneRange() - p.collapse(False) - sel = apply_cloned_selection(v'[p]') - sel.modify('extend', 'right', 'character') - if sel.toString() is not sr.after[0]: - apply_cloned_selection(ranges) - continue - apply_cloned_selection(ranges) - idx += 1 - if idx > -1 and current_layout_mode() is not 'flow': - snap_to_selection() + if select_search_result(data.search_result): + if current_layout_mode() is not 'flow': + snap_to_selection() def reference_item_changed(self, ref_num_or_none): self.send_message('reference_item_changed', refnum=ref_num_or_none, index=current_spine_item().index) diff --git a/src/pyj/utils.pyj b/src/pyj/utils.pyj index 625f5524e3..93e1747e10 100644 --- a/src/pyj/utils.pyj +++ b/src/pyj/utils.pyj @@ -252,21 +252,6 @@ def sandboxed_html(html, style, sandbox): return ans -def clone_selection(sel): - ans = v'[]' - for i in range(sel.rangeCount): - ans.push(sel.getRangeAt(i).cloneRange()) - return ans - - -def apply_cloned_selection(ranges): - sel = window.getSelection() - sel.removeAllRanges() - for r in ranges: - sel.addRange(r) - return sel - - if __name__ is '__main__': from pythonize import strings strings()