From 2f701318d253942039aefa53dc60bce759b7187a Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Fri, 28 Feb 2020 14:45:07 +0530
Subject: [PATCH] Viewer: Fix searching in Regex and Whole words mode not
 working well.

Viewer: Fix searching for multiple words in fixed layout books not
working. Fixes #1863464 [Private bug](https://bugs.launchpad.net/calibre/+bug/1863464)
---
 src/calibre/gui2/viewer/search.py |  61 +++++++-------
 src/pyj/read_book/find.pyj        | 128 ++++++++++++++++++++++++++++++
 src/pyj/read_book/iframe.pyj      |  42 ++--------
 src/pyj/utils.pyj                 |  15 ----
 4 files changed, 167 insertions(+), 79 deletions(-)
 create mode 100644 src/pyj/read_book/find.pyj

diff --git a/src/calibre/gui2/viewer/search.py b/src/calibre/gui2/viewer/search.py
index b712372968..50dcac5df3 100644
--- a/src/calibre/gui2/viewer/search.py
+++ b/src/calibre/gui2/viewer/search.py
@@ -53,16 +53,21 @@ class BusySpinner(QWidget):  # {{{
 
 quote_map= {'"':'"“”', "'": "'‘’"}
 qpat = regex.compile(r'''(['"])''')
+spat = regex.compile(r'(\s+)')
 
 
 def text_to_regex(text):
     ans = []
-    for part in qpat.split(text):
-        r = quote_map.get(part)
-        if r is not None:
-            ans.append('[' + r + ']')
+    for wpart in filter(None, spat.split(text)):  # split() yields '' at the edges for leading/trailing whitespace; skip those
+        if not wpart.strip():  # a run of whitespace matches any run of whitespace in the book text
+            ans.append(r'\s+')
         else:
-            ans.append(regex.escape(part))
+            for part in qpat.split(wpart):  # within a word, let a straight quote also match its curly variants
+                r = quote_map.get(part)
+                if r is not None:
+                    ans.append('[' + r + ']')
+                else:
+                    ans.append(regex.escape(part))
     return ''.join(ans)
 
 
@@ -111,10 +116,11 @@ class SearchFinished(object):
 
 class SearchResult(object):
 
-    __slots__ = ('search_query', 'before', 'text', 'after', 'spine_idx', 'index', 'file_name', '_static_text')
+    __slots__ = ('search_query', 'before', 'text', 'after', 'q', 'spine_idx', 'index', 'file_name', '_static_text')
 
-    def __init__(self, search_query, before, text, after, name, spine_idx, index):
+    def __init__(self, search_query, before, text, after, q, name, spine_idx, index):
         self.search_query = search_query
+        self.q = q
         self.before, self.text, self.after = before, text, after
         self.spine_idx, self.index = spine_idx, index
         self.file_name = name
@@ -145,8 +151,8 @@ class SearchResult(object):
             'before': self.before, 'after': self.after, 'mode': self.search_query.mode
         }
 
-    def is_or_is_after(self, result_from_js):
-        return result_from_js['spine_idx'] == self.spine_idx and self.index >= result_from_js['index'] and result_from_js['text'] == self.text
+    def is_result(self, result_from_js):  # True only when spine_idx, index and text all equal those reported back from JS
+        return result_from_js['spine_idx'] == self.spine_idx and self.index == result_from_js['index'] and result_from_js['text'] == self.text
 
     def __str__(self):
         from collections import namedtuple
@@ -179,10 +185,7 @@ def searchable_text_for_name(name):
             stack.append(tail)
         if children:
             stack.extend(reversed(children))
-    # Normalize whitespace to a single space, this will cause failures
-    # when searching over spaces in pre nodes, but that is a lesser evil
-    # since the DOM converts \n, \t etc to a single space
-    return regex.sub(r'\s+', ' ', ''.join(ans))
+    return ''.join(ans)
 
 
 def search_in_name(name, search_query, ctx_size=50):
@@ -383,23 +386,24 @@ class Results(QListWidget):  # {{{
         self.item_activated()
 
     def search_result_not_found(self, sr):
-        remove = []
+        remove = None  # row of the result the viewer could not locate, if it is in the list
         for i in range(self.count()):
             item = self.item(i)
             r = item.data(Qt.UserRole)
-            if r.is_or_is_after(sr):
-                remove.append(i)
-        if remove:
-            last_i = remove[-1]
-            if last_i < self.count() - 1:
-                self.setCurrentRow(last_i + 1)
+            if r.is_result(sr):
+                remove = i
+        if remove is not None:
+            q = sr['spine_idx']
+            for i in range(remove + 1, self.count()):  # walk the rows after the missing one
+                item = self.item(i)
+                r = item.data(Qt.UserRole)
+                if r.spine_idx != q:  # SearchResult stores this as spine_idx (see __slots__); stop at the next spine item
+                    break
+                r.index -= 1  # NOTE(review): index is counted per context string (counter[q]) - confirm results with a different q should shift too
+            self.takeItem(remove)
+            if remove < self.count():  # after removal the following row, if any, now sits at this position
+                self.setCurrentRow(remove)
                 self.item_activated()
-            elif remove[0] > 0:
-                self.setCurrentRow(remove[0] - 1)
-                self.item_activated()
-            for i in reversed(remove):
-                self.takeItem(i)
-
 # }}}
 
 
@@ -469,8 +473,9 @@ class SearchPanel(QWidget):  # {{{
                 try:
                     for i, result in enumerate(search_in_name(name, search_query)):
                         before, text, after = result
-                        self.results_found.emit(SearchResult(search_query, before, text, after, name, spine_idx, counter[text]))
-                        counter[text] += 1
+                        q = (before or '')[-5:] + text + (after or '')[:5]
+                        self.results_found.emit(SearchResult(search_query, before, text, after, q, name, spine_idx, counter[q]))
+                        counter[q] += 1
                 except Exception:
                     import traceback
                     traceback.print_exc()
diff --git a/src/pyj/read_book/find.pyj b/src/pyj/read_book/find.pyj
new file mode 100644
index 0000000000..03ea74f69d
--- /dev/null
+++ b/src/pyj/read_book/find.pyj
@@ -0,0 +1,128 @@
+# vim:fileencoding=utf-8
+# License: GPL v3 Copyright: 2020, Kovid Goyal <kovid at kovidgoyal.net>
+from __python__ import bound_methods, hash_literals
+
+
+def build_text_map():  # Concatenate the text nodes under document.body into one string, recording where each node starts
+    node_list = v'[]'  # one entry per text node: the node, its offset in flat_text and its length
+    flat_text = ''
+    ignored_tags = {
+        'style': True, 'script': True, 'noscript': True, 'title': True, 'meta': True, 'head': True, 'link': True, 'html': True,
+        'img': True
+    }
+
+    def process_node(node):  # depth-first walk in document order, appending to node_list and flat_text
+        nonlocal flat_text
+        if node.nodeType is Node.TEXT_NODE:
+            text = node.nodeValue
+            if text and text.length:  # empty text nodes are not recorded
+                node_list.push({'node': node, 'offset': flat_text.length, 'length': text.length})
+                flat_text += text
+        elif node.nodeType is Node.ELEMENT_NODE:
+            if not node.hasChildNodes():
+                return
+            tag = node.tagName.toLowerCase()
+            if ignored_tags[tag]:  # the whole subtree of an ignored tag is skipped
+                return
+            style = window.getComputedStyle(node)
+            if style.display is 'none' or style.visibility is 'hidden':  # skip the whole subtree of such an element
+                return
+            children = node.childNodes
+            for i in range(children.length):
+                process_node(v'children[i]')
+
+    process_node(document.body)
+    return {'timestamp': window.performance.now(), 'flat_text': flat_text, 'node_list': node_list}
+
+
+def find_node_for_index_binary(node_list, idx_in_flat_text, start):  # Find the node_list entry whose [offset, offset + length) range contains idx_in_flat_text
+    # Do a binary search for idx
+    start = start or 0  # optional lower bound on the node_list position to search from
+    end = node_list.length - 1
+    while start <= end:
+        mid = Math.floor((start + end)/2)
+        q = node_list[mid]
+        limit = q.offset + q.length  # exclusive end of this node's text in the flat text
+        if q.offset <= idx_in_flat_text and limit > idx_in_flat_text:
+            start_node = q.node
+            start_offset = idx_in_flat_text - q.offset  # character offset inside the node
+            return start_node, start_offset, mid
+        if limit <= idx_in_flat_text:  # target lies in a later entry
+            start = mid + 1
+        else:  # target lies in an earlier entry
+            end = mid - 1
+    return None, None, None  # no entry contains the index
+
+
+def find_node_for_index_linear(node_list, idx_in_flat_text, start):  # Same lookup as the binary variant, scanning forward from position start
+    start = start or 0
+    for i in range(start, node_list.length):
+        q = node_list[i]
+        limit = q.offset + q.length  # exclusive end of this node's text in the flat text
+        if q.offset <= idx_in_flat_text and limit > idx_in_flat_text:
+            start_node = q.node
+            start_offset = idx_in_flat_text - q.offset  # character offset inside the node
+            return start_node, start_offset, i
+    return None, None, None  # no entry at or after start contains the index
+
+
+def find_specific_occurrence(q, num, before_len, after_len, text_map):  # Locate occurrence number num (0-based) of q in text_map.flat_text
+    if not q or not q.length:  # nothing to search for; returns undefined
+        return
+    from_idx = 0
+    flat_text = text_map.flat_text
+    pos = 0
+    match_num = -1  # 0-based number of the occurrence most recently found
+    while True:
+        idx = flat_text.indexOf(q, from_idx)
+        if idx < 0:  # fewer than num + 1 occurrences exist
+            break
+        match_num += 1
+        from_idx = idx + 1  # resume one character later, so overlapping occurrences are counted
+        if match_num < num:  # not yet at the requested occurrence, keep scanning
+            continue
+        start_node, start_offset, node_pos = find_node_for_index_binary(text_map.node_list, idx + before_len, pos)  # skip the leading context
+        if start_node is not None:
+            pos = node_pos
+            end_node, end_offset, node_pos = find_node_for_index_linear(text_map.node_list, idx + q.length - after_len, pos)  # stop before the trailing context
+            if end_node is not None:
+                return {
+                    'start_node': start_node, 'start_offset': start_offset, 'start_pos': pos,
+                    'end_node': end_node, 'end_offset': end_offset, 'end_pos': node_pos,
+                    'idx_in_flat_text': idx
+                }
+        break  # requested occurrence reached but could not be mapped to nodes
+
+
+cache = {}  # module-level cache; select_search_result stores the built text map here as cache.text_map
+
+
+def reset_find_caches():  # Discard cached data so the text map is rebuilt on the next search result selection
+    nonlocal cache
+    cache = {}
+
+
+def select_find_result(match):  # Set the window selection to span from the match's start node/offset to its end node/offset
+    sel = window.getSelection()
+    sel.setBaseAndExtent(match.start_node, match.start_offset, match.end_node, match.end_offset)
+
+
+def select_search_result(sr):  # Select the occurrence described by sr in the document; returns True on success, False otherwise
+    window.getSelection().removeAllRanges()
+    if not cache.text_map:  # build the text map lazily and reuse it until reset_find_caches() is called
+        cache.text_map = build_text_map()
+    q = ''
+    before_len = after_len = 0
+    if sr.before:
+        q = sr.before[-5:]  # up to 5 characters of leading context
+        before_len = q.length
+    q += sr.text
+    if sr.after:
+        after = sr.after[:5]  # up to 5 characters of trailing context
+        after_len = after.length
+        q += after
+    match = find_specific_occurrence(q, int(sr.index), before_len, after_len, cache.text_map)  # context lengths let the selection cover only sr.text
+    if not match:
+        return False
+    select_find_result(match)
+    return True
diff --git a/src/pyj/read_book/iframe.pyj b/src/pyj/read_book/iframe.pyj
index 11a7e5ff90..40f55f0bcf 100644
--- a/src/pyj/read_book/iframe.pyj
+++ b/src/pyj/read_book/iframe.pyj
@@ -9,6 +9,7 @@ from fs_images import fix_fullscreen_svg_images
 from iframe_comm import IframeClient
 from read_book.cfi import scroll_to as scroll_to_cfi
 from read_book.extract import get_elements
+from read_book.find import reset_find_caches, select_search_result
 from read_book.flow_mode import (
     anchor_funcs as flow_anchor_funcs, auto_scroll_action as flow_auto_scroll_action,
     flow_onwheel, flow_to_scroll_fraction, handle_gesture as flow_handle_gesture,
@@ -49,9 +50,7 @@ from read_book.touch import (
     create_handlers as create_touch_handlers, reset_handlers as reset_touch_handlers
 )
 from read_book.viewport import scroll_viewport
-from utils import (
-    apply_cloned_selection, clone_selection, debounce, html_escape, is_ios
-)
+from utils import debounce, html_escape, is_ios
 
 FORCE_FLOW_MODE = False
 CALIBRE_VERSION = '__CALIBRE_VERSION__'
@@ -339,6 +338,7 @@ class IframeBoss:
         self.content_loaded_stage2()
 
     def content_loaded_stage2(self):
+        reset_find_caches()
         self.connect_links()
         self.content_ready = True
         # this is the loading styles used to suppress scrollbars during load
@@ -580,39 +580,11 @@ class IframeBoss:
                 self.send_message('find_in_spine', text=data.text, backwards=data.backwards, searched_in_spine=data.searched_in_spine)
 
     def show_search_result(self, data, from_load):
-        sr = data.search_result
-        idx = -1
-        window.getSelection().removeAllRanges()
-        while idx < sr.index:
-            if not window.find(sr.text, True, False, False, False, False):
-                self.send_message('search_result_not_found', search_result=sr)
-                break
-            if sr.mode is not 'normal':
-                # verify we have the correct match since regexes can have
-                # boundary conditions
-                sel = window.getSelection()
-                ranges = clone_selection(sel)
-                r = ranges[0]
-                if sr.before:
-                    p = r.cloneRange()
-                    p.collapse(True)
-                    sel = apply_cloned_selection(v'[p]')
-                    sel.modify('extend', 'left', 'character')
-                    if sel.toString() is not sr.before[-1]:
-                        apply_cloned_selection(ranges)
-                        continue
-                if sr.after:
-                    p = r.cloneRange()
-                    p.collapse(False)
-                    sel = apply_cloned_selection(v'[p]')
-                    sel.modify('extend', 'right', 'character')
-                    if sel.toString() is not sr.after[0]:
-                        apply_cloned_selection(ranges)
-                        continue
-                apply_cloned_selection(ranges)
-            idx += 1
-        if idx > -1 and current_layout_mode() is not 'flow':
-            snap_to_selection()
+        if select_search_result(data.search_result):  # True when the result was located and selected
+            if current_layout_mode() is not 'flow':
+                snap_to_selection()
+        else:  # report the failure, as the replaced window.find() loop did
+            self.send_message('search_result_not_found', search_result=data.search_result)
 
     def reference_item_changed(self, ref_num_or_none):
         self.send_message('reference_item_changed', refnum=ref_num_or_none, index=current_spine_item().index)
diff --git a/src/pyj/utils.pyj b/src/pyj/utils.pyj
index 625f5524e3..93e1747e10 100644
--- a/src/pyj/utils.pyj
+++ b/src/pyj/utils.pyj
@@ -252,21 +252,6 @@ def sandboxed_html(html, style, sandbox):
     return ans
 
 
-def clone_selection(sel):
-    ans = v'[]'
-    for i in range(sel.rangeCount):
-        ans.push(sel.getRangeAt(i).cloneRange())
-    return ans
-
-
-def apply_cloned_selection(ranges):
-    sel = window.getSelection()
-    sel.removeAllRanges()
-    for r in ranges:
-        sel.addRange(r)
-    return sel
-
-
 if __name__ is '__main__':
     from pythonize import strings
     strings()