Implement text search for HTML files

2026-02-01 00:23:30 -05:00 · 2016-06-02 17:36:44 +05:30 · 2016-06-02 17:36:44 +05:30 · 298b664669
commit 298b664669
parent 3c27f28fdb
3 changed files with 92 additions and 4 deletions
--- a/src/calibre/gui2/tweak_book/editor/smarts/html.py
+++ b/src/calibre/gui2/tweak_book/editor/smarts/html.py
@ -10,7 +10,7 @@ import sys, re
 from operator import itemgetter

 from cssutils import parseStyle
-from PyQt5.Qt import QTextEdit, Qt
+from PyQt5.Qt import QTextEdit, Qt, QTextCursor

 from calibre import prepare_string_for_xml, xml_entity_to_unicode
 from calibre.ebooks.oeb.polish.container import OEB_DOCS
@ -672,9 +672,62 @@ class Smarts(NullSmarts):

            return 'complete_names', (names_type, doc_name, c.root), query

+    def find_text(self, pat, cursor):
+        '''
+        Search for pat in the text portions of the document that fall inside
+        the range spanned by cursor.
+
+        Every block is split at the tag boundaries stored in its user data
+        (``block.userData().tags``); only the pieces collected while
+        ``in_text`` is true are kept. The pieces are clipped to the cursor
+        range and handed, as (text, document position) chunks, to
+        find_text_in_chunks().
+
+        :param pat: Pattern object, passed unchanged to find_text_in_chunks()
+        :param cursor: QTextCursor whose anchor and position delimit the
+            range to search
+        :return: A tuple (found, start, end). found is False when either
+            start or end is -1.
+        '''
+        from calibre.gui2.tweak_book.text_search import find_text_in_chunks
+        chunks = []
+        c = QTextCursor(cursor)
+        c.setPosition(0)
+        in_text = True
+        block = c.block()
+
+        cstart = min(cursor.position(), cursor.anchor())
+        cend = max(cursor.position(), cursor.anchor())
+
+        def append(text, start):
+            # Record the part of text (which begins at document position
+            # start) that overlaps the search range; text lying completely
+            # outside the range is dropped.
+            after = start + len(text)
+            if start <= cend and cstart < after:
+                # Trim whatever lies beyond cend (the character at cend
+                # itself is kept)
+                extra = after - (cend + 1)
+                if extra > 0:
+                    text = text[:-extra]
+                # Trim whatever lies before cstart
+                extra = cstart - start
+                if extra > 0:
+                    text = text[extra:]
+                chunks.append((text, start + max(extra, 0)))
+
+        # The walk has to begin at the first block of the document, because
+        # in_text is carried over from one block to the next. Blocks located
+        # before cstart must therefore be traversed as well and must not end
+        # the loop; append() discards their text. Only going past cend stops
+        # the scan.
+        while block.isValid() and block.position() <= cend:
+            boundaries = sorted(block.userData().tags, key=get_offset)
+            if not boundaries:
+                # Add the whole line
+                if in_text:
+                    text = block.text()
+                    if text:
+                        append(text, block.position())
+            else:
+                start = block.position()
+                c.setPosition(start)
+                for b in boundaries:
+                    if in_text:
+                        # Select from the end of the previous boundary up to
+                        # this boundary
+                        c.setPosition(start + b.offset, c.KeepAnchor)
+                        if c.hasSelection():
+                            append(c.selectedText(), c.anchor())
+                    in_text = not b.is_start
+                    # Continue just after the boundary character
+                    c.setPosition(start + b.offset + 1)
+                if in_text:
+                    # Add remaining text in block
+                    c.setPosition(block.position() + boundaries[-1].offset + 1)
+                    c.movePosition(c.EndOfBlock, c.KeepAnchor)
+                    if c.hasSelection():
+                        append(c.selectedText(), c.anchor())
+            block = block.next()
+        s, e = find_text_in_chunks(pat, chunks)
+        return s != -1 and e != -1, s, e
+
 if __name__ == '__main__':  # {{{
    from calibre.gui2.tweak_book.editor.widget import launch_editor
-    launch_editor('''\
+    if sys.argv[-1].endswith('.html'):
+        raw = lopen(sys.argv[-1], 'rb').read().decode('utf-8')
+    else:
+        raw = '''\
 <!DOCTYPE html>
 <html xml:lang="en" lang="en">
 <!--
@ -703,5 +756,9 @@ if __name__ == '__main__':  # {{{
        <p>Some non-BMP unicode text:\U0001f431\U0001f431\U0001f431</p>
    </body>
 </html>
-''', path_is_raw=True, syntax='xml')
+'''
+    def callback(ed):
+        import regex
+        ed.find_text(regex.compile('A bold word'))
+    launch_editor(raw, path_is_raw=True, syntax='html', callback=callback)
 # }}}
--- a/src/calibre/gui2/tweak_book/editor/text.py
+++ b/src/calibre/gui2/tweak_book/editor/text.py
@ -380,6 +380,7 @@ class TextEdit(PlainTextEdit):
            pos = c.End if reverse else c.Start
        c.movePosition(pos, c.KeepAnchor)
        if hasattr(self.smarts, 'find_text'):
+            self.highlighter.join()
            found, start, end = self.smarts.find_text(pat, c)
            if not found:
                return False
--- a/src/calibre/gui2/tweak_book/text_search.py
+++ b/src/calibre/gui2/tweak_book/text_search.py
@ -166,7 +166,7 @@ def run_text_search(search, current_editor, current_editor_name, searchable_name
        for fname, syntax in files.iteritems():
            ed = editors.get(fname, None)
            if ed is not None:
-                if ed.find_text(pat, complete=True, save_match='gui'):
+                if ed.find_text(pat, complete=True):
                    show_editor(fname)
                    return True
            else:
@ -182,3 +182,33 @@ def run_text_search(search, current_editor, current_editor_name, searchable_name

    msg = '<p>' + _('No matches were found for %s') % ('<pre style="font-style:italic">' + prepare_string_for_xml(search['find']) + '</pre>')
    return error_dialog(gui_parent, _('Not found'), msg, show=True)
+
+def find_text_in_chunks(pat, chunks):
+    '''
+    Search for pat in the concatenation of all chunk texts and translate the
+    span of the first match back into the positions used by the chunks.
+
+    :param pat: Object with a ``search()`` method returning a match object
+        (or None), e.g. a compiled regular expression
+    :param chunks: Sequence of (text, start_position) pairs, in order
+    :return: (start_pos, end_pos) of the match, or (-1, -1) if there is no
+        match. A zero-width match located at the very start of a chunk is
+        also reported as (-1, -1).
+    '''
+    joined = ''.join(item[0] for item in chunks)
+    match = pat.search(joined)
+    if match is None:
+        return -1, -1
+    first, past = match.span()
+    last = past - 1  # index of the final matched character
+
+    consumed = 0  # number of characters of joined text before this chunk
+    begin = None
+    for piece, piece_pos in chunks:
+        limit = consumed + len(piece)
+        # Chunks that end before the match begins only advance the offset
+        if limit >= first:
+            if begin is None and consumed <= first < limit:
+                begin = piece_pos + first - consumed
+            if begin is not None and consumed <= last < limit:
+                return begin, piece_pos + past - consumed
+            if limit > past:
+                break  # every later chunk starts after the match
+        consumed = limit
+    return -1, -1