Implement text search for HTML files

2025-07-09 03:04:10 -04:00 · 2016-06-02 17:36:44 +05:30 · 2016-06-02 17:36:44 +05:30 · 298b664669
commit 298b664669
parent 3c27f28fdb
3 changed files with 92 additions and 4 deletions
--- a/src/calibre/gui2/tweak_book/editor/smarts/html.py
+++ b/src/calibre/gui2/tweak_book/editor/smarts/html.py
@ -10,7 +10,7 @@ import sys, re
 from operator import itemgetter
 from cssutils import parseStyle
-from PyQt5.Qt import QTextEdit, Qt
+from PyQt5.Qt import QTextEdit, Qt, QTextCursor
 from calibre import prepare_string_for_xml, xml_entity_to_unicode
 from calibre.ebooks.oeb.polish.container import OEB_DOCS
@ -672,9 +672,62 @@ class Smarts(NullSmarts):
            return 'complete_names', (names_type, doc_name, c.root), query
    def find_text(self, pat, cursor):
        '''
        Search for pat in the text (non-tag) portions of the document that
        fall inside the range delimited by cursor.

        The document is walked block by block from its beginning, so that the
        "inside a tag"/"outside a tag" state is known when the search range
        is reached. Text found outside tags is collected as
        ``(text, document_position)`` chunks, clipped to the range, and handed
        to find_text_in_chunks().

        :param pat: Pattern object, passed unchanged to find_text_in_chunks()
        :param cursor: QTextCursor whose anchor and position delimit the
            range to search. It is copied, not modified.
        :return: ``(found, start, end)``; found is False when either value
            returned by find_text_in_chunks() is -1
        '''
        from calibre.gui2.tweak_book.text_search import find_text_in_chunks
        chunks = []
        c = QTextCursor(cursor)
        c.setPosition(0)
        # True while the scan is outside a tag. This state is carried over
        # from one block to the next, since a tag can span several blocks.
        in_text = True
        block = c.block()
        cstart = min(cursor.position(), cursor.anchor())
        cend = max(cursor.position(), cursor.anchor())

        def append(text, start):
            # Record text (located at document position start), clipped to
            # the search range. Text wholly outside the range is dropped.
            after = start + len(text)
            if start <= cend and cstart < after:
                extra = after - (cend + 1)
                if extra > 0:
                    text = text[:-extra]
                extra = cstart - start
                if extra > 0:
                    text = text[extra:]
                chunks.append((text, start + max(extra, 0)))

        # Only the end of the range bounds this loop. Blocks that precede the
        # range must still be visited to keep in_text correct; requiring them
        # to overlap the range would terminate the loop at the very first
        # block whenever the range starts further down the document.
        while block.isValid() and block.position() <= cend:
            boundaries = sorted(block.userData().tags, key=get_offset)
            if block.position() + block.length() <= cstart:
                # Block lies wholly before the range: nothing to collect,
                # just update the tag state from its last boundary.
                if boundaries:
                    in_text = not boundaries[-1].is_start
            elif not boundaries:
                # Add the whole line
                if in_text:
                    text = block.text()
                    if text:
                        append(text, block.position())
            else:
                start = block.position()
                c.setPosition(start)
                for b in boundaries:
                    if in_text:
                        # Select from the end of the previous boundary (or
                        # the block start) up to this boundary
                        c.setPosition(start + b.offset, c.KeepAnchor)
                        if c.hasSelection():
                            append(c.selectedText(), c.anchor())
                    # NOTE(review): presumably is_start marks the opening of
                    # a tag and a non-start boundary its end -- confirm
                    # against the syntax highlighter that creates the tags.
                    in_text = not b.is_start
                    c.setPosition(start + b.offset + 1)
                if in_text:
                    # Add remaining text in block
                    c.setPosition(block.position() + boundaries[-1].offset + 1)
                    c.movePosition(c.EndOfBlock, c.KeepAnchor)
                    if c.hasSelection():
                        append(c.selectedText(), c.anchor())
            block = block.next()
        s, e = find_text_in_chunks(pat, chunks)
        return s != -1 and e != -1, s, e
 if __name__ == '__main__':  # {{{
    from calibre.gui2.tweak_book.editor.widget import launch_editor
-    launch_editor('''\
+    if sys.argv[-1].endswith('.html'):
        raw = lopen(sys.argv[-1], 'rb').read().decode('utf-8')
    else:
        raw = '''\
 <!DOCTYPE html>
 <html xml:lang="en" lang="en">
 <!--
@ -703,5 +756,9 @@ if __name__ == '__main__':  # {{{
        <p>Some non-BMP unicode text:\U0001f431\U0001f431\U0001f431</p>
    </body>
 </html>
-''', path_is_raw=True, syntax='xml')
+'''
    def callback(ed):
        import regex
        ed.find_text(regex.compile('A bold word'))
    launch_editor(raw, path_is_raw=True, syntax='html', callback=callback)
 # }}}
--- a/src/calibre/gui2/tweak_book/editor/text.py
+++ b/src/calibre/gui2/tweak_book/editor/text.py
@ -380,6 +380,7 @@ class TextEdit(PlainTextEdit):
            pos = c.End if reverse else c.Start
        c.movePosition(pos, c.KeepAnchor)
        if hasattr(self.smarts, 'find_text'):
            self.highlighter.join()
            found, start, end = self.smarts.find_text(pat, c)
            if not found:
                return False
--- a/src/calibre/gui2/tweak_book/text_search.py
+++ b/src/calibre/gui2/tweak_book/text_search.py
@ -166,7 +166,7 @@ def run_text_search(search, current_editor, current_editor_name, searchable_name
        for fname, syntax in files.iteritems():
            ed = editors.get(fname, None)
            if ed is not None:
-                if ed.find_text(pat, complete=True, save_match='gui'):
+                if ed.find_text(pat, complete=True):
                    show_editor(fname)
                    return True
            else:
@ -182,3 +182,33 @@ def run_text_search(search, current_editor, current_editor_name, searchable_name
    msg = '<p>' + _('No matches were found for %s') % ('<pre style="font-style:italic">' + prepare_string_for_xml(search['find']) + '</pre>')
    return error_dialog(gui_parent, _('Not found'), msg, show=True)
 def find_text_in_chunks(pat, chunks):
    '''
    Search for pat in the concatenation of all chunk texts and translate the
    match back into the positions recorded for the chunks.

    :param pat: Object with a ``search(text)`` method that returns None or a
        match object providing ``span()``, e.g. a compiled regular expression
    :param chunks: Sequence of ``(text, start_position)`` pairs. It is
        iterated twice, so it must not be a one-shot iterator.
    :return: ``(start, end)`` where start is the position of the first
        matched character and end the position just after the last matched
        character, both expressed relative to the start_position of the chunk
        that contains them. ``(-1, -1)`` when there is no match or the match
        could not be mapped onto the chunks.
    '''
    match = pat.search(''.join(entry[0] for entry in chunks))
    if match is None:
        return -1, -1
    first, stop = match.span()
    last = stop - 1  # index of the final matched character
    begin_at = None
    consumed = 0  # length of the joined text that precedes the current chunk
    for piece, doc_pos in chunks:
        chunk_end = consumed + len(piece)
        if begin_at is None and consumed <= first < chunk_end:
            begin_at = doc_pos + (first - consumed)
        if begin_at is not None and consumed <= last < chunk_end:
            return begin_at, doc_pos + (stop - consumed)
        if chunk_end > stop:
            break  # every later chunk starts after the end of the match
        consumed = chunk_end
    return -1, -1