Edit Book: Fix the "Search ignoring markup" tool not ignoring comments, processing instructions, etc. Fixes #1651160 [Search ignoring HTML markup finds within <?xml directive and HTML comments](https://bugs.launchpad.net/calibre/+bug/1651160)

2025-07-08 10:44:09 -04:00 · 2016-12-20 11:23:20 +05:30 · 2016-12-20 11:23:20 +05:30 · 306bb0ff2f
commit 306bb0ff2f
parent 7199d30fa1
2 changed files with 24 additions and 2 deletions
--- a/src/calibre/gui2/tweak_book/editor/smarts/html.py
+++ b/src/calibre/gui2/tweak_book/editor/smarts/html.py
@@ -8,6 +8,7 @@ __copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'

 import sys, re
 from operator import itemgetter
+from itertools import chain

 from cssutils import parseStyle
 from PyQt5.Qt import QTextEdit, Qt, QTextCursor
@@ -700,6 +701,16 @@ class Smarts(NullSmarts):
        c.setPosition(cstart)
        block = c.block()
        in_text = find_tag_definition(block, 0)[0] is None
+        if in_text:
+            # Check if we are in comment/PI/etc.
+            pb = block.previous()
+            while pb.isValid():
+                boundaries = pb.userData().non_tag_structures
+                if boundaries:
+                    if boundaries[-1].is_start:
+                        in_text = False
+                    break
+                pb = pb.previous()

        def append(text, start):
            text = text.replace(PARAGRAPH_SEPARATOR, '\n')
@@ -714,7 +725,8 @@ class Smarts(NullSmarts):
                chunks.append((text, start + max(extra, 0)))

        while block.isValid() and block.position() <= cend:
-            boundaries = sorted(block.userData().tags, key=get_offset)
+            ud = block.userData()
+            boundaries = sorted(chain(ud.tags, ud.non_tag_structures), key=get_offset)
            if not boundaries:
                # Add the whole line
                if in_text:
--- a/src/calibre/gui2/tweak_book/editor/syntax/html.py
+++ b/src/calibre/gui2/tweak_book/editor/syntax/html.py
@@ -51,6 +51,7 @@ CSS = 11

 TagStart = namedtuple('TagStart', 'offset prefix name closing is_start')
 TagEnd = namedtuple('TagEnd', 'offset self_closing is_start')
+NonTagBoundary = namedtuple('NonTagBoundary', 'offset is_start type')
 Attr = namedtuple('Attr', 'offset type data')

 LINK_ATTRS = frozenset(('href', 'src', 'poster', 'xlink:href'))
@@ -62,6 +63,7 @@ def refresh_spell_check_status():
    global do_spell_check
    do_spell_check = tprefs['inline_spell_check'] and hasattr(dictionaries, 'active_user_dictionaries')

+
 from calibre.constants import plugins

 _speedup = plugins['html'][0]
@@ -223,12 +225,13 @@ class HTMLUserData(QTextBlockUserData):
        QTextBlockUserData.__init__(self)
        self.tags = []
        self.attributes = []
+        self.non_tag_structures = []
        self.state = State()
        self.css_user_data = None
        self.doc_name = None

    def clear(self, state=None, doc_name=None):
-        self.tags, self.attributes = [], []
+        self.tags, self.attributes, self.non_tag_structures = [], [], []
        self.state = State() if state is None else state
        self.doc_name = doc_name

@@ -247,6 +250,7 @@ class XMLUserData(HTMLUserData):
 def add_tag_data(user_data, tag):
    user_data.tags.append(tag)

+
 ATTR_NAME, ATTR_VALUE, ATTR_START, ATTR_END = object(), object(), object(), object()


@@ -333,14 +337,17 @@ def normal(state, text, i, formats, user_data):
    if ch == '<':
        if text[i:i+4] == '<!--':
            state.parse, fmt = IN_COMMENT, formats['comment']
+            user_data.non_tag_structures.append(NonTagBoundary(i, True, IN_COMMENT))
            return [(4, fmt)]

        if text[i:i+2] == '<?':
            state.parse, fmt = IN_PI, formats['preproc']
+            user_data.non_tag_structures.append(NonTagBoundary(i, True, IN_PI))
            return [(2, fmt)]

        if text[i:i+2] == '<!' and text[i+2:].lstrip().lower().startswith('doctype'):
            state.parse, fmt = IN_DOCTYPE, formats['preproc']
+            user_data.non_tag_structures.append(NonTagBoundary(i, True, IN_DOCTYPE))
            return [(2, fmt)]

        m = tag_name_pat.match(text, i + 1)
@@ -497,10 +504,12 @@ def in_comment(state, text, i, formats, user_data):
    if pos == -1:
        num = len(text) - i
    else:
+        user_data.non_tag_structures.append(NonTagBoundary(pos, False, state.parse))
        num = pos - i + len(end)
        state.parse = NORMAL
    return [(num, fmt)]

+
 state_map = {
    NORMAL:normal,
    IN_OPENING_TAG: partial(opening_tag, cdata_tags),
@@ -616,6 +625,7 @@ def profile():
    del doc
    del app

+
 if __name__ == '__main__':
    from calibre.gui2.tweak_book.editor.widget import launch_editor
    launch_editor('''\