Refactor HTML syntax highlighter to keep track of tag nesting

This is needed for the eventual implementation of inline spellcheck
2025-07-09 03:04:10 -04:00 · 2014-02-19 15:12:53 +05:30 · 2014-02-19 15:12:53 +05:30 · a1a4585167
commit a1a4585167
parent 5481f1f820
3 changed files with 214 additions and 114 deletions
--- a/src/calibre/gui2/tweak_book/editor/syntax/base.py
+++ b/src/calibre/gui2/tweak_book/editor/syntax/base.py
@ -29,13 +29,15 @@ def run_loop(state, state_map, formats, text):
 class SyntaxHighlighter(QSyntaxHighlighter):
    state_class = SimpleState
    state_map = {0:lambda state, text, i, formats:[(len(text), None)]}
    create_formats_func = lambda highlighter: {}
    def __init__(self, *args, **kwargs):
        QSyntaxHighlighter.__init__(self, *args, **kwargs)
    def create_state(self, num):
        return SimpleState(max(0, num))
    def rehighlight(self):
        self.outlineexplorer_data = {}
        QApplication.setOverrideCursor(QCursor(Qt.WaitCursor))
@ -54,9 +56,7 @@ class SyntaxHighlighter(QSyntaxHighlighter):
        try:
            state = self.previousBlockState()
            self.setCurrentBlockUserData(None)  # Ensure that any stale user data is discarded
-            if state == -1:
+            state = self.create_state(state)
                state = 0
            state = self.state_class(state)
            state.get_user_data, state.set_user_data = self.currentBlockUserData, self.setCurrentBlockUserData
            for i, num, fmt in run_loop(state, self.state_map, self.formats, unicode(text)):
                if fmt is not None:
--- a/src/calibre/gui2/tweak_book/editor/syntax/css.py
+++ b/src/calibre/gui2/tweak_book/editor/syntax/css.py
@ -251,9 +251,11 @@ def create_formats(highlighter):
 class CSSHighlighter(SyntaxHighlighter):
    ''' Syntax highlighter for CSS, wired to this module's state class,
    state map and format factory. '''
    state_class = State
    state_map = state_map
    create_formats_func = create_formats
    def create_state(self, num):
        ' Return a State for the block state number num; negative numbers are treated as 0 '
        clamped = num if num > 0 else 0
        return State(clamped)
 if __name__ == '__main__':
    from calibre.gui2.tweak_book.editor.widget import launch_editor
    launch_editor('''\
--- a/src/calibre/gui2/tweak_book/editor/syntax/html.py
+++ b/src/calibre/gui2/tweak_book/editor/syntax/html.py
@ -31,55 +31,170 @@ unquoted_val_pat = re.compile(r'''[^%s'"=<>`]+''' % space_chars)
 cdata_close_pats = {x:re.compile(r'</%s' % x, flags=re.I) for x in cdata_tags}
 nbsp_pat = re.compile('[\xa0\u2000-\u200A\u202F\u205F\u3000\u2011-\u2015\uFE58\uFE63\uFF0D]+')  # special spaces and hyphens
-class State(object):
+NORMAL = 0
-
+IN_OPENING_TAG = 1
-    ''' Store the parsing state, a stack of bold and italic formatting and the
+IN_CLOSING_TAG = 2
-    last seen open tag, all in a single integer, so that it can be used with.
+IN_COMMENT = 3
-    This assumes an int is at least 32 bits.'''
+IN_PI = 4
-
+IN_DOCTYPE = 5
-    NORMAL = 0
+ATTRIBUTE_NAME = 6
-    IN_OPENING_TAG = 1
+ATTRIBUTE_VALUE = 7
-    IN_CLOSING_TAG = 2
+SQ_VAL = 8
-    IN_COMMENT = 3
+DQ_VAL = 9
-    IN_PI = 4
+CDATA = 10
-    IN_DOCTYPE = 5
+CSS = 11
    ATTRIBUTE_NAME = 6
    ATTRIBUTE_VALUE = 7
    SQ_VAL = 8
    DQ_VAL = 9
    CDATA = 10
    CSS = 11
    TAGS = {x:i+1 for i, x in enumerate(cdata_tags | bold_tags | italic_tags)}
    TAGS_RMAP = {v:k for k, v in TAGS.iteritems()}
    UNKNOWN_TAG = '___'
    def __init__(self, num):
        self.parse  = num & 0b1111
        self.bold   = (num >> 4) & 0b11111111
        self.italic = (num >> 12) & 0b11111111
        self.tag    = self.TAGS_RMAP.get(num >> 20, self.UNKNOWN_TAG)
        self.css    = 0
        if self.parse == State.CSS:
            self.css = num >> 4
    @property
    def value(self):
        if self.parse == State.CSS:
            return ((self.parse & 0b1111) | (self.css << 4))
        tag = self.TAGS.get(self.tag.lower(), 0)
        return ((self.parse & 0b1111) |
                ((max(0, self.bold) & 0b11111111) << 4) |
                ((max(0, self.italic) & 0b11111111) << 12) |
                (tag << 20))
    def clear(self):
        self.parse = self.bold = self.italic = self.css = 0
        self.tag = self.UNKNOWN_TAG
 TagStart = namedtuple('TagStart', 'offset prefix name closing is_start')
 TagEnd = namedtuple('TagEnd', 'offset self_closing is_start')
 class Tag(object):
    ''' A single open tag: its name, whether it makes text bold and/or
    italic, and its lang (None until assigned). Equality is based on
    (name, lang). The hash is a cached value that is only recomputed when
    update_hash() is called; it is 0 for a freshly created Tag. '''
    __slots__ = ('name', 'bold', 'italic', 'lang', 'hash')
    def __init__(self, name, bold=None, italic=None):
        # When not given explicitly, bold/italic are derived from the tag name
        if bold is None:
            bold = name in bold_tags
        if italic is None:
            italic = name in italic_tags
        self.name, self.bold, self.italic = name, bold, italic
        self.lang, self.hash = None, 0
    def __hash__(self):
        # Cached value, see update_hash()
        return self.hash
    def __eq__(self, other):
        # The getattr defaults make comparison against non-Tag objects
        # evaluate as unequal instead of raising
        if not (self.name == getattr(other, 'name', None)):
            return False
        return self.lang == getattr(other, 'lang', False)
    def copy(self):
        ' Return a new Tag with the same name, bold, italic, lang and cached hash '
        dup = Tag(self.name, self.bold, self.italic)
        dup.lang = self.lang
        dup.hash = self.hash
        return dup
    def update_hash(self):
        ' Recompute the cached hash from the current (name, lang) '
        self.hash = hash((self.name, self.lang))
 class State(object):
    ''' The parse state of the HTML highlighter.

    tags is the stack of currently open tags (innermost last),
    tag_being_defined is the Tag created by open_tag() that has not yet been
    pushed by finish_opening_tag(), parse is one of the module level parse
    constants (NORMAL, IN_OPENING_TAG, ...) and sub_parser_state is the state
    number of the embedded sub-parser (used for CSS). is_bold, is_italic and
    current_lang are derived from the open tags.

    Equality and hashing only consider parse, sub_parser_state,
    tag_being_defined and tags. '''
    __slots__ = ('tag_being_defined', 'tags', 'is_bold', 'is_italic',
                 'current_lang', 'parse', 'get_user_data', 'set_user_data',
                 'css_formats', 'stack', 'sub_parser_state', 'default_lang')
    def __init__(self):
        self.tags = []
        self.is_bold = self.is_italic = False
        self.tag_being_defined = self.current_lang = self.get_user_data = self.set_user_data = \
            self.css_formats = self.stack = self.sub_parser_state = self.default_lang = None
        self.parse = NORMAL
    def copy(self):
        ''' Return an independent copy of this state. The tags list and the
        Tag objects in it are duplicated, all other attributes are shared by
        reference. This state itself is left untouched. '''
        ans = State()
        for x in self.__slots__:
            setattr(ans, x, getattr(self, x))
        # Give the copy its own tag objects; rebinding them on self instead
        # would silently mutate the state being copied.
        ans.tags = [x.copy() for x in self.tags]
        if ans.tag_being_defined is not None:
            ans.tag_being_defined = ans.tag_being_defined.copy()
        return ans
    @property
    def value(self):
        ' The index number for this state, as assigned by self.stack '
        if self.tag_being_defined is not None:
            # The Tag hash is cached, refresh it before this state is hashed
            self.tag_being_defined.update_hash()
        return self.stack.index_for(self)
    def __hash__(self):
        return hash((self.parse, self.sub_parser_state, self.tag_being_defined, tuple(self.tags)))
    def __eq__(self, other):
        # The getattr defaults make comparison with non-State objects
        # evaluate as unequal instead of raising
        return (
            self.parse == getattr(other, 'parse', -1) and
            self.sub_parser_state == getattr(other, 'sub_parser_state', -1) and
            self.tag_being_defined == getattr(other, 'tag_being_defined', False) and
            self.tags == getattr(other, 'tags', None)
        )
    def open_tag(self, name):
        ' Start defining a new tag; it becomes open in finish_opening_tag() '
        self.tag_being_defined = Tag(name)
    def close_tag(self, name):
        ''' Close the innermost open tag called name, together with every tag
        opened after it. Does nothing if no such tag is open. '''
        for depth, tag in enumerate(reversed(self.tags), 1):
            if tag.name == name:
                break
        else:
            return  # No matching open tag found, ignore the closing tag
        # Remove all tags up to and including the matching open tag
        self.tags = self.tags[:-depth]
        self.sub_parser_state = 0
        # Check if we should still be bold or italic
        if self.is_bold:
            self.is_bold = any(tag.bold for tag in self.tags)
        if self.is_italic:
            self.is_italic = any(tag.italic for tag in self.tags)
        # Set the current language to the innermost lang attribute in a still open tag
        self.current_lang = next(
            (tag.lang for tag in reversed(self.tags) if tag.lang is not None), None)
    def finish_opening_tag(self, cdata_tags):
        ''' Called when the opening tag being defined is complete. Pushes it
        onto the stack of open tags and updates the bold/italic/lang state.
        If the tag name is in cdata_tags, parsing switches to CSS (for style)
        or CDATA (for everything else), otherwise to NORMAL. '''
        self.parse = NORMAL
        if self.tag_being_defined is None:
            return
        t, self.tag_being_defined = self.tag_being_defined, None
        t.update_hash()
        self.tags.append(t)
        self.is_bold = self.is_bold or t.bold
        self.is_italic = self.is_italic or t.italic
        self.current_lang = t.lang or self.current_lang
        if t.name in cdata_tags:
            self.parse = CSS if t.name == 'style' else CDATA
            self.sub_parser_state = 0
    def __repr__(self):
        return '<State %s is_bold=%s is_italic=%s current_lang=%s>' % (
            '->'.join(x.name for x in self.tags), self.is_bold, self.is_italic, self.current_lang)
    __str__ = __repr__
 class Stack(object):
    ''' Maintain an efficient bi-directional mapping between states and index
    numbers. Ensures that if state1 == state2 then their corresponding index
    numbers are the same and vice versa. This is needed so that the state
    number passed to Qt does not change unless the underlying state has
    actually changed. '''
    def __init__(self):
        self.index_map = []  # index number -> state
        self.state_map = {}  # state -> index number
    def index_for(self, state):
        ''' Return the index number for state, assigning the next free number
        if no equal state has been seen before. state must be hashable. '''
        ans = self.state_map.get(state, None)
        if ans is None:
            self.state_map[state] = ans = len(self.index_map)
            self.index_map.append(state)
        return ans
    def state_for(self, index):
        ''' Return the state registered under index, or None if no state has
        that index number. '''
        if index < 0:
            # Index numbers are never negative. Without this check Python's
            # negative indexing would return an unrelated state.
            return None
        try:
            return self.index_map[index]
        except IndexError:
            return None
 class HTMLUserData(QTextBlockUserData):
    ''' Per text block user data for the HTML highlighter; holds a list of
    tag records for the block. '''
    def __init__(self):
        QTextBlockUserData.__init__(self)
        # Starts out empty; presumably filled with TagStart/TagEnd records by add_tag_data() -- confirm
        self.tags = []
 def add_tag_data(state, tag):
    ud = q = state.get_user_data()
    if ud is None:
@ -97,37 +212,38 @@ def css(state, text, i, formats):
    else:
        css_text = text[i:m.start()]
    ans = []
-    css_state = CSSState(state.css)
+    css_state = CSSState(state.sub_parser_state)
    for j, num, fmt in run_loop(css_state, css_state_map, state.css_formats, css_text):
        ans.append((num, fmt))
-    state.css = css_state.value
+    state.sub_parser_state = css_state.value
    if m is not None:
-        state.clear()
+        state.sub_parser_state = 0
-        state.parse = State.IN_CLOSING_TAG
+        state.parse = IN_CLOSING_TAG
        add_tag_data(state, TagStart(m.start(), 'style', '', True, True))
        ans.extend([(2, formats['end_tag']), (len(m.group()) - 2, formats['tag_name'])])
    return ans
 def cdata(state, text, i, formats):
    'CDATA inside tags like <title> or <style>'
-    pat = cdata_close_pats[state.tag]
+    name = state.tags[-1].name
    pat = cdata_close_pats[name]
    m = pat.search(text, i)
-    fmt = formats['title' if state.tag == 'title' else 'special']
+    fmt = formats['title' if name == 'title' else 'special']
    if m is None:
        return [(len(text) - i, fmt)]
-    state.parse = State.IN_CLOSING_TAG
+    state.parse = IN_CLOSING_TAG
    num = m.start() - i
-    add_tag_data(state, TagStart(m.start(), state.tag, '', True, True))
+    add_tag_data(state, TagStart(m.start(), name, '', True, True))
    return [(num, fmt), (2, formats['end_tag']), (len(m.group()) - 2, formats['tag_name'])]
 def mark_nbsp(state, text, nbsp_format):
    ans = []
    fmt = None
-    if state.bold or state.italic:
+    if state.is_bold or state.is_italic:
        fmt = SyntaxTextCharFormat()
-        if state.bold:
+        if state.is_bold:
            fmt.setFontWeight(QFont.Bold)
-        if state.italic:
+        if state.is_italic:
            fmt.setFontItalic(True)
    last = 0
    for m in nbsp_pat.finditer(text):
@ -137,26 +253,20 @@ def mark_nbsp(state, text, nbsp_format):
        ans = [(len(text), fmt)]
    return ans
 class HTMLUserData(QTextBlockUserData):
    ''' Per text block user data for the HTML highlighter; holds a list of
    tag records for the block. '''
    def __init__(self):
        QTextBlockUserData.__init__(self)
        # Starts out empty; presumably filled with TagStart/TagEnd records by add_tag_data() -- confirm
        self.tags = []
 def normal(state, text, i, formats):
    ' The normal state in between tags '
    ch = text[i]
    if ch == '<':
        if text[i:i+4] == '<!--':
-            state.parse, fmt = state.IN_COMMENT, formats['comment']
+            state.parse, fmt = IN_COMMENT, formats['comment']
            return [(4, fmt)]
        if text[i:i+2] == '<?':
-            state.parse, fmt = state.IN_PI, formats['preproc']
+            state.parse, fmt = IN_PI, formats['preproc']
            return [(2, fmt)]
        if text[i:i+2] == '<!' and text[i+2:].lstrip().lower().startswith('doctype'):
-            state.parse, fmt = state.IN_DOCTYPE, formats['preproc']
+            state.parse, fmt = IN_DOCTYPE, formats['preproc']
            return [(2, fmt)]
        m = tag_name_pat.match(text, i + 1)
@ -165,16 +275,16 @@ def normal(state, text, i, formats):
        name = m.group()
        closing = name.startswith('/')
-        state.parse = state.IN_CLOSING_TAG if closing else state.IN_OPENING_TAG
+        state.parse = IN_CLOSING_TAG if closing else IN_OPENING_TAG
        ans = [(2 if closing else 1, formats['end_tag' if closing else 'tag'])]
        if closing:
            name = name[1:]
        prefix, name = name.partition(':')[0::2]
        state.tag = name or prefix
        if prefix and name:
            ans.append((len(prefix)+1, formats['nsprefix']))
        ans.append((len(name or prefix), formats['tag_name']))
        add_tag_data(state, TagStart(i, prefix, name, closing, True))
        (state.close_tag if closing else state.open_tag)(name or prefix)
        return ans
    if ch == '&':
@ -198,27 +308,18 @@ def opening_tag(cdata_tags, state, text, i, formats):
        m = self_closing_pat.match(text, i)
        if m is None:
            return [(1, formats['/'])]
-        state.parse = state.NORMAL
+        state.parse = NORMAL
        state.tag = State.UNKNOWN_TAG
        l = len(m.group())
        add_tag_data(state, TagEnd(i + l - 1, True, False))
        return [(l, formats['tag'])]
    if ch == '>':
-        state.parse = state.NORMAL
+        state.finish_opening_tag(cdata_tags)
        tag = state.tag.lower()
        if tag in cdata_tags:
            state.parse = state.CDATA
            if tag == 'style':
                state.clear()
                state.parse = state.CSS
        state.bold += int(tag in bold_tags)
        state.italic += int(tag in italic_tags)
        add_tag_data(state, TagEnd(i, False, False))
        return [(1, formats['tag'])]
    m = attribute_name_pat.match(text, i)
    if m is None:
        return [(1, formats['?'])]
-    state.parse = state.ATTRIBUTE_NAME
+    state.parse = ATTRIBUTE_NAME
    prefix, name = m.group().partition(':')[0::2]
    if prefix and name:
        return [(len(prefix) + 1, formats['nsprefix']), (len(name), formats['attr'])]
@ -230,9 +331,9 @@ def attribute_name(state, text, i, formats):
    if ch in space_chars:
        return [(1, None)]
    if ch == '=':
-        state.parse = State.ATTRIBUTE_VALUE
+        state.parse = ATTRIBUTE_VALUE
        return [(1, formats['attr'])]
-    state.parse = State.IN_OPENING_TAG
+    state.parse = IN_OPENING_TAG
    if ch in {'>', '/'}:
        # Standalone attribute with no value
        return [(0, None)]
@ -244,9 +345,9 @@ def attribute_value(state, text, i, formats):
    if ch in space_chars:
        return [(1, None)]
    if ch in {'"', "'"}:
-        state.parse = State.SQ_VAL if ch == "'" else State.DQ_VAL
+        state.parse = SQ_VAL if ch == "'" else DQ_VAL
        return [(1, formats['string'])]
-    state.parse = State.IN_OPENING_TAG
+    state.parse = IN_OPENING_TAG
    m = unquoted_val_pat.match(text, i)
    if m is None:
        return [(1, formats['no-attr-value'])]
@ -254,13 +355,13 @@ def attribute_value(state, text, i, formats):
 def quoted_val(state, text, i, formats):
    ' A quoted attribute value '
-    quote = '"' if state.parse == State.DQ_VAL else "'"
+    quote = '"' if state.parse is DQ_VAL else "'"
    pos = text.find(quote, i)
    if pos == -1:
        num = len(text) - i
    else:
        num = pos - i + 1
-        state.parse = State.IN_OPENING_TAG
+        state.parse = IN_OPENING_TAG
    return [(num, formats['string'])]
 def closing_tag(state, text, i, formats):
@ -271,48 +372,44 @@ def closing_tag(state, text, i, formats):
    pos = text.find('>', i)
    if pos == -1:
        return [(len(text) - i, formats['bad-closing'])]
-    state.parse = state.NORMAL
+    state.parse = NORMAL
    tag = state.tag.lower()
    state.bold -= int(tag in bold_tags)
    state.italic -= int(tag in italic_tags)
    num = pos - i + 1
    ans = [(1, formats['end_tag'])]
    if num > 1:
        ans.insert(0, (num - 1, formats['bad-closing']))
    state.tag = State.UNKNOWN_TAG
    add_tag_data(state, TagEnd(pos, False, False))
    return ans
 def in_comment(state, text, i, formats):
    ' Comment, processing instruction or doctype '
-    end = {state.IN_COMMENT:'-->', state.IN_PI:'?>'}.get(state.parse, '>')
+    end = {IN_COMMENT:'-->', IN_PI:'?>'}.get(state.parse, '>')
    pos = text.find(end, i)
-    fmt = formats['comment' if state.parse == state.IN_COMMENT else 'preproc']
+    fmt = formats['comment' if state.parse is IN_COMMENT else 'preproc']
    if pos == -1:
        num = len(text) - i
    else:
        num = pos - i + len(end)
-        state.parse = state.NORMAL
+        state.parse = NORMAL
    return [(num, fmt)]
 state_map = {
-    State.NORMAL:normal,
+    NORMAL:normal,
-    State.IN_OPENING_TAG: partial(opening_tag, cdata_tags),
+    IN_OPENING_TAG: partial(opening_tag, cdata_tags),
-    State.IN_CLOSING_TAG: closing_tag,
+    IN_CLOSING_TAG: closing_tag,
-    State.ATTRIBUTE_NAME: attribute_name,
+    ATTRIBUTE_NAME: attribute_name,
-    State.ATTRIBUTE_VALUE: attribute_value,
+    ATTRIBUTE_VALUE: attribute_value,
-    State.CDATA: cdata,
+    CDATA: cdata,
-    State.CSS: css,
+    CSS: css,
 }
-for x in (State.IN_COMMENT, State.IN_PI, State.IN_DOCTYPE):
+for x in (IN_COMMENT, IN_PI, IN_DOCTYPE):
    state_map[x] = in_comment
-for x in (State.SQ_VAL, State.DQ_VAL):
+for x in (SQ_VAL, DQ_VAL):
    state_map[x] = quoted_val
 xml_state_map = state_map.copy()
-xml_state_map[State.IN_OPENING_TAG] = partial(opening_tag, set())
+xml_state_map[IN_OPENING_TAG] = partial(opening_tag, set())
 def create_formats(highlighter):
    t = highlighter.theme
@ -349,18 +446,19 @@ def create_formats(highlighter):
 class HTMLHighlighter(SyntaxHighlighter):
    state_map = state_map
    state_class = State
    create_formats_func = create_formats
    def create_formats(self):
        super(HTMLHighlighter, self).create_formats()
-        self.css_formats = create_css_formats(self)
+        self.default_state = State()
-        self.state_class = self.create_state
+        self.default_state.css_formats = create_css_formats(self)
        self.default_state.stack = Stack()
    def create_state(self, val):
-        ans = State(val)
+        if val < 0:
-        ans.css_formats = self.css_formats
+            return self.default_state.copy()
-        return ans
+        ans = self.default_state.stack.state_for(val) or self.default_state
        return ans.copy()
 class XMLHighlighter(HTMLHighlighter):