Refactor HTML syntax highlighter to keep track of tag nesting

This is needed for the eventual implementation of inline spellcheck
2025-07-09 03:04:10 -04:00 · 2014-02-19 15:12:53 +05:30 · 2014-02-19 15:12:53 +05:30 · a1a4585167
commit a1a4585167
parent 5481f1f820
3 changed files with 214 additions and 114 deletions
--- a/src/calibre/gui2/tweak_book/editor/syntax/base.py
+++ b/src/calibre/gui2/tweak_book/editor/syntax/base.py
@ -29,13 +29,15 @@ def run_loop(state, state_map, formats, text):

 class SyntaxHighlighter(QSyntaxHighlighter):

-    state_class = SimpleState
    state_map = {0:lambda state, text, i, formats:[(len(text), None)]}
    create_formats_func = lambda highlighter: {}

    def __init__(self, *args, **kwargs):
        QSyntaxHighlighter.__init__(self, *args, **kwargs)

+    def create_state(self, num):
+        return SimpleState(max(0, num))
+
    def rehighlight(self):
        self.outlineexplorer_data = {}
        QApplication.setOverrideCursor(QCursor(Qt.WaitCursor))
@ -54,9 +56,7 @@ class SyntaxHighlighter(QSyntaxHighlighter):
        try:
            state = self.previousBlockState()
            self.setCurrentBlockUserData(None)  # Ensure that any stale user data is discarded
-            if state == -1:
-                state = 0
-            state = self.state_class(state)
+            state = self.create_state(state)
            state.get_user_data, state.set_user_data = self.currentBlockUserData, self.setCurrentBlockUserData
            for i, num, fmt in run_loop(state, self.state_map, self.formats, unicode(text)):
                if fmt is not None:
--- a/src/calibre/gui2/tweak_book/editor/syntax/css.py
+++ b/src/calibre/gui2/tweak_book/editor/syntax/css.py
@ -251,9 +251,11 @@ def create_formats(highlighter):
 class CSSHighlighter(SyntaxHighlighter):

    state_map = state_map
-    state_class = State
    create_formats_func = create_formats

+    def create_state(self, num):
+        return State(max(0, num))
+
 if __name__ == '__main__':
    from calibre.gui2.tweak_book.editor.widget import launch_editor
    launch_editor('''\
--- a/src/calibre/gui2/tweak_book/editor/syntax/html.py
+++ b/src/calibre/gui2/tweak_book/editor/syntax/html.py
@ -31,55 +31,170 @@ unquoted_val_pat = re.compile(r'''[^%s'"=<>`]+''' % space_chars)
 cdata_close_pats = {x:re.compile(r'</%s' % x, flags=re.I) for x in cdata_tags}
 nbsp_pat = re.compile('[\xa0\u2000-\u200A\u202F\u205F\u3000\u2011-\u2015\uFE58\uFE63\uFF0D]+')  # special spaces and hyphens

-class State(object):
-
-    ''' Store the parsing state, a stack of bold and italic formatting and the
-    last seen open tag, all in a single integer, so that it can be used with.
-    This assumes an int is at least 32 bits.'''
-
-    NORMAL = 0
-    IN_OPENING_TAG = 1
-    IN_CLOSING_TAG = 2
-    IN_COMMENT = 3
-    IN_PI = 4
-    IN_DOCTYPE = 5
-    ATTRIBUTE_NAME = 6
-    ATTRIBUTE_VALUE = 7
-    SQ_VAL = 8
-    DQ_VAL = 9
-    CDATA = 10
-    CSS = 11
-
-    TAGS = {x:i+1 for i, x in enumerate(cdata_tags | bold_tags | italic_tags)}
-    TAGS_RMAP = {v:k for k, v in TAGS.iteritems()}
-    UNKNOWN_TAG = '___'
-
-    def __init__(self, num):
-        self.parse  = num & 0b1111
-        self.bold   = (num >> 4) & 0b11111111
-        self.italic = (num >> 12) & 0b11111111
-        self.tag    = self.TAGS_RMAP.get(num >> 20, self.UNKNOWN_TAG)
-        self.css    = 0
-        if self.parse == State.CSS:
-            self.css = num >> 4
-
-    @property
-    def value(self):
-        if self.parse == State.CSS:
-            return ((self.parse & 0b1111) | (self.css << 4))
-        tag = self.TAGS.get(self.tag.lower(), 0)
-        return ((self.parse & 0b1111) |
-                ((max(0, self.bold) & 0b11111111) << 4) |
-                ((max(0, self.italic) & 0b11111111) << 12) |
-                (tag << 20))
-
-    def clear(self):
-        self.parse = self.bold = self.italic = self.css = 0
-        self.tag = self.UNKNOWN_TAG
+NORMAL = 0
+IN_OPENING_TAG = 1
+IN_CLOSING_TAG = 2
+IN_COMMENT = 3
+IN_PI = 4
+IN_DOCTYPE = 5
+ATTRIBUTE_NAME = 6
+ATTRIBUTE_VALUE = 7
+SQ_VAL = 8
+DQ_VAL = 9
+CDATA = 10
+CSS = 11

 TagStart = namedtuple('TagStart', 'offset prefix name closing is_start')
 TagEnd = namedtuple('TagEnd', 'offset self_closing is_start')

+class Tag(object):
+
+    __slots__ = ('name', 'bold', 'italic', 'lang', 'hash')
+
+    def __init__(self, name, bold=None, italic=None):
+        self.name = name
+        self.bold = name in bold_tags if bold is None else bold
+        self.italic = name in italic_tags if italic is None else italic
+        self.lang = None
+        self.hash = 0
+
+    def __hash__(self):
+        return self.hash
+
+    def __eq__(self, other):
+        return self.name == getattr(other, 'name', None) and self.lang == getattr(other, 'lang', False)
+
+    def copy(self):
+        ans = Tag(self.name, self.bold, self.italic)
+        ans.lang, ans.hash = self.lang, self.hash
+        return ans
+
+    def update_hash(self):
+        self.hash = hash((self.name, self.lang))
+
+class State(object):
+
+    __slots__ = ('tag_being_defined', 'tags', 'is_bold', 'is_italic',
+                 'current_lang', 'parse', 'get_user_data', 'set_user_data',
+                 'css_formats', 'stack', 'sub_parser_state', 'default_lang')
+
+    def __init__(self):
+        self.tags = []
+        self.is_bold = self.is_italic = False
+        self.tag_being_defined = self.current_lang = self.get_user_data = self.set_user_data = \
+            self.css_formats = self.stack = self.sub_parser_state = self.default_lang = None
+        self.parse = NORMAL
+
+    def copy(self):
+        ans = State()
+        for x in self.__slots__:
+            setattr(ans, x, getattr(self, x))
+        self.tags = [x.copy() for x in self.tags]
+        if self.tag_being_defined is not None:
+            self.tag_being_defined = self.tag_being_defined.copy()
+        return ans
+
+    @property
+    def value(self):
+        if self.tag_being_defined is not None:
+            self.tag_being_defined.update_hash()
+        return self.stack.index_for(self)
+
+    def __hash__(self):
+        return hash((self.parse, self.sub_parser_state, self.tag_being_defined, tuple(self.tags)))
+
+    def __eq__(self, other):
+        return (
+            self.parse == getattr(other, 'parse', -1) and
+            self.sub_parser_state == getattr(other, 'sub_parser_state', -1) and
+            self.tag_being_defined == getattr(other, 'tag_being_defined', False) and
+            self.tags == getattr(other, 'tags', None)
+        )
+
+    def open_tag(self, name):
+        self.tag_being_defined = Tag(name)
+
+    def close_tag(self, name):
+        removed_tags = []
+        for tag in reversed(self.tags):
+            removed_tags.append(tag)
+            if tag.name == name:
+                break
+        else:
+            return  # No matching open tag found, ignore the closing tag
+        # Remove all tags up to the matching open tag
+        self.tags = self.tags[:-len(removed_tags)]
+        self.sub_parser_state = 0
+        # Check if we should still be bold or italic
+        if self.is_bold:
+            self.is_bold = False
+            for tag in reversed(self.tags):
+                if tag.bold:
+                    self.is_bold = True
+                    break
+        if self.is_italic:
+            self.is_italic = False
+            for tag in reversed(self.tags):
+                if tag.italic:
+                    self.is_italic = True
+                    break
+        # Set the current language to the first lang attribute in a still open tag
+        self.current_lang = None
+        for tag in reversed(self.tags):
+            if tag.lang is not None:
+                self.current_lang = tag.lang
+                break
+
+    def finish_opening_tag(self, cdata_tags):
+        self.parse = NORMAL
+        if self.tag_being_defined is None:
+            return
+        t, self.tag_being_defined = self.tag_being_defined, None
+        t.update_hash()
+        self.tags.append(t)
+        self.is_bold = self.is_bold or t.bold
+        self.is_italic = self.is_italic or t.italic
+        self.current_lang = t.lang or self.current_lang
+        if t.name in cdata_tags:
+            self.parse = CSS if t.name == 'style' else CDATA
+            self.sub_parser_state = 0
+
+    def __repr__(self):
+        return '<State %s is_bold=%s is_italic=%s current_lang=%s>' % (
+            '->'.join(x.name for x in self.tags), self.is_bold, self.is_italic, self.current_lang)
+    __str__ = __repr__
+
+class Stack(object):
+
+    ''' Maintain an efficient bi-directional mapping between states and index
+    numbers. Ensures that if state1 == state2 then their corresponding index
+    numbers are the same and vice versa. This is needed so that the state number
+    passed to Qt does not change unless the underlying state has actually
+    changed. '''
+
+    def __init__(self):
+        self.index_map = []
+        self.state_map = {}
+
+    def index_for(self, state):
+        ans = self.state_map.get(state, None)
+        if ans is None:
+            self.state_map[state] = ans = len(self.index_map)
+            self.index_map.append(state)
+        return ans
+
+    def state_for(self, index):
+        try:
+            return self.index_map[index]
+        except IndexError:
+            return None
+
+class HTMLUserData(QTextBlockUserData):
+
+    def __init__(self):
+        QTextBlockUserData.__init__(self)
+        self.tags = []
+
 def add_tag_data(state, tag):
    ud = q = state.get_user_data()
    if ud is None:
@ -97,37 +212,38 @@ def css(state, text, i, formats):
    else:
        css_text = text[i:m.start()]
    ans = []
-    css_state = CSSState(state.css)
+    css_state = CSSState(state.sub_parser_state)
    for j, num, fmt in run_loop(css_state, css_state_map, state.css_formats, css_text):
        ans.append((num, fmt))
-    state.css = css_state.value
+    state.sub_parser_state = css_state.value
    if m is not None:
-        state.clear()
-        state.parse = State.IN_CLOSING_TAG
+        state.sub_parser_state = 0
+        state.parse = IN_CLOSING_TAG
        add_tag_data(state, TagStart(m.start(), 'style', '', True, True))
        ans.extend([(2, formats['end_tag']), (len(m.group()) - 2, formats['tag_name'])])
    return ans

 def cdata(state, text, i, formats):
    'CDATA inside tags like <title> or <style>'
-    pat = cdata_close_pats[state.tag]
+    name = state.tags[-1].name
+    pat = cdata_close_pats[name]
    m = pat.search(text, i)
-    fmt = formats['title' if state.tag == 'title' else 'special']
+    fmt = formats['title' if name == 'title' else 'special']
    if m is None:
        return [(len(text) - i, fmt)]
-    state.parse = State.IN_CLOSING_TAG
+    state.parse = IN_CLOSING_TAG
    num = m.start() - i
-    add_tag_data(state, TagStart(m.start(), state.tag, '', True, True))
+    add_tag_data(state, TagStart(m.start(), name, '', True, True))
    return [(num, fmt), (2, formats['end_tag']), (len(m.group()) - 2, formats['tag_name'])]

 def mark_nbsp(state, text, nbsp_format):
    ans = []
    fmt = None
-    if state.bold or state.italic:
+    if state.is_bold or state.is_italic:
        fmt = SyntaxTextCharFormat()
-        if state.bold:
+        if state.is_bold:
            fmt.setFontWeight(QFont.Bold)
-        if state.italic:
+        if state.is_italic:
            fmt.setFontItalic(True)
    last = 0
    for m in nbsp_pat.finditer(text):
@ -137,26 +253,20 @@ def mark_nbsp(state, text, nbsp_format):
        ans = [(len(text), fmt)]
    return ans

-class HTMLUserData(QTextBlockUserData):
-
-    def __init__(self):
-        QTextBlockUserData.__init__(self)
-        self.tags = []
-
 def normal(state, text, i, formats):
    ' The normal state in between tags '
    ch = text[i]
    if ch == '<':
        if text[i:i+4] == '<!--':
-            state.parse, fmt = state.IN_COMMENT, formats['comment']
+            state.parse, fmt = IN_COMMENT, formats['comment']
            return [(4, fmt)]

        if text[i:i+2] == '<?':
-            state.parse, fmt = state.IN_PI, formats['preproc']
+            state.parse, fmt = IN_PI, formats['preproc']
            return [(2, fmt)]

        if text[i:i+2] == '<!' and text[i+2:].lstrip().lower().startswith('doctype'):
-            state.parse, fmt = state.IN_DOCTYPE, formats['preproc']
+            state.parse, fmt = IN_DOCTYPE, formats['preproc']
            return [(2, fmt)]

        m = tag_name_pat.match(text, i + 1)
@ -165,16 +275,16 @@ def normal(state, text, i, formats):

        name = m.group()
        closing = name.startswith('/')
-        state.parse = state.IN_CLOSING_TAG if closing else state.IN_OPENING_TAG
+        state.parse = IN_CLOSING_TAG if closing else IN_OPENING_TAG
        ans = [(2 if closing else 1, formats['end_tag' if closing else 'tag'])]
        if closing:
            name = name[1:]
        prefix, name = name.partition(':')[0::2]
-        state.tag = name or prefix
        if prefix and name:
            ans.append((len(prefix)+1, formats['nsprefix']))
        ans.append((len(name or prefix), formats['tag_name']))
        add_tag_data(state, TagStart(i, prefix, name, closing, True))
+        (state.close_tag if closing else state.open_tag)(name or prefix)
        return ans

    if ch == '&':
@ -198,27 +308,18 @@ def opening_tag(cdata_tags, state, text, i, formats):
        m = self_closing_pat.match(text, i)
        if m is None:
            return [(1, formats['/'])]
-        state.parse = state.NORMAL
-        state.tag = State.UNKNOWN_TAG
+        state.parse = NORMAL
        l = len(m.group())
        add_tag_data(state, TagEnd(i + l - 1, True, False))
        return [(l, formats['tag'])]
    if ch == '>':
-        state.parse = state.NORMAL
-        tag = state.tag.lower()
-        if tag in cdata_tags:
-            state.parse = state.CDATA
-            if tag == 'style':
-                state.clear()
-                state.parse = state.CSS
-        state.bold += int(tag in bold_tags)
-        state.italic += int(tag in italic_tags)
+        state.finish_opening_tag(cdata_tags)
        add_tag_data(state, TagEnd(i, False, False))
        return [(1, formats['tag'])]
    m = attribute_name_pat.match(text, i)
    if m is None:
        return [(1, formats['?'])]
-    state.parse = state.ATTRIBUTE_NAME
+    state.parse = ATTRIBUTE_NAME
    prefix, name = m.group().partition(':')[0::2]
    if prefix and name:
        return [(len(prefix) + 1, formats['nsprefix']), (len(name), formats['attr'])]
@ -230,9 +331,9 @@ def attribute_name(state, text, i, formats):
    if ch in space_chars:
        return [(1, None)]
    if ch == '=':
-        state.parse = State.ATTRIBUTE_VALUE
+        state.parse = ATTRIBUTE_VALUE
        return [(1, formats['attr'])]
-    state.parse = State.IN_OPENING_TAG
+    state.parse = IN_OPENING_TAG
    if ch in {'>', '/'}:
        # Standalone attribute with no value
        return [(0, None)]
@ -244,9 +345,9 @@ def attribute_value(state, text, i, formats):
    if ch in space_chars:
        return [(1, None)]
    if ch in {'"', "'"}:
-        state.parse = State.SQ_VAL if ch == "'" else State.DQ_VAL
+        state.parse = SQ_VAL if ch == "'" else DQ_VAL
        return [(1, formats['string'])]
-    state.parse = State.IN_OPENING_TAG
+    state.parse = IN_OPENING_TAG
    m = unquoted_val_pat.match(text, i)
    if m is None:
        return [(1, formats['no-attr-value'])]
@ -254,13 +355,13 @@ def attribute_value(state, text, i, formats):

 def quoted_val(state, text, i, formats):
    ' A quoted attribute value '
-    quote = '"' if state.parse == State.DQ_VAL else "'"
+    quote = '"' if state.parse is DQ_VAL else "'"
    pos = text.find(quote, i)
    if pos == -1:
        num = len(text) - i
    else:
        num = pos - i + 1
-        state.parse = State.IN_OPENING_TAG
+        state.parse = IN_OPENING_TAG
    return [(num, formats['string'])]

 def closing_tag(state, text, i, formats):
@ -271,48 +372,44 @@ def closing_tag(state, text, i, formats):
    pos = text.find('>', i)
    if pos == -1:
        return [(len(text) - i, formats['bad-closing'])]
-    state.parse = state.NORMAL
-    tag = state.tag.lower()
-    state.bold -= int(tag in bold_tags)
-    state.italic -= int(tag in italic_tags)
+    state.parse = NORMAL
    num = pos - i + 1
    ans = [(1, formats['end_tag'])]
    if num > 1:
        ans.insert(0, (num - 1, formats['bad-closing']))
-    state.tag = State.UNKNOWN_TAG
    add_tag_data(state, TagEnd(pos, False, False))
    return ans

 def in_comment(state, text, i, formats):
    ' Comment, processing instruction or doctype '
-    end = {state.IN_COMMENT:'-->', state.IN_PI:'?>'}.get(state.parse, '>')
+    end = {IN_COMMENT:'-->', IN_PI:'?>'}.get(state.parse, '>')
    pos = text.find(end, i)
-    fmt = formats['comment' if state.parse == state.IN_COMMENT else 'preproc']
+    fmt = formats['comment' if state.parse is IN_COMMENT else 'preproc']
    if pos == -1:
        num = len(text) - i
    else:
        num = pos - i + len(end)
-        state.parse = state.NORMAL
+        state.parse = NORMAL
    return [(num, fmt)]

 state_map = {
-    State.NORMAL:normal,
-    State.IN_OPENING_TAG: partial(opening_tag, cdata_tags),
-    State.IN_CLOSING_TAG: closing_tag,
-    State.ATTRIBUTE_NAME: attribute_name,
-    State.ATTRIBUTE_VALUE: attribute_value,
-    State.CDATA: cdata,
-    State.CSS: css,
+    NORMAL:normal,
+    IN_OPENING_TAG: partial(opening_tag, cdata_tags),
+    IN_CLOSING_TAG: closing_tag,
+    ATTRIBUTE_NAME: attribute_name,
+    ATTRIBUTE_VALUE: attribute_value,
+    CDATA: cdata,
+    CSS: css,
 }

-for x in (State.IN_COMMENT, State.IN_PI, State.IN_DOCTYPE):
+for x in (IN_COMMENT, IN_PI, IN_DOCTYPE):
    state_map[x] = in_comment

-for x in (State.SQ_VAL, State.DQ_VAL):
+for x in (SQ_VAL, DQ_VAL):
    state_map[x] = quoted_val

 xml_state_map = state_map.copy()
-xml_state_map[State.IN_OPENING_TAG] = partial(opening_tag, set())
+xml_state_map[IN_OPENING_TAG] = partial(opening_tag, set())

 def create_formats(highlighter):
    t = highlighter.theme
@ -349,18 +446,19 @@ def create_formats(highlighter):
 class HTMLHighlighter(SyntaxHighlighter):

    state_map = state_map
-    state_class = State
    create_formats_func = create_formats

    def create_formats(self):
        super(HTMLHighlighter, self).create_formats()
-        self.css_formats = create_css_formats(self)
-        self.state_class = self.create_state
+        self.default_state = State()
+        self.default_state.css_formats = create_css_formats(self)
+        self.default_state.stack = Stack()

    def create_state(self, val):
-        ans = State(val)
-        ans.css_formats = self.css_formats
-        return ans
+        if val < 0:
+            return self.default_state.copy()
+        ans = self.default_state.stack.state_for(val) or self.default_state
+        return ans.copy()

 class XMLHighlighter(HTMLHighlighter):