From a1a45851677273a85b76cc090f52622ad7f395b2 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 19 Feb 2014 15:12:53 +0530 Subject: [PATCH] Refactor HTML syntax highlighter to keep track of tag nesting This is needed for the eventual implementation of inline spellcheck --- .../gui2/tweak_book/editor/syntax/base.py | 8 +- .../gui2/tweak_book/editor/syntax/css.py | 4 +- .../gui2/tweak_book/editor/syntax/html.py | 316 ++++++++++++------ 3 files changed, 214 insertions(+), 114 deletions(-) diff --git a/src/calibre/gui2/tweak_book/editor/syntax/base.py b/src/calibre/gui2/tweak_book/editor/syntax/base.py index da01e1db20..a25cdf591a 100644 --- a/src/calibre/gui2/tweak_book/editor/syntax/base.py +++ b/src/calibre/gui2/tweak_book/editor/syntax/base.py @@ -29,13 +29,15 @@ def run_loop(state, state_map, formats, text): class SyntaxHighlighter(QSyntaxHighlighter): - state_class = SimpleState state_map = {0:lambda state, text, i, formats:[(len(text), None)]} create_formats_func = lambda highlighter: {} def __init__(self, *args, **kwargs): QSyntaxHighlighter.__init__(self, *args, **kwargs) + def create_state(self, num): + return SimpleState(max(0, num)) + def rehighlight(self): self.outlineexplorer_data = {} QApplication.setOverrideCursor(QCursor(Qt.WaitCursor)) @@ -54,9 +56,7 @@ class SyntaxHighlighter(QSyntaxHighlighter): try: state = self.previousBlockState() self.setCurrentBlockUserData(None) # Ensure that any stale user data is discarded - if state == -1: - state = 0 - state = self.state_class(state) + state = self.create_state(state) state.get_user_data, state.set_user_data = self.currentBlockUserData, self.setCurrentBlockUserData for i, num, fmt in run_loop(state, self.state_map, self.formats, unicode(text)): if fmt is not None: diff --git a/src/calibre/gui2/tweak_book/editor/syntax/css.py b/src/calibre/gui2/tweak_book/editor/syntax/css.py index b3cca45824..d5d82e19c6 100644 --- a/src/calibre/gui2/tweak_book/editor/syntax/css.py +++ 
b/src/calibre/gui2/tweak_book/editor/syntax/css.py @@ -251,9 +251,11 @@ def create_formats(highlighter): class CSSHighlighter(SyntaxHighlighter): state_map = state_map - state_class = State create_formats_func = create_formats + def create_state(self, num): + return State(max(0, num)) + if __name__ == '__main__': from calibre.gui2.tweak_book.editor.widget import launch_editor launch_editor('''\ diff --git a/src/calibre/gui2/tweak_book/editor/syntax/html.py b/src/calibre/gui2/tweak_book/editor/syntax/html.py index 5890481323..a85949d289 100644 --- a/src/calibre/gui2/tweak_book/editor/syntax/html.py +++ b/src/calibre/gui2/tweak_book/editor/syntax/html.py @@ -31,55 +31,170 @@ unquoted_val_pat = re.compile(r'''[^%s'"=<>`]+''' % space_chars) cdata_close_pats = {x:re.compile(r'> 4) & 0b11111111 - self.italic = (num >> 12) & 0b11111111 - self.tag = self.TAGS_RMAP.get(num >> 20, self.UNKNOWN_TAG) - self.css = 0 - if self.parse == State.CSS: - self.css = num >> 4 - - @property - def value(self): - if self.parse == State.CSS: - return ((self.parse & 0b1111) | (self.css << 4)) - tag = self.TAGS.get(self.tag.lower(), 0) - return ((self.parse & 0b1111) | - ((max(0, self.bold) & 0b11111111) << 4) | - ((max(0, self.italic) & 0b11111111) << 12) | - (tag << 20)) - - def clear(self): - self.parse = self.bold = self.italic = self.css = 0 - self.tag = self.UNKNOWN_TAG +NORMAL = 0 +IN_OPENING_TAG = 1 +IN_CLOSING_TAG = 2 +IN_COMMENT = 3 +IN_PI = 4 +IN_DOCTYPE = 5 +ATTRIBUTE_NAME = 6 +ATTRIBUTE_VALUE = 7 +SQ_VAL = 8 +DQ_VAL = 9 +CDATA = 10 +CSS = 11 TagStart = namedtuple('TagStart', 'offset prefix name closing is_start') TagEnd = namedtuple('TagEnd', 'offset self_closing is_start') +class Tag(object): + + __slots__ = ('name', 'bold', 'italic', 'lang', 'hash') + + def __init__(self, name, bold=None, italic=None): + self.name = name + self.bold = name in bold_tags if bold is None else bold + self.italic = name in italic_tags if italic is None else italic + self.lang = None + self.hash = 0 
+ + def __hash__(self): + return self.hash + + def __eq__(self, other): + return self.name == getattr(other, 'name', None) and self.lang == getattr(other, 'lang', False) + + def copy(self): + ans = Tag(self.name, self.bold, self.italic) + ans.lang, ans.hash = self.lang, self.hash + return ans + + def update_hash(self): + self.hash = hash((self.name, self.lang)) + +class State(object): + + __slots__ = ('tag_being_defined', 'tags', 'is_bold', 'is_italic', + 'current_lang', 'parse', 'get_user_data', 'set_user_data', + 'css_formats', 'stack', 'sub_parser_state', 'default_lang') + + def __init__(self): + self.tags = [] + self.is_bold = self.is_italic = False + self.tag_being_defined = self.current_lang = self.get_user_data = self.set_user_data = \ + self.css_formats = self.stack = self.sub_parser_state = self.default_lang = None + self.parse = NORMAL + + def copy(self): + ans = State() + for x in self.__slots__: + setattr(ans, x, getattr(self, x)) + self.tags = [x.copy() for x in self.tags] + if self.tag_being_defined is not None: + self.tag_being_defined = self.tag_being_defined.copy() + return ans + + @property + def value(self): + if self.tag_being_defined is not None: + self.tag_being_defined.update_hash() + return self.stack.index_for(self) + + def __hash__(self): + return hash((self.parse, self.sub_parser_state, self.tag_being_defined, tuple(self.tags))) + + def __eq__(self, other): + return ( + self.parse == getattr(other, 'parse', -1) and + self.sub_parser_state == getattr(other, 'sub_parser_state', -1) and + self.tag_being_defined == getattr(other, 'tag_being_defined', False) and + self.tags == getattr(other, 'tags', None) + ) + + def open_tag(self, name): + self.tag_being_defined = Tag(name) + + def close_tag(self, name): + removed_tags = [] + for tag in reversed(self.tags): + removed_tags.append(tag) + if tag.name == name: + break + else: + return # No matching open tag found, ignore the closing tag + # Remove all tags upto the matching open tag + self.tags = 
self.tags[:-len(removed_tags)] + self.sub_parser_state = 0 + # Check if we should still be bold or italic + if self.is_bold: + self.is_bold = False + for tag in reversed(self.tags): + if tag.bold: + self.is_bold = True + break + if self.is_italic: + self.is_italic = False + for tag in reversed(self.tags): + if tag.italic: + self.is_italic = True + break + # Set the current language to the first lang attribute in a still open tag + self.current_lang = None + for tag in reversed(self.tags): + if tag.lang is not None: + self.current_lang = tag.lang + break + + def finish_opening_tag(self, cdata_tags): + self.parse = NORMAL + if self.tag_being_defined is None: + return + t, self.tag_being_defined = self.tag_being_defined, None + t.update_hash() + self.tags.append(t) + self.is_bold = self.is_bold or t.bold + self.is_italic = self.is_italic or t.italic + self.current_lang = t.lang or self.current_lang + if t.name in cdata_tags: + self.parse = CSS if t.name == 'style' else CDATA + self.sub_parser_state = 0 + + def __repr__(self): + return '' % ( + '->'.join(x.name for x in self.tags), self.is_bold, self.is_italic, self.current_lang) + __str__ = __repr__ + +class Stack(object): + + ''' Maintain an efficient bi-directional mapping between states and index + numbers. Ensures that if state1 == state2 then their corresponding index + numbers are the same and vice versa. This is need so that the state number + passed to Qt does not change unless the underlying state has actually + changed. 
''' + + def __init__(self): + self.index_map = [] + self.state_map = {} + + def index_for(self, state): + ans = self.state_map.get(state, None) + if ans is None: + self.state_map[state] = ans = len(self.index_map) + self.index_map.append(state) + return ans + + def state_for(self, index): + try: + return self.index_map[index] + except IndexError: + return None + +class HTMLUserData(QTextBlockUserData): + + def __init__(self): + QTextBlockUserData.__init__(self) + self.tags = [] + def add_tag_data(state, tag): ud = q = state.get_user_data() if ud is None: @@ -97,37 +212,38 @@ def css(state, text, i, formats): else: css_text = text[i:m.start()] ans = [] - css_state = CSSState(state.css) + css_state = CSSState(state.sub_parser_state) for j, num, fmt in run_loop(css_state, css_state_map, state.css_formats, css_text): ans.append((num, fmt)) - state.css = css_state.value + state.sub_parser_state = css_state.value if m is not None: - state.clear() - state.parse = State.IN_CLOSING_TAG + state.sub_parser_state = 0 + state.parse = IN_CLOSING_TAG add_tag_data(state, TagStart(m.start(), 'style', '', True, True)) ans.extend([(2, formats['end_tag']), (len(m.group()) - 2, formats['tag_name'])]) return ans def cdata(state, text, i, formats): 'CDATA inside tags like or <style>' - pat = cdata_close_pats[state.tag] + name = state.tags[-1].name + pat = cdata_close_pats[name] m = pat.search(text, i) - fmt = formats['title' if state.tag == 'title' else 'special'] + fmt = formats['title' if name == 'title' else 'special'] if m is None: return [(len(text) - i, fmt)] - state.parse = State.IN_CLOSING_TAG + state.parse = IN_CLOSING_TAG num = m.start() - i - add_tag_data(state, TagStart(m.start(), state.tag, '', True, True)) + add_tag_data(state, TagStart(m.start(), name, '', True, True)) return [(num, fmt), (2, formats['end_tag']), (len(m.group()) - 2, formats['tag_name'])] def mark_nbsp(state, text, nbsp_format): ans = [] fmt = None - if state.bold or state.italic: + if state.is_bold or 
state.is_italic: fmt = SyntaxTextCharFormat() - if state.bold: + if state.is_bold: fmt.setFontWeight(QFont.Bold) - if state.italic: + if state.is_italic: fmt.setFontItalic(True) last = 0 for m in nbsp_pat.finditer(text): @@ -137,26 +253,20 @@ def mark_nbsp(state, text, nbsp_format): ans = [(len(text), fmt)] return ans -class HTMLUserData(QTextBlockUserData): - - def __init__(self): - QTextBlockUserData.__init__(self) - self.tags = [] - def normal(state, text, i, formats): ' The normal state in between tags ' ch = text[i] if ch == '<': if text[i:i+4] == '<!--': - state.parse, fmt = state.IN_COMMENT, formats['comment'] + state.parse, fmt = IN_COMMENT, formats['comment'] return [(4, fmt)] if text[i:i+2] == '<?': - state.parse, fmt = state.IN_PI, formats['preproc'] + state.parse, fmt = IN_PI, formats['preproc'] return [(2, fmt)] if text[i:i+2] == '<!' and text[i+2:].lstrip().lower().startswith('doctype'): - state.parse, fmt = state.IN_DOCTYPE, formats['preproc'] + state.parse, fmt = IN_DOCTYPE, formats['preproc'] return [(2, fmt)] m = tag_name_pat.match(text, i + 1) @@ -165,16 +275,16 @@ def normal(state, text, i, formats): name = m.group() closing = name.startswith('/') - state.parse = state.IN_CLOSING_TAG if closing else state.IN_OPENING_TAG + state.parse = IN_CLOSING_TAG if closing else IN_OPENING_TAG ans = [(2 if closing else 1, formats['end_tag' if closing else 'tag'])] if closing: name = name[1:] prefix, name = name.partition(':')[0::2] - state.tag = name or prefix if prefix and name: ans.append((len(prefix)+1, formats['nsprefix'])) ans.append((len(name or prefix), formats['tag_name'])) add_tag_data(state, TagStart(i, prefix, name, closing, True)) + (state.close_tag if closing else state.open_tag)(name or prefix) return ans if ch == '&': @@ -198,27 +308,18 @@ def opening_tag(cdata_tags, state, text, i, formats): m = self_closing_pat.match(text, i) if m is None: return [(1, formats['/'])] - state.parse = state.NORMAL - state.tag = State.UNKNOWN_TAG + state.parse = 
NORMAL l = len(m.group()) add_tag_data(state, TagEnd(i + l - 1, True, False)) return [(l, formats['tag'])] if ch == '>': - state.parse = state.NORMAL - tag = state.tag.lower() - if tag in cdata_tags: - state.parse = state.CDATA - if tag == 'style': - state.clear() - state.parse = state.CSS - state.bold += int(tag in bold_tags) - state.italic += int(tag in italic_tags) + state.finish_opening_tag(cdata_tags) add_tag_data(state, TagEnd(i, False, False)) return [(1, formats['tag'])] m = attribute_name_pat.match(text, i) if m is None: return [(1, formats['?'])] - state.parse = state.ATTRIBUTE_NAME + state.parse = ATTRIBUTE_NAME prefix, name = m.group().partition(':')[0::2] if prefix and name: return [(len(prefix) + 1, formats['nsprefix']), (len(name), formats['attr'])] @@ -230,9 +331,9 @@ def attribute_name(state, text, i, formats): if ch in space_chars: return [(1, None)] if ch == '=': - state.parse = State.ATTRIBUTE_VALUE + state.parse = ATTRIBUTE_VALUE return [(1, formats['attr'])] - state.parse = State.IN_OPENING_TAG + state.parse = IN_OPENING_TAG if ch in {'>', '/'}: # Standalone attribute with no value return [(0, None)] @@ -244,9 +345,9 @@ def attribute_value(state, text, i, formats): if ch in space_chars: return [(1, None)] if ch in {'"', "'"}: - state.parse = State.SQ_VAL if ch == "'" else State.DQ_VAL + state.parse = SQ_VAL if ch == "'" else DQ_VAL return [(1, formats['string'])] - state.parse = State.IN_OPENING_TAG + state.parse = IN_OPENING_TAG m = unquoted_val_pat.match(text, i) if m is None: return [(1, formats['no-attr-value'])] @@ -254,13 +355,13 @@ def attribute_value(state, text, i, formats): def quoted_val(state, text, i, formats): ' A quoted attribute value ' - quote = '"' if state.parse == State.DQ_VAL else "'" + quote = '"' if state.parse is DQ_VAL else "'" pos = text.find(quote, i) if pos == -1: num = len(text) - i else: num = pos - i + 1 - state.parse = State.IN_OPENING_TAG + state.parse = IN_OPENING_TAG return [(num, formats['string'])] def 
closing_tag(state, text, i, formats): @@ -271,48 +372,44 @@ def closing_tag(state, text, i, formats): pos = text.find('>', i) if pos == -1: return [(len(text) - i, formats['bad-closing'])] - state.parse = state.NORMAL - tag = state.tag.lower() - state.bold -= int(tag in bold_tags) - state.italic -= int(tag in italic_tags) + state.parse = NORMAL num = pos - i + 1 ans = [(1, formats['end_tag'])] if num > 1: ans.insert(0, (num - 1, formats['bad-closing'])) - state.tag = State.UNKNOWN_TAG add_tag_data(state, TagEnd(pos, False, False)) return ans def in_comment(state, text, i, formats): ' Comment, processing instruction or doctype ' - end = {state.IN_COMMENT:'-->', state.IN_PI:'?>'}.get(state.parse, '>') + end = {IN_COMMENT:'-->', IN_PI:'?>'}.get(state.parse, '>') pos = text.find(end, i) - fmt = formats['comment' if state.parse == state.IN_COMMENT else 'preproc'] + fmt = formats['comment' if state.parse is IN_COMMENT else 'preproc'] if pos == -1: num = len(text) - i else: num = pos - i + len(end) - state.parse = state.NORMAL + state.parse = NORMAL return [(num, fmt)] state_map = { - State.NORMAL:normal, - State.IN_OPENING_TAG: partial(opening_tag, cdata_tags), - State.IN_CLOSING_TAG: closing_tag, - State.ATTRIBUTE_NAME: attribute_name, - State.ATTRIBUTE_VALUE: attribute_value, - State.CDATA: cdata, - State.CSS: css, + NORMAL:normal, + IN_OPENING_TAG: partial(opening_tag, cdata_tags), + IN_CLOSING_TAG: closing_tag, + ATTRIBUTE_NAME: attribute_name, + ATTRIBUTE_VALUE: attribute_value, + CDATA: cdata, + CSS: css, } -for x in (State.IN_COMMENT, State.IN_PI, State.IN_DOCTYPE): +for x in (IN_COMMENT, IN_PI, IN_DOCTYPE): state_map[x] = in_comment -for x in (State.SQ_VAL, State.DQ_VAL): +for x in (SQ_VAL, DQ_VAL): state_map[x] = quoted_val xml_state_map = state_map.copy() -xml_state_map[State.IN_OPENING_TAG] = partial(opening_tag, set()) +xml_state_map[IN_OPENING_TAG] = partial(opening_tag, set()) def create_formats(highlighter): t = highlighter.theme @@ -349,18 +446,19 @@ def 
create_formats(highlighter): class HTMLHighlighter(SyntaxHighlighter): state_map = state_map - state_class = State create_formats_func = create_formats def create_formats(self): super(HTMLHighlighter, self).create_formats() - self.css_formats = create_css_formats(self) - self.state_class = self.create_state + self.default_state = State() + self.default_state.css_formats = create_css_formats(self) + self.default_state.stack = Stack() def create_state(self, val): - ans = State(val) - ans.css_formats = self.css_formats - return ans + if val < 0: + return self.default_state.copy() + ans = self.default_state.stack.state_for(val) or self.default_state + return ans.copy() class XMLHighlighter(HTMLHighlighter):