Refactor HTML syntax highlighter to keep track of tag nesting

This is needed for the eventual implementation of inline spellcheck
This commit is contained in:
Kovid Goyal 2014-02-19 15:12:53 +05:30
parent 5481f1f820
commit a1a4585167
3 changed files with 214 additions and 114 deletions

View File

@ -29,13 +29,15 @@ def run_loop(state, state_map, formats, text):
class SyntaxHighlighter(QSyntaxHighlighter):
state_class = SimpleState
state_map = {0:lambda state, text, i, formats:[(len(text), None)]}
create_formats_func = lambda highlighter: {}
def __init__(self, *args, **kwargs):
QSyntaxHighlighter.__init__(self, *args, **kwargs)
def create_state(self, num):
return SimpleState(max(0, num))
def rehighlight(self):
self.outlineexplorer_data = {}
QApplication.setOverrideCursor(QCursor(Qt.WaitCursor))
@ -54,9 +56,7 @@ class SyntaxHighlighter(QSyntaxHighlighter):
try:
state = self.previousBlockState()
self.setCurrentBlockUserData(None) # Ensure that any stale user data is discarded
if state == -1:
state = 0
state = self.state_class(state)
state = self.create_state(state)
state.get_user_data, state.set_user_data = self.currentBlockUserData, self.setCurrentBlockUserData
for i, num, fmt in run_loop(state, self.state_map, self.formats, unicode(text)):
if fmt is not None:

View File

@ -251,9 +251,11 @@ def create_formats(highlighter):
class CSSHighlighter(SyntaxHighlighter):
state_map = state_map
state_class = State
create_formats_func = create_formats
def create_state(self, num):
return State(max(0, num))
if __name__ == '__main__':
from calibre.gui2.tweak_book.editor.widget import launch_editor
launch_editor('''\

View File

@ -31,55 +31,170 @@ unquoted_val_pat = re.compile(r'''[^%s'"=<>`]+''' % space_chars)
cdata_close_pats = {x:re.compile(r'</%s' % x, flags=re.I) for x in cdata_tags}
nbsp_pat = re.compile('[\xa0\u2000-\u200A\u202F\u205F\u3000\u2011-\u2015\uFE58\uFE63\uFF0D]+') # special spaces and hyphens
class State(object):

    ''' Store the parsing state, a stack of bold and italic formatting and the
    last seen open tag, all in a single integer, so that it can be used as the
    integer block state of the highlighter (see previousBlockState()).
    This assumes an int is at least 32 bits.

    Bit layout of the integer, as decoded by __init__ and encoded by value:
        bits 0-3   : the parse state (one of the constants below)
        bits 4-11  : bold nesting counter
        bits 12-19 : italic nesting counter
        bits 20+   : index of the last seen tag in TAGS (0 for unknown)
    When the parse state is CSS, everything from bit 4 upwards is instead the
    state of the CSS sub-parser. '''

    # Parse states, these must all fit in the low 4 bits
    NORMAL = 0
    IN_OPENING_TAG = 1
    IN_CLOSING_TAG = 2
    IN_COMMENT = 3
    IN_PI = 4
    IN_DOCTYPE = 5
    ATTRIBUTE_NAME = 6
    ATTRIBUTE_VALUE = 7
    SQ_VAL = 8
    DQ_VAL = 9
    CDATA = 10
    CSS = 11

    # Map tag names to non-zero numbers (and back), zero is reserved for
    # tags that are not in the map
    TAGS = {x:i+1 for i, x in enumerate(cdata_tags | bold_tags | italic_tags)}
    TAGS_RMAP = {v:k for k, v in TAGS.iteritems()}
    UNKNOWN_TAG = '___'

    def __init__(self, num):
        ''' Decode the packed integer num into individual fields '''
        self.parse = num & 0b1111
        self.bold = (num >> 4) & 0b11111111
        self.italic = (num >> 12) & 0b11111111
        self.tag = self.TAGS_RMAP.get(num >> 20, self.UNKNOWN_TAG)
        self.css = 0
        if self.parse == State.CSS:
            # In CSS mode the upper bits hold the CSS parser state instead
            self.css = num >> 4

    @property
    def value(self):
        ''' The fields of this state packed back into a single integer '''
        if self.parse == State.CSS:
            return ((self.parse & 0b1111) | (self.css << 4))
        tag = self.TAGS.get(self.tag.lower(), 0)
        # Negative counters are clamped to zero before being masked to 8 bits
        return ((self.parse & 0b1111) |
                ((max(0, self.bold) & 0b11111111) << 4) |
                ((max(0, self.italic) & 0b11111111) << 12) |
                (tag << 20))

    def clear(self):
        ''' Reset every field to its initial value '''
        self.parse = self.bold = self.italic = self.css = 0
        self.tag = self.UNKNOWN_TAG
# Values taken by State.parse. Each one is a key in state_map, selecting the
# function that processes text while the parser is in that state.
NORMAL = 0
IN_OPENING_TAG = 1
IN_CLOSING_TAG = 2
# Comments, processing instructions and doctypes are all handled by in_comment()
IN_COMMENT = 3
IN_PI = 4
IN_DOCTYPE = 5
ATTRIBUTE_NAME = 6
ATTRIBUTE_VALUE = 7
# Single and double quoted attribute values, both handled by quoted_val()
SQ_VAL = 8
DQ_VAL = 9
CDATA = 10
CSS = 11

# Records handed to add_tag_data(): TagStart when the name of an opening or
# closing tag is seen, TagEnd when the tag is terminated ('>' or self closing)
TagStart = namedtuple('TagStart', 'offset prefix name closing is_start')
TagEnd = namedtuple('TagEnd', 'offset self_closing is_start')
class Tag(object):

    ''' An open tag: its name, whether it makes text bold and/or italic and
    its language (None until one is assigned). The hash is cached in the
    ``hash`` slot and is only recomputed when update_hash() is called, so it
    must be refreshed explicitly after name or lang change. '''

    __slots__ = ('name', 'bold', 'italic', 'lang', 'hash')

    def __init__(self, name, bold=None, italic=None):
        # An explicitly passed flag wins, otherwise it is derived from the name
        if bold is None:
            bold = name in bold_tags
        if italic is None:
            italic = name in italic_tags
        self.name, self.bold, self.italic = name, bold, italic
        self.lang = None
        self.hash = 0

    def __hash__(self):
        return self.hash

    def __eq__(self, other):
        # Objects lacking these attributes never match: the fallbacks are
        # None for the name and False for the language
        same_name = self.name == getattr(other, 'name', None)
        return same_name and self.lang == getattr(other, 'lang', False)

    def copy(self):
        ''' Return a new Tag with the same name, flags, lang and cached hash '''
        dup = Tag(self.name, self.bold, self.italic)
        dup.lang = self.lang
        dup.hash = self.hash
        return dup

    def update_hash(self):
        ''' Recompute the cached hash from the current name and lang '''
        key = (self.name, self.lang)
        self.hash = hash(key)
class State(object):

    ''' The parser state at a given point in the document: the stack of
    currently open tags, the tag whose opening is in progress (if any), the
    current parse mode and the state of any sub-parser (such as the CSS
    parser inside <style> tags).

    States are hashable and comparable, the comparison uses only parse,
    sub_parser_state, tag_being_defined and tags. The value property maps a
    state to an integer via the Stack stored in the stack attribute.

    get_user_data and set_user_data are assigned by the highlighter before
    the state is used. default_lang is not used by the methods of this
    class. '''

    __slots__ = ('tag_being_defined', 'tags', 'is_bold', 'is_italic',
                 'current_lang', 'parse', 'get_user_data', 'set_user_data',
                 'css_formats', 'stack', 'sub_parser_state', 'default_lang')

    def __init__(self):
        self.tags = []
        self.is_bold = self.is_italic = False
        self.tag_being_defined = self.current_lang = self.get_user_data = self.set_user_data = \
            self.css_formats = self.stack = self.sub_parser_state = self.default_lang = None
        self.parse = NORMAL

    def copy(self):
        ''' Return an independent copy of this state. The copy gets its own
        tag list and its own Tag objects, so that changing the copy never
        affects this state (which may be stored in the Stack). All other
        attributes are shared by reference. This state is left unmodified. '''
        ans = State()
        for x in self.__slots__:
            setattr(ans, x, getattr(self, x))
        # The new objects must go onto the copy, not onto self
        ans.tags = [x.copy() for x in self.tags]
        if self.tag_being_defined is not None:
            ans.tag_being_defined = self.tag_being_defined.copy()
        return ans

    @property
    def value(self):
        ''' The index number of this state in self.stack. Equal states get
        the same number. '''
        if self.tag_being_defined is not None:
            # Make sure the cached hash reflects the current contents of the
            # tag before this state is used as a dictionary key
            self.tag_being_defined.update_hash()
        return self.stack.index_for(self)

    def __hash__(self):
        return hash((self.parse, self.sub_parser_state, self.tag_being_defined, tuple(self.tags)))

    def __eq__(self, other):
        # The fallback values ensure that an object missing any of these
        # attributes never compares equal
        return (
            self.parse == getattr(other, 'parse', -1) and
            self.sub_parser_state == getattr(other, 'sub_parser_state', -1) and
            self.tag_being_defined == getattr(other, 'tag_being_defined', False) and
            self.tags == getattr(other, 'tags', None)
        )

    def open_tag(self, name):
        ''' Start defining a new tag, it is pushed onto the stack of open
        tags only when finish_opening_tag() is called '''
        self.tag_being_defined = Tag(name)

    def close_tag(self, name):
        ''' Close the innermost open tag named name, along with every tag
        opened after it. If there is no such open tag, do nothing. '''
        for depth, tag in enumerate(reversed(self.tags), 1):
            if tag.name == name:
                break
        else:
            return  # No matching open tag found, ignore the closing tag
        # Remove all tags up to and including the matching open tag
        self.tags = self.tags[:-depth]
        self.sub_parser_state = 0
        # Check if we should still be bold or italic
        if self.is_bold:
            self.is_bold = any(tag.bold for tag in self.tags)
        if self.is_italic:
            self.is_italic = any(tag.italic for tag in self.tags)
        # Set the current language to the lang of the innermost still open
        # tag that has one
        self.current_lang = next(
            (tag.lang for tag in reversed(self.tags) if tag.lang is not None), None)

    def finish_opening_tag(self, cdata_tags):
        ''' Called when the end of an opening tag is reached. Pushes the tag
        being defined (if any) onto the stack of open tags and updates the
        bold, italic and language status.

        :param cdata_tags: Names of tags whose content is not markup. After
            such a tag the parse mode becomes CSS for style tags and CDATA
            for all others, instead of NORMAL. '''
        self.parse = NORMAL
        if self.tag_being_defined is None:
            return
        t, self.tag_being_defined = self.tag_being_defined, None
        t.update_hash()
        self.tags.append(t)
        self.is_bold = self.is_bold or t.bold
        self.is_italic = self.is_italic or t.italic
        self.current_lang = t.lang or self.current_lang
        if t.name in cdata_tags:
            self.parse = CSS if t.name == 'style' else CDATA
            self.sub_parser_state = 0

    def __repr__(self):
        return '<State %s is_bold=%s is_italic=%s current_lang=%s>' % (
            '->'.join(x.name for x in self.tags), self.is_bold, self.is_italic, self.current_lang)
    __str__ = __repr__
class Stack(object):

    ''' A two-way mapping between states and index numbers. Equal states
    always get the same index number, and a new number is handed out only
    for a state that has not been seen before. This is needed so that the
    state number passed to Qt does not change unless the underlying state
    has actually changed. '''

    def __init__(self):
        self.index_map = []  # index number -> state
        self.state_map = {}  # state -> index number

    def index_for(self, state):
        ''' Return the index number for state, registering the state first if
        it has not been seen before '''
        known = self.state_map.get(state)
        if known is not None:
            return known
        # The next free number is the current length of the list
        idx = len(self.index_map)
        self.state_map[state] = idx
        self.index_map.append(state)
        return idx

    def state_for(self, index):
        ''' Return the state registered under index, or None if the index is
        out of range '''
        try:
            found = self.index_map[index]
        except IndexError:
            found = None
        return found
class HTMLUserData(QTextBlockUserData):

    ''' User data attached to a text block, holding a list of tags that
    starts out empty '''

    def __init__(self):
        QTextBlockUserData.__init__(self)
        # Presumably filled with TagStart/TagEnd records by add_tag_data() -- TODO confirm
        self.tags = []
def add_tag_data(state, tag):
ud = q = state.get_user_data()
if ud is None:
@ -97,37 +212,38 @@ def css(state, text, i, formats):
else:
css_text = text[i:m.start()]
ans = []
css_state = CSSState(state.css)
css_state = CSSState(state.sub_parser_state)
for j, num, fmt in run_loop(css_state, css_state_map, state.css_formats, css_text):
ans.append((num, fmt))
state.css = css_state.value
state.sub_parser_state = css_state.value
if m is not None:
state.clear()
state.parse = State.IN_CLOSING_TAG
state.sub_parser_state = 0
state.parse = IN_CLOSING_TAG
add_tag_data(state, TagStart(m.start(), 'style', '', True, True))
ans.extend([(2, formats['end_tag']), (len(m.group()) - 2, formats['tag_name'])])
return ans
def cdata(state, text, i, formats):
'CDATA inside tags like <title> or <style>'
pat = cdata_close_pats[state.tag]
name = state.tags[-1].name
pat = cdata_close_pats[name]
m = pat.search(text, i)
fmt = formats['title' if state.tag == 'title' else 'special']
fmt = formats['title' if name == 'title' else 'special']
if m is None:
return [(len(text) - i, fmt)]
state.parse = State.IN_CLOSING_TAG
state.parse = IN_CLOSING_TAG
num = m.start() - i
add_tag_data(state, TagStart(m.start(), state.tag, '', True, True))
add_tag_data(state, TagStart(m.start(), name, '', True, True))
return [(num, fmt), (2, formats['end_tag']), (len(m.group()) - 2, formats['tag_name'])]
def mark_nbsp(state, text, nbsp_format):
ans = []
fmt = None
if state.bold or state.italic:
if state.is_bold or state.is_italic:
fmt = SyntaxTextCharFormat()
if state.bold:
if state.is_bold:
fmt.setFontWeight(QFont.Bold)
if state.italic:
if state.is_italic:
fmt.setFontItalic(True)
last = 0
for m in nbsp_pat.finditer(text):
@ -137,26 +253,20 @@ def mark_nbsp(state, text, nbsp_format):
ans = [(len(text), fmt)]
return ans
class HTMLUserData(QTextBlockUserData):
def __init__(self):
QTextBlockUserData.__init__(self)
self.tags = []
def normal(state, text, i, formats):
' The normal state in between tags '
ch = text[i]
if ch == '<':
if text[i:i+4] == '<!--':
state.parse, fmt = state.IN_COMMENT, formats['comment']
state.parse, fmt = IN_COMMENT, formats['comment']
return [(4, fmt)]
if text[i:i+2] == '<?':
state.parse, fmt = state.IN_PI, formats['preproc']
state.parse, fmt = IN_PI, formats['preproc']
return [(2, fmt)]
if text[i:i+2] == '<!' and text[i+2:].lstrip().lower().startswith('doctype'):
state.parse, fmt = state.IN_DOCTYPE, formats['preproc']
state.parse, fmt = IN_DOCTYPE, formats['preproc']
return [(2, fmt)]
m = tag_name_pat.match(text, i + 1)
@ -165,16 +275,16 @@ def normal(state, text, i, formats):
name = m.group()
closing = name.startswith('/')
state.parse = state.IN_CLOSING_TAG if closing else state.IN_OPENING_TAG
state.parse = IN_CLOSING_TAG if closing else IN_OPENING_TAG
ans = [(2 if closing else 1, formats['end_tag' if closing else 'tag'])]
if closing:
name = name[1:]
prefix, name = name.partition(':')[0::2]
state.tag = name or prefix
if prefix and name:
ans.append((len(prefix)+1, formats['nsprefix']))
ans.append((len(name or prefix), formats['tag_name']))
add_tag_data(state, TagStart(i, prefix, name, closing, True))
(state.close_tag if closing else state.open_tag)(name or prefix)
return ans
if ch == '&':
@ -198,27 +308,18 @@ def opening_tag(cdata_tags, state, text, i, formats):
m = self_closing_pat.match(text, i)
if m is None:
return [(1, formats['/'])]
state.parse = state.NORMAL
state.tag = State.UNKNOWN_TAG
state.parse = NORMAL
l = len(m.group())
add_tag_data(state, TagEnd(i + l - 1, True, False))
return [(l, formats['tag'])]
if ch == '>':
state.parse = state.NORMAL
tag = state.tag.lower()
if tag in cdata_tags:
state.parse = state.CDATA
if tag == 'style':
state.clear()
state.parse = state.CSS
state.bold += int(tag in bold_tags)
state.italic += int(tag in italic_tags)
state.finish_opening_tag(cdata_tags)
add_tag_data(state, TagEnd(i, False, False))
return [(1, formats['tag'])]
m = attribute_name_pat.match(text, i)
if m is None:
return [(1, formats['?'])]
state.parse = state.ATTRIBUTE_NAME
state.parse = ATTRIBUTE_NAME
prefix, name = m.group().partition(':')[0::2]
if prefix and name:
return [(len(prefix) + 1, formats['nsprefix']), (len(name), formats['attr'])]
@ -230,9 +331,9 @@ def attribute_name(state, text, i, formats):
if ch in space_chars:
return [(1, None)]
if ch == '=':
state.parse = State.ATTRIBUTE_VALUE
state.parse = ATTRIBUTE_VALUE
return [(1, formats['attr'])]
state.parse = State.IN_OPENING_TAG
state.parse = IN_OPENING_TAG
if ch in {'>', '/'}:
# Standalone attribute with no value
return [(0, None)]
@ -244,9 +345,9 @@ def attribute_value(state, text, i, formats):
if ch in space_chars:
return [(1, None)]
if ch in {'"', "'"}:
state.parse = State.SQ_VAL if ch == "'" else State.DQ_VAL
state.parse = SQ_VAL if ch == "'" else DQ_VAL
return [(1, formats['string'])]
state.parse = State.IN_OPENING_TAG
state.parse = IN_OPENING_TAG
m = unquoted_val_pat.match(text, i)
if m is None:
return [(1, formats['no-attr-value'])]
@ -254,13 +355,13 @@ def attribute_value(state, text, i, formats):
def quoted_val(state, text, i, formats):
' A quoted attribute value '
quote = '"' if state.parse == State.DQ_VAL else "'"
quote = '"' if state.parse is DQ_VAL else "'"
pos = text.find(quote, i)
if pos == -1:
num = len(text) - i
else:
num = pos - i + 1
state.parse = State.IN_OPENING_TAG
state.parse = IN_OPENING_TAG
return [(num, formats['string'])]
def closing_tag(state, text, i, formats):
@ -271,48 +372,44 @@ def closing_tag(state, text, i, formats):
pos = text.find('>', i)
if pos == -1:
return [(len(text) - i, formats['bad-closing'])]
state.parse = state.NORMAL
tag = state.tag.lower()
state.bold -= int(tag in bold_tags)
state.italic -= int(tag in italic_tags)
state.parse = NORMAL
num = pos - i + 1
ans = [(1, formats['end_tag'])]
if num > 1:
ans.insert(0, (num - 1, formats['bad-closing']))
state.tag = State.UNKNOWN_TAG
add_tag_data(state, TagEnd(pos, False, False))
return ans
def in_comment(state, text, i, formats):
' Comment, processing instruction or doctype '
end = {state.IN_COMMENT:'-->', state.IN_PI:'?>'}.get(state.parse, '>')
end = {IN_COMMENT:'-->', IN_PI:'?>'}.get(state.parse, '>')
pos = text.find(end, i)
fmt = formats['comment' if state.parse == state.IN_COMMENT else 'preproc']
fmt = formats['comment' if state.parse is IN_COMMENT else 'preproc']
if pos == -1:
num = len(text) - i
else:
num = pos - i + len(end)
state.parse = state.NORMAL
state.parse = NORMAL
return [(num, fmt)]
state_map = {
State.NORMAL:normal,
State.IN_OPENING_TAG: partial(opening_tag, cdata_tags),
State.IN_CLOSING_TAG: closing_tag,
State.ATTRIBUTE_NAME: attribute_name,
State.ATTRIBUTE_VALUE: attribute_value,
State.CDATA: cdata,
State.CSS: css,
NORMAL:normal,
IN_OPENING_TAG: partial(opening_tag, cdata_tags),
IN_CLOSING_TAG: closing_tag,
ATTRIBUTE_NAME: attribute_name,
ATTRIBUTE_VALUE: attribute_value,
CDATA: cdata,
CSS: css,
}
for x in (State.IN_COMMENT, State.IN_PI, State.IN_DOCTYPE):
for x in (IN_COMMENT, IN_PI, IN_DOCTYPE):
state_map[x] = in_comment
for x in (State.SQ_VAL, State.DQ_VAL):
for x in (SQ_VAL, DQ_VAL):
state_map[x] = quoted_val
xml_state_map = state_map.copy()
xml_state_map[State.IN_OPENING_TAG] = partial(opening_tag, set())
xml_state_map[IN_OPENING_TAG] = partial(opening_tag, set())
def create_formats(highlighter):
t = highlighter.theme
@ -349,18 +446,19 @@ def create_formats(highlighter):
class HTMLHighlighter(SyntaxHighlighter):
state_map = state_map
state_class = State
create_formats_func = create_formats
def create_formats(self):
super(HTMLHighlighter, self).create_formats()
self.css_formats = create_css_formats(self)
self.state_class = self.create_state
self.default_state = State()
self.default_state.css_formats = create_css_formats(self)
self.default_state.stack = Stack()
def create_state(self, val):
ans = State(val)
ans.css_formats = self.css_formats
return ans
if val < 0:
return self.default_state.copy()
ans = self.default_state.stack.state_for(val) or self.default_state
return ans.copy()
class XMLHighlighter(HTMLHighlighter):