mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Refactor HTML syntax highlighter to keep track of tag nesting
This is needed for the eventual implementation of inline spellcheck
This commit is contained in:
parent
5481f1f820
commit
a1a4585167
@ -29,13 +29,15 @@ def run_loop(state, state_map, formats, text):
|
||||
|
||||
class SyntaxHighlighter(QSyntaxHighlighter):
|
||||
|
||||
state_class = SimpleState
|
||||
state_map = {0:lambda state, text, i, formats:[(len(text), None)]}
|
||||
create_formats_func = lambda highlighter: {}
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
QSyntaxHighlighter.__init__(self, *args, **kwargs)
|
||||
|
||||
def create_state(self, num):
|
||||
return SimpleState(max(0, num))
|
||||
|
||||
def rehighlight(self):
|
||||
self.outlineexplorer_data = {}
|
||||
QApplication.setOverrideCursor(QCursor(Qt.WaitCursor))
|
||||
@ -54,9 +56,7 @@ class SyntaxHighlighter(QSyntaxHighlighter):
|
||||
try:
|
||||
state = self.previousBlockState()
|
||||
self.setCurrentBlockUserData(None) # Ensure that any stale user data is discarded
|
||||
if state == -1:
|
||||
state = 0
|
||||
state = self.state_class(state)
|
||||
state = self.create_state(state)
|
||||
state.get_user_data, state.set_user_data = self.currentBlockUserData, self.setCurrentBlockUserData
|
||||
for i, num, fmt in run_loop(state, self.state_map, self.formats, unicode(text)):
|
||||
if fmt is not None:
|
||||
|
@ -251,9 +251,11 @@ def create_formats(highlighter):
|
||||
class CSSHighlighter(SyntaxHighlighter):
|
||||
|
||||
state_map = state_map
|
||||
state_class = State
|
||||
create_formats_func = create_formats
|
||||
|
||||
def create_state(self, num):
|
||||
return State(max(0, num))
|
||||
|
||||
if __name__ == '__main__':
|
||||
from calibre.gui2.tweak_book.editor.widget import launch_editor
|
||||
launch_editor('''\
|
||||
|
@ -31,55 +31,170 @@ unquoted_val_pat = re.compile(r'''[^%s'"=<>`]+''' % space_chars)
|
||||
cdata_close_pats = {x:re.compile(r'</%s' % x, flags=re.I) for x in cdata_tags}
|
||||
nbsp_pat = re.compile('[\xa0\u2000-\u200A\u202F\u205F\u3000\u2011-\u2015\uFE58\uFE63\uFF0D]+') # special spaces and hyphens
|
||||
|
||||
class State(object):
|
||||
|
||||
''' Store the parsing state, a stack of bold and italic formatting and the
|
||||
last seen open tag, all in a single integer, so that it can be used with Qt's integer block state.
|
||||
This assumes an int is at least 32 bits.'''
|
||||
|
||||
NORMAL = 0
|
||||
IN_OPENING_TAG = 1
|
||||
IN_CLOSING_TAG = 2
|
||||
IN_COMMENT = 3
|
||||
IN_PI = 4
|
||||
IN_DOCTYPE = 5
|
||||
ATTRIBUTE_NAME = 6
|
||||
ATTRIBUTE_VALUE = 7
|
||||
SQ_VAL = 8
|
||||
DQ_VAL = 9
|
||||
CDATA = 10
|
||||
CSS = 11
|
||||
|
||||
TAGS = {x:i+1 for i, x in enumerate(cdata_tags | bold_tags | italic_tags)}
|
||||
TAGS_RMAP = {v:k for k, v in TAGS.iteritems()}
|
||||
UNKNOWN_TAG = '___'
|
||||
|
||||
def __init__(self, num):
|
||||
self.parse = num & 0b1111
|
||||
self.bold = (num >> 4) & 0b11111111
|
||||
self.italic = (num >> 12) & 0b11111111
|
||||
self.tag = self.TAGS_RMAP.get(num >> 20, self.UNKNOWN_TAG)
|
||||
self.css = 0
|
||||
if self.parse == State.CSS:
|
||||
self.css = num >> 4
|
||||
|
||||
@property
|
||||
def value(self):
|
||||
if self.parse == State.CSS:
|
||||
return ((self.parse & 0b1111) | (self.css << 4))
|
||||
tag = self.TAGS.get(self.tag.lower(), 0)
|
||||
return ((self.parse & 0b1111) |
|
||||
((max(0, self.bold) & 0b11111111) << 4) |
|
||||
((max(0, self.italic) & 0b11111111) << 12) |
|
||||
(tag << 20))
|
||||
|
||||
def clear(self):
|
||||
self.parse = self.bold = self.italic = self.css = 0
|
||||
self.tag = self.UNKNOWN_TAG
|
||||
NORMAL = 0
|
||||
IN_OPENING_TAG = 1
|
||||
IN_CLOSING_TAG = 2
|
||||
IN_COMMENT = 3
|
||||
IN_PI = 4
|
||||
IN_DOCTYPE = 5
|
||||
ATTRIBUTE_NAME = 6
|
||||
ATTRIBUTE_VALUE = 7
|
||||
SQ_VAL = 8
|
||||
DQ_VAL = 9
|
||||
CDATA = 10
|
||||
CSS = 11
|
||||
|
||||
TagStart = namedtuple('TagStart', 'offset prefix name closing is_start')
|
||||
TagEnd = namedtuple('TagEnd', 'offset self_closing is_start')
|
||||
|
||||
class Tag(object):

    ''' A single open tag, as tracked on the highlighter state's tag stack.

    Attributes:
        name:   the tag name
        bold:   True if the tag makes its contents bold. When not passed
                explicitly this is ``name in bold_tags``.
        italic: True if the tag makes its contents italic. When not passed
                explicitly this is ``name in italic_tags``.
        lang:   initialised to None; presumably filled in from the tag's lang
                attribute by code outside this class (TODO confirm).
        hash:   cached hash value, only refreshed by update_hash().

    Equality is based on (name, lang). The hash is cached rather than
    computed on demand, so update_hash() must be called after changing name or
    lang and before the tag is hashed. '''

    __slots__ = ('name', 'bold', 'italic', 'lang', 'hash')

    def __init__(self, name, bold=None, italic=None):
        self.name = name
        # Explicit flags win; otherwise derive the formatting from the tag name
        self.bold = name in bold_tags if bold is None else bold
        self.italic = name in italic_tags if italic is None else italic
        self.lang = None
        self.hash = 0

    def __hash__(self):
        return self.hash

    def __eq__(self, other):
        return self.name == getattr(other, 'name', None) and self.lang == getattr(other, 'lang', False)

    def __ne__(self, other):
        # Python 2 does not derive != from ==, so without this two equal tags
        # would still compare as "not equal" (identity comparison).
        return not self.__eq__(other)

    def copy(self):
        ' Return an independent Tag with the same name, flags, lang and cached hash '
        ans = Tag(self.name, self.bold, self.italic)
        ans.lang, ans.hash = self.lang, self.hash
        return ans

    def update_hash(self):
        ' Recompute the cached hash from the fields used by __eq__ '
        self.hash = hash((self.name, self.lang))
|
||||
|
||||
class State(object):

    ''' The parsing state of the HTML highlighter at a given point in the text.

    Attributes:
        tags:              list of currently open tags, outermost first
        tag_being_defined: the tag whose opening markup is currently being
                           parsed, or None
        is_bold/is_italic: whether any open tag makes the text bold/italic
        current_lang:      lang of the innermost open tag that has one, or None
        parse:             the current parser state (NORMAL, IN_OPENING_TAG, ...)
        sub_parser_state:  state number of the nested (CSS) parser
        stack:             the Stack used to map this state to an integer
        css_formats:       formats handed to the nested CSS parser
        get_user_data/set_user_data: accessors for the current block's user
                           data, installed by the highlighter
        default_lang:      not used in this class (NOTE(review): presumably
                           set by the caller, confirm)

    Equality and hashing only consider parse, sub_parser_state,
    tag_being_defined and tags. '''

    __slots__ = ('tag_being_defined', 'tags', 'is_bold', 'is_italic',
                 'current_lang', 'parse', 'get_user_data', 'set_user_data',
                 'css_formats', 'stack', 'sub_parser_state', 'default_lang')

    def __init__(self):
        self.tags = []
        self.is_bold = self.is_italic = False
        self.tag_being_defined = self.current_lang = self.get_user_data = self.set_user_data = \
            self.css_formats = self.stack = self.sub_parser_state = self.default_lang = None
        self.parse = NORMAL

    def copy(self):
        ''' Return a copy of this state that can be mutated without affecting
        this one. Tags are copied; everything else (stack, css_formats, the
        user data accessors) is shared. '''
        ans = State()
        for x in self.__slots__:
            setattr(ans, x, getattr(self, x))
        # The fresh tag objects belong to the copy; this object is left untouched
        ans.tags = [x.copy() for x in self.tags]
        if self.tag_being_defined is not None:
            ans.tag_being_defined = self.tag_being_defined.copy()
        return ans

    @property
    def value(self):
        ' The integer index that self.stack assigns to this state '
        if self.tag_being_defined is not None:
            # Tag hashes are cached, make sure the one still being edited is current
            self.tag_being_defined.update_hash()
        return self.stack.index_for(self)

    def __hash__(self):
        return hash((self.parse, self.sub_parser_state, self.tag_being_defined, tuple(self.tags)))

    def __eq__(self, other):
        return (
            self.parse == getattr(other, 'parse', -1) and
            self.sub_parser_state == getattr(other, 'sub_parser_state', -1) and
            self.tag_being_defined == getattr(other, 'tag_being_defined', False) and
            self.tags == getattr(other, 'tags', None)
        )

    def __ne__(self, other):
        # Python 2 does not derive != from ==
        return not self.__eq__(other)

    def open_tag(self, name):
        ' Start defining a new tag; it joins self.tags in finish_opening_tag() '
        self.tag_being_defined = Tag(name)

    def close_tag(self, name):
        ''' Close the innermost open tag called name, together with every tag
        opened inside it. Does nothing if no such tag is open. '''
        for count, tag in enumerate(reversed(self.tags), 1):
            if tag.name == name:
                break
        else:
            return  # No matching open tag found, ignore the closing tag
        # Remove all tags up to and including the matching open tag
        self.tags = self.tags[:-count]
        self.sub_parser_state = 0
        # Check if we should still be bold or italic
        if self.is_bold:
            self.is_bold = any(tag.bold for tag in self.tags)
        if self.is_italic:
            self.is_italic = any(tag.italic for tag in self.tags)
        # Set the current language to the first lang attribute in a still open tag
        self.current_lang = next(
            (tag.lang for tag in reversed(self.tags) if tag.lang is not None), None)

    def finish_opening_tag(self, cdata_tags):
        ''' Called at the end of an opening tag: push the tag being defined
        onto self.tags, update the bold/italic/lang state and switch to CSS or
        CDATA parsing if the tag name is in cdata_tags. '''
        self.parse = NORMAL
        if self.tag_being_defined is None:
            return
        t, self.tag_being_defined = self.tag_being_defined, None
        t.update_hash()
        self.tags.append(t)
        self.is_bold = self.is_bold or t.bold
        self.is_italic = self.is_italic or t.italic
        self.current_lang = t.lang or self.current_lang
        if t.name in cdata_tags:
            self.parse = CSS if t.name == 'style' else CDATA
            self.sub_parser_state = 0

    def __repr__(self):
        return '<State %s is_bold=%s is_italic=%s current_lang=%s>' % (
            '->'.join(x.name for x in self.tags), self.is_bold, self.is_italic, self.current_lang)
    __str__ = __repr__
|
||||
|
||||
class Stack(object):

    ''' Maintain an efficient bi-directional mapping between states and index
    numbers. Ensures that if state1 == state2 then their corresponding index
    numbers are the same and vice versa. This is needed so that the state number
    passed to Qt does not change unless the underlying state has actually
    changed. '''

    def __init__(self):
        self.index_map = []  # index number -> state
        self.state_map = {}  # state -> index number

    def index_for(self, state):
        ''' Return the index number for state, allocating the next free number
        the first time a state is seen. state must be hashable. '''
        ans = self.state_map.get(state, None)
        if ans is None:
            self.state_map[state] = ans = len(self.index_map)
            self.index_map.append(state)
        return ans

    def state_for(self, index):
        ''' Return the state registered under index, or None if there is no
        such state. '''
        if index < 0:
            # Negative numbers are never handed out by index_for(); without
            # this check list indexing would silently count from the end.
            return None
        try:
            return self.index_map[index]
        except IndexError:
            return None
|
||||
|
||||
class HTMLUserData(QTextBlockUserData):
|
||||
|
||||
def __init__(self):
|
||||
QTextBlockUserData.__init__(self)
|
||||
self.tags = []
|
||||
|
||||
def add_tag_data(state, tag):
|
||||
ud = q = state.get_user_data()
|
||||
if ud is None:
|
||||
@ -97,37 +212,38 @@ def css(state, text, i, formats):
|
||||
else:
|
||||
css_text = text[i:m.start()]
|
||||
ans = []
|
||||
css_state = CSSState(state.css)
|
||||
css_state = CSSState(state.sub_parser_state)
|
||||
for j, num, fmt in run_loop(css_state, css_state_map, state.css_formats, css_text):
|
||||
ans.append((num, fmt))
|
||||
state.css = css_state.value
|
||||
state.sub_parser_state = css_state.value
|
||||
if m is not None:
|
||||
state.clear()
|
||||
state.parse = State.IN_CLOSING_TAG
|
||||
state.sub_parser_state = 0
|
||||
state.parse = IN_CLOSING_TAG
|
||||
add_tag_data(state, TagStart(m.start(), 'style', '', True, True))
|
||||
ans.extend([(2, formats['end_tag']), (len(m.group()) - 2, formats['tag_name'])])
|
||||
return ans
|
||||
|
||||
def cdata(state, text, i, formats):
|
||||
'CDATA inside tags like <title> or <style>'
|
||||
pat = cdata_close_pats[state.tag]
|
||||
name = state.tags[-1].name
|
||||
pat = cdata_close_pats[name]
|
||||
m = pat.search(text, i)
|
||||
fmt = formats['title' if state.tag == 'title' else 'special']
|
||||
fmt = formats['title' if name == 'title' else 'special']
|
||||
if m is None:
|
||||
return [(len(text) - i, fmt)]
|
||||
state.parse = State.IN_CLOSING_TAG
|
||||
state.parse = IN_CLOSING_TAG
|
||||
num = m.start() - i
|
||||
add_tag_data(state, TagStart(m.start(), state.tag, '', True, True))
|
||||
add_tag_data(state, TagStart(m.start(), name, '', True, True))
|
||||
return [(num, fmt), (2, formats['end_tag']), (len(m.group()) - 2, formats['tag_name'])]
|
||||
|
||||
def mark_nbsp(state, text, nbsp_format):
|
||||
ans = []
|
||||
fmt = None
|
||||
if state.bold or state.italic:
|
||||
if state.is_bold or state.is_italic:
|
||||
fmt = SyntaxTextCharFormat()
|
||||
if state.bold:
|
||||
if state.is_bold:
|
||||
fmt.setFontWeight(QFont.Bold)
|
||||
if state.italic:
|
||||
if state.is_italic:
|
||||
fmt.setFontItalic(True)
|
||||
last = 0
|
||||
for m in nbsp_pat.finditer(text):
|
||||
@ -137,26 +253,20 @@ def mark_nbsp(state, text, nbsp_format):
|
||||
ans = [(len(text), fmt)]
|
||||
return ans
|
||||
|
||||
class HTMLUserData(QTextBlockUserData):
|
||||
|
||||
def __init__(self):
|
||||
QTextBlockUserData.__init__(self)
|
||||
self.tags = []
|
||||
|
||||
def normal(state, text, i, formats):
|
||||
' The normal state in between tags '
|
||||
ch = text[i]
|
||||
if ch == '<':
|
||||
if text[i:i+4] == '<!--':
|
||||
state.parse, fmt = state.IN_COMMENT, formats['comment']
|
||||
state.parse, fmt = IN_COMMENT, formats['comment']
|
||||
return [(4, fmt)]
|
||||
|
||||
if text[i:i+2] == '<?':
|
||||
state.parse, fmt = state.IN_PI, formats['preproc']
|
||||
state.parse, fmt = IN_PI, formats['preproc']
|
||||
return [(2, fmt)]
|
||||
|
||||
if text[i:i+2] == '<!' and text[i+2:].lstrip().lower().startswith('doctype'):
|
||||
state.parse, fmt = state.IN_DOCTYPE, formats['preproc']
|
||||
state.parse, fmt = IN_DOCTYPE, formats['preproc']
|
||||
return [(2, fmt)]
|
||||
|
||||
m = tag_name_pat.match(text, i + 1)
|
||||
@ -165,16 +275,16 @@ def normal(state, text, i, formats):
|
||||
|
||||
name = m.group()
|
||||
closing = name.startswith('/')
|
||||
state.parse = state.IN_CLOSING_TAG if closing else state.IN_OPENING_TAG
|
||||
state.parse = IN_CLOSING_TAG if closing else IN_OPENING_TAG
|
||||
ans = [(2 if closing else 1, formats['end_tag' if closing else 'tag'])]
|
||||
if closing:
|
||||
name = name[1:]
|
||||
prefix, name = name.partition(':')[0::2]
|
||||
state.tag = name or prefix
|
||||
if prefix and name:
|
||||
ans.append((len(prefix)+1, formats['nsprefix']))
|
||||
ans.append((len(name or prefix), formats['tag_name']))
|
||||
add_tag_data(state, TagStart(i, prefix, name, closing, True))
|
||||
(state.close_tag if closing else state.open_tag)(name or prefix)
|
||||
return ans
|
||||
|
||||
if ch == '&':
|
||||
@ -198,27 +308,18 @@ def opening_tag(cdata_tags, state, text, i, formats):
|
||||
m = self_closing_pat.match(text, i)
|
||||
if m is None:
|
||||
return [(1, formats['/'])]
|
||||
state.parse = state.NORMAL
|
||||
state.tag = State.UNKNOWN_TAG
|
||||
state.parse = NORMAL
|
||||
l = len(m.group())
|
||||
add_tag_data(state, TagEnd(i + l - 1, True, False))
|
||||
return [(l, formats['tag'])]
|
||||
if ch == '>':
|
||||
state.parse = state.NORMAL
|
||||
tag = state.tag.lower()
|
||||
if tag in cdata_tags:
|
||||
state.parse = state.CDATA
|
||||
if tag == 'style':
|
||||
state.clear()
|
||||
state.parse = state.CSS
|
||||
state.bold += int(tag in bold_tags)
|
||||
state.italic += int(tag in italic_tags)
|
||||
state.finish_opening_tag(cdata_tags)
|
||||
add_tag_data(state, TagEnd(i, False, False))
|
||||
return [(1, formats['tag'])]
|
||||
m = attribute_name_pat.match(text, i)
|
||||
if m is None:
|
||||
return [(1, formats['?'])]
|
||||
state.parse = state.ATTRIBUTE_NAME
|
||||
state.parse = ATTRIBUTE_NAME
|
||||
prefix, name = m.group().partition(':')[0::2]
|
||||
if prefix and name:
|
||||
return [(len(prefix) + 1, formats['nsprefix']), (len(name), formats['attr'])]
|
||||
@ -230,9 +331,9 @@ def attribute_name(state, text, i, formats):
|
||||
if ch in space_chars:
|
||||
return [(1, None)]
|
||||
if ch == '=':
|
||||
state.parse = State.ATTRIBUTE_VALUE
|
||||
state.parse = ATTRIBUTE_VALUE
|
||||
return [(1, formats['attr'])]
|
||||
state.parse = State.IN_OPENING_TAG
|
||||
state.parse = IN_OPENING_TAG
|
||||
if ch in {'>', '/'}:
|
||||
# Standalone attribute with no value
|
||||
return [(0, None)]
|
||||
@ -244,9 +345,9 @@ def attribute_value(state, text, i, formats):
|
||||
if ch in space_chars:
|
||||
return [(1, None)]
|
||||
if ch in {'"', "'"}:
|
||||
state.parse = State.SQ_VAL if ch == "'" else State.DQ_VAL
|
||||
state.parse = SQ_VAL if ch == "'" else DQ_VAL
|
||||
return [(1, formats['string'])]
|
||||
state.parse = State.IN_OPENING_TAG
|
||||
state.parse = IN_OPENING_TAG
|
||||
m = unquoted_val_pat.match(text, i)
|
||||
if m is None:
|
||||
return [(1, formats['no-attr-value'])]
|
||||
@ -254,13 +355,13 @@ def attribute_value(state, text, i, formats):
|
||||
|
||||
def quoted_val(state, text, i, formats):
|
||||
' A quoted attribute value '
|
||||
quote = '"' if state.parse == State.DQ_VAL else "'"
|
||||
quote = '"' if state.parse is DQ_VAL else "'"
|
||||
pos = text.find(quote, i)
|
||||
if pos == -1:
|
||||
num = len(text) - i
|
||||
else:
|
||||
num = pos - i + 1
|
||||
state.parse = State.IN_OPENING_TAG
|
||||
state.parse = IN_OPENING_TAG
|
||||
return [(num, formats['string'])]
|
||||
|
||||
def closing_tag(state, text, i, formats):
|
||||
@ -271,48 +372,44 @@ def closing_tag(state, text, i, formats):
|
||||
pos = text.find('>', i)
|
||||
if pos == -1:
|
||||
return [(len(text) - i, formats['bad-closing'])]
|
||||
state.parse = state.NORMAL
|
||||
tag = state.tag.lower()
|
||||
state.bold -= int(tag in bold_tags)
|
||||
state.italic -= int(tag in italic_tags)
|
||||
state.parse = NORMAL
|
||||
num = pos - i + 1
|
||||
ans = [(1, formats['end_tag'])]
|
||||
if num > 1:
|
||||
ans.insert(0, (num - 1, formats['bad-closing']))
|
||||
state.tag = State.UNKNOWN_TAG
|
||||
add_tag_data(state, TagEnd(pos, False, False))
|
||||
return ans
|
||||
|
||||
def in_comment(state, text, i, formats):
|
||||
' Comment, processing instruction or doctype '
|
||||
end = {state.IN_COMMENT:'-->', state.IN_PI:'?>'}.get(state.parse, '>')
|
||||
end = {IN_COMMENT:'-->', IN_PI:'?>'}.get(state.parse, '>')
|
||||
pos = text.find(end, i)
|
||||
fmt = formats['comment' if state.parse == state.IN_COMMENT else 'preproc']
|
||||
fmt = formats['comment' if state.parse is IN_COMMENT else 'preproc']
|
||||
if pos == -1:
|
||||
num = len(text) - i
|
||||
else:
|
||||
num = pos - i + len(end)
|
||||
state.parse = state.NORMAL
|
||||
state.parse = NORMAL
|
||||
return [(num, fmt)]
|
||||
|
||||
state_map = {
|
||||
State.NORMAL:normal,
|
||||
State.IN_OPENING_TAG: partial(opening_tag, cdata_tags),
|
||||
State.IN_CLOSING_TAG: closing_tag,
|
||||
State.ATTRIBUTE_NAME: attribute_name,
|
||||
State.ATTRIBUTE_VALUE: attribute_value,
|
||||
State.CDATA: cdata,
|
||||
State.CSS: css,
|
||||
NORMAL:normal,
|
||||
IN_OPENING_TAG: partial(opening_tag, cdata_tags),
|
||||
IN_CLOSING_TAG: closing_tag,
|
||||
ATTRIBUTE_NAME: attribute_name,
|
||||
ATTRIBUTE_VALUE: attribute_value,
|
||||
CDATA: cdata,
|
||||
CSS: css,
|
||||
}
|
||||
|
||||
for x in (State.IN_COMMENT, State.IN_PI, State.IN_DOCTYPE):
|
||||
for x in (IN_COMMENT, IN_PI, IN_DOCTYPE):
|
||||
state_map[x] = in_comment
|
||||
|
||||
for x in (State.SQ_VAL, State.DQ_VAL):
|
||||
for x in (SQ_VAL, DQ_VAL):
|
||||
state_map[x] = quoted_val
|
||||
|
||||
xml_state_map = state_map.copy()
|
||||
xml_state_map[State.IN_OPENING_TAG] = partial(opening_tag, set())
|
||||
xml_state_map[IN_OPENING_TAG] = partial(opening_tag, set())
|
||||
|
||||
def create_formats(highlighter):
|
||||
t = highlighter.theme
|
||||
@ -349,18 +446,19 @@ def create_formats(highlighter):
|
||||
class HTMLHighlighter(SyntaxHighlighter):
|
||||
|
||||
state_map = state_map
|
||||
state_class = State
|
||||
create_formats_func = create_formats
|
||||
|
||||
def create_formats(self):
|
||||
super(HTMLHighlighter, self).create_formats()
|
||||
self.css_formats = create_css_formats(self)
|
||||
self.state_class = self.create_state
|
||||
self.default_state = State()
|
||||
self.default_state.css_formats = create_css_formats(self)
|
||||
self.default_state.stack = Stack()
|
||||
|
||||
def create_state(self, val):
|
||||
ans = State(val)
|
||||
ans.css_formats = self.css_formats
|
||||
return ans
|
||||
if val < 0:
|
||||
return self.default_state.copy()
|
||||
ans = self.default_state.stack.state_for(val) or self.default_state
|
||||
return ans.copy()
|
||||
|
||||
class XMLHighlighter(HTMLHighlighter):
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user