Edit Book: Redesign the syntax highlighter to improve performance for large documents and extended editing sessions. Fixes #1314339 [edit book app "hangs" during edit session](https://bugs.launchpad.net/calibre/+bug/1314339)

2025-07-09 03:04:10 -04:00 · 2014-04-30 15:14:08 +05:30 · 2014-04-30 15:14:08 +05:30 · 16bee93353
commit 16bee93353
parent b381966b79
4 changed files with 239 additions and 182 deletions
--- a/src/calibre/gui2/tweak_book/editor/syntax/base.py
+++ b/src/calibre/gui2/tweak_book/editor/syntax/base.py
@ -6,44 +6,53 @@ from __future__ import (unicode_literals, division, absolute_import,
 __license__ = 'GPL v3'
 __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
-from PyQt4.Qt import (QSyntaxHighlighter, QApplication, QCursor, Qt)
+import weakref
 from PyQt4.Qt import (
    QTextCursor, pyqtSlot, QTextBlockUserData, QTextLayout)
 from ..themes import highlight_to_char_format
 from calibre.gui2.tweak_book.widgets import BusyCursor
-class SimpleState(object):
+def run_loop(user_data, state_map, formats, text):
-
+    state = user_data.state
    def __init__(self, value):
        self.parse = value
    @property
    def value(self):
        return self.parse
 def run_loop(state, state_map, formats, text):
    i = 0
    while i < len(text):
-        fmt = state_map[state.parse](state, text, i, formats)
+        fmt = state_map[state.parse](state, text, i, formats, user_data)
        for num, f in fmt:
            yield i, num, f
            i += num
-class SyntaxHighlighter(QSyntaxHighlighter):
+class SimpleState(object):
    __slots__ = ('parse',)
    def __init__(self):
        self.parse = 0
    def copy(self):
        s = SimpleState()
        s.parse = self.parse
        return s
 class SimpleUserData(QTextBlockUserData):
    ''' Per-block user data that stores only a SimpleState in its state attribute. '''
    def __init__(self):
        QTextBlockUserData.__init__(self)
        self.state = SimpleState()
    def clear(self, state=None):
        ''' Replace the stored state with state, or with a fresh SimpleState if state is None. '''
        self.state = SimpleState() if state is None else state
 class SyntaxHighlighter(object):
    state_map = {0:lambda state, text, i, formats:[(len(text), None)]}
    create_formats_func = lambda highlighter: {}
    spell_attributes = ()
    tag_ok_for_spell = lambda x: False
    user_data_factory = SimpleUserData
-    def __init__(self, *args, **kwargs):
+    def __init__(self):
-        QSyntaxHighlighter.__init__(self, *args, **kwargs)
+        self.document_ref = lambda : None
    def create_state(self, num):
        return SimpleState(max(0, num))
    def rehighlight(self):
        QApplication.setOverrideCursor(QCursor(Qt.WaitCursor))
        QSyntaxHighlighter.rehighlight(self)
        QApplication.restoreOverrideCursor()
    def apply_theme(self, theme):
        self.theme = {k:highlight_to_char_format(v) for k, v in theme.iteritems()}
@ -53,20 +62,94 @@ class SyntaxHighlighter(QSyntaxHighlighter):
    def create_formats(self):
        self.formats = self.create_formats_func()
-    def highlightBlock(self, text):
+    def set_document(self, doc):
-        try:
+        old_doc = self.document_ref()
-            state = self.previousBlockState()
+        if old_doc is not None:
-            self.setCurrentBlockUserData(None)  # Ensure that any stale user data is discarded
+            old_doc.contentsChange.disconnect(self.reformat_blocks)
-            state = self.create_state(state)
+            c = QTextCursor(old_doc)
-            state.get_user_data, state.set_user_data = self.currentBlockUserData, self.setCurrentBlockUserData
+            c.beginEditBlock()
-            for i, num, fmt in run_loop(state, self.state_map, self.formats, unicode(text)):
+            blk = old_doc.begin()
-                if fmt is not None:
+            while blk.isValid():
-                    self.setFormat(i, num, fmt)
+                blk.layout().clearAdditionalFormats()
-            self.setCurrentBlockState(state.value)
+                blk = blk.next()
-        except:
+            c.endEditBlock()
-            import traceback
+        if doc is not None:
-            traceback.print_exc()
+            self.document_ref = weakref.ref(doc)
-        finally:
+            doc.contentsChange.connect(self.reformat_blocks)
-            # Disabled as it causes crashes
+            self.rehighlight()
-            pass  # QApplication.processEvents()  # Try to keep the editor responsive to user input
+        else:
            self.document_ref = lambda : None
    def rehighlight(self):
        ''' Re-run highlighting over the whole of the attached document. Does
        nothing when document_ref() returns None, i.e. no document is attached. '''
        doc = self.document_ref()
        if doc is None:
            return
        lb = doc.lastBlock()
        with BusyCursor():
            # Treat everything from position 0 to the end of the last block as newly added text
            self.reformat_blocks(0, 0, lb.position() + lb.length())
    def get_user_data(self, block):
        ''' Return a (user_data, new_data) tuple for block. If the block has no
        user data, an instance of user_data_factory is created and attached to
        it. new_data is True only when the user data was created by this call. '''
        ud = block.userData()
        new_data = False
        if ud is None:
            ud = self.user_data_factory()
            block.setUserData(ud)
            new_data = True
        return ud, new_data
    @pyqtSlot(int, int, int)
    def reformat_blocks(self, position, removed, added):
        ''' Recompute the highlighting of the blocks touched by a change to the
        document. Connected to the document's contentsChange signal and also
        called directly by rehighlight(). Does nothing if no document is attached.

        :param position: Document position at which the change starts
        :param removed: Number of characters removed by the change
        :param added: Number of characters added by the change
        '''
        doc = self.document_ref()
        if doc is None:
            return
        # Block containing the end of the changed region; one extra character
        # is included when text was removed
        last_block = doc.findBlock(position + added + (1 if removed > 0 else 0))
        if not last_block.isValid():
            # The computed position lies outside every block, use the end of the document
            last_block = doc.lastBlock()
        end_pos = last_block.position() + last_block.length()
        force_next_highlight = False
        # Stop listening to the signal while formats are applied, it is
        # reconnected in the finally clause below.
        # NOTE(review): presumably this keeps the changes made by
        # apply_format_changes() from re-triggering this slot -- confirm
        doc.contentsChange.disconnect(self.reformat_blocks)
        try:
            block = doc.findBlock(position)
            # Keep going past end_pos for as long as the end state of the
            # block just processed differs from what was stored for it
            while block.isValid() and (block.position() < end_pos or force_next_highlight):
                ud, new_ud = self.get_user_data(block)
                orig_state = ud.state  # State stored by an earlier run, compared against below
                pblock = block.previous()
                # This block is parsed starting from a copy of the state stored
                # on the previous block, or from a fresh default state when
                # there is no previous block or it has no user data
                if pblock.isValid():
                    start_state = pblock.userData()
                    if start_state is None:
                        start_state = self.user_data_factory().state
                    else:
                        start_state = start_state.state.copy()
                else:
                    start_state = self.user_data_factory().state
                ud.clear(state=start_state)  # Ensure no stale user data lingers
                formats = []
                # NOTE(review): run_loop() calls each state_map handler with
                # (state, text, i, formats, user_data), the default state_map
                # lambda on this class accepts only four arguments -- confirm
                for i, num, fmt in run_loop(ud, self.state_map, self.formats, unicode(block.text())):
                    if fmt is not None:
                        formats.append((i, num, fmt))
                self.apply_format_changes(doc, block, formats)
                # ud.state has now been advanced by the handlers to the state
                # at the end of this block.
                # NOTE(review): the != test needs value equality on the state
                # class; SimpleState as shown in this file defines neither
                # __eq__ nor __ne__, so two distinct instances would always
                # compare unequal here -- confirm
                force_next_highlight = new_ud or ud.state != orig_state
                block = block.next()
        finally:
            doc.contentsChange.connect(self.reformat_blocks)
    def apply_format_changes(self, doc, block, formats):
        ''' Set formats as the additional formats of the layout of block and
        mark the block's contents as dirty in doc.

        :param doc: The document that block belongs to
        :param block: The text block whose layout is updated
        :param formats: Iterable of (start, length, format) tuples, positions are relative to the start of the block's text
        '''
        layout = block.layout()
        preedit_start = layout.preeditAreaPosition()
        preedit_length = layout.preeditAreaText().length()
        ranges = []
        R = QTextLayout.FormatRange
        for i, num, fmt in formats:
            # Adjust range by pre-edit text, if any
            # NOTE(review): the guard tests the pre-edit position rather than
            # preedit_length, so a pre-edit area at position 0 is not
            # adjusted for -- confirm this is intended
            if preedit_start != 0:
                if i >= preedit_start:
                    # Range starts at or after the pre-edit area, shift it
                    i += preedit_length
                elif i + num >= preedit_start:
                    # Range reaches the pre-edit area, lengthen it
                    num += preedit_length
            r = R()
            r.start, r.length, r.format = i, num, fmt
            ranges.append(r)
        layout.setAdditionalFormats(ranges)
        doc.markContentsDirty(block.position(), block.length())
--- a/src/calibre/gui2/tweak_book/editor/syntax/css.py
+++ b/src/calibre/gui2/tweak_book/editor/syntax/css.py
@ -8,6 +8,8 @@ __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
 import re
 from PyQt4.Qt import QTextBlockUserData
 from calibre.gui2.tweak_book.editor import SyntaxTextCharFormat
 from calibre.gui2.tweak_book.editor.syntax.base import SyntaxHighlighter
@ -118,41 +120,63 @@ content_tokens = [(re.compile(k), v, n) for k, v, n in [
 ]]
-class State(object):
+NORMAL = 0
 IN_COMMENT_NORMAL = 1
 IN_SQS = 2
 IN_DQS = 3
 IN_CONTENT = 4
 IN_COMMENT_CONTENT = 5
-    NORMAL = 0
+class CSSState(object):
    IN_COMMENT_NORMAL = 1
    IN_SQS = 2
    IN_DQS = 3
    IN_CONTENT = 4
    IN_COMMENT_CONTENT = 5
-    def __init__(self, num):
+    __slots__ = ('parse', 'blocks')
        self.parse  = num & 0b1111
        self.blocks = num >> 4
-    @property
+    def __init__(self):
-    def value(self):
+        self.parse  = NORMAL
-        return ((self.parse & 0b1111) | (max(0, self.blocks) << 4))
+        self.blocks = 0
    def copy(self):
        s = CSSState()
        s.parse, s.blocks = self.parse, self.blocks
        return s
-def normal(state, text, i, formats):
+    def __eq__(self, other):
        return self.parse == getattr(other, 'parse', -1) and \
            self.blocks == getattr(other, 'blocks', -1)
    def __ne__(self, other):
        return not self.__eq__(other)
    def __repr__(self):
        return "CSSState(parse=%s, blocks=%s)" % (self.parse, self.blocks)
    __str__ = __repr__
 class CSSUserData(QTextBlockUserData):
    ''' Per-block user data that stores a CSSState in its state attribute. '''
    def __init__(self):
        QTextBlockUserData.__init__(self)
        self.state = CSSState()
    def clear(self, state=None):
        ''' Replace the stored state with state, or with a fresh CSSState if state is None. '''
        self.state = CSSState() if state is None else state
 def normal(state, text, i, formats, user_data):
    ' The normal state (outside content blocks {})'
    m = space_pat.match(text, i)
    if m is not None:
        return [(len(m.group()), None)]
    cdo = cdo_pat.match(text, i)
    if cdo is not None:
-        state.parse = State.IN_COMMENT_NORMAL
+        state.parse = IN_COMMENT_NORMAL
        return [(len(cdo.group()), formats['comment'])]
    if text[i] == '"':
-        state.parse = State.IN_DQS
+        state.parse = IN_DQS
        return [(1, formats['string'])]
    if text[i] == "'":
-        state.parse = State.IN_SQS
+        state.parse = IN_SQS
        return [(1, formats['string'])]
    if text[i] == '{':
-        state.parse = State.IN_CONTENT
+        state.parse = IN_CONTENT
        state.blocks += 1
        return [(1, formats['bracket'])]
    for token, fmt, name in sheet_tokens:
@ -162,24 +186,24 @@ def normal(state, text, i, formats):
    return [(len(text) - i, formats['unknown-normal'])]
-def content(state, text, i, formats):
+def content(state, text, i, formats, user_data):
    ' Inside content blocks '
    m = space_pat.match(text, i)
    if m is not None:
        return [(len(m.group()), None)]
    cdo = cdo_pat.match(text, i)
    if cdo is not None:
-        state.parse = State.IN_COMMENT_CONTENT
+        state.parse = IN_COMMENT_CONTENT
        return [(len(cdo.group()), formats['comment'])]
    if text[i] == '"':
-        state.parse = State.IN_DQS
+        state.parse = IN_DQS
        return [(1, formats['string'])]
    if text[i] == "'":
-        state.parse = State.IN_SQS
+        state.parse = IN_SQS
        return [(1, formats['string'])]
    if text[i] == '}':
        state.blocks -= 1
-        state.parse = State.NORMAL if state.blocks < 1 else State.IN_CONTENT
+        state.parse = NORMAL if state.blocks < 1 else IN_CONTENT
        return [(1, formats['bracket'])]
    if text[i] == '{':
        state.blocks += 1
@ -191,34 +215,34 @@ def content(state, text, i, formats):
    return [(len(text) - i, formats['unknown-normal'])]
-def comment(state, text, i, formats):
+def comment(state, text, i, formats, user_data):
    ' Inside a comment '
    pos = text.find('*/', i)
    if pos == -1:
        return [(len(text), formats['comment'])]
-    state.parse = State.NORMAL if state.parse == State.IN_COMMENT_NORMAL else State.IN_CONTENT
+    state.parse = NORMAL if state.parse == IN_COMMENT_NORMAL else IN_CONTENT
    return [(pos - i + 2, formats['comment'])]
-def in_string(state, text, i, formats):
+def in_string(state, text, i, formats, user_data):
    'Inside a string'
-    q = '"' if state.parse == State.IN_DQS else "'"
+    q = '"' if state.parse == IN_DQS else "'"
    pos = text.find(q, i)
    if pos == -1:
        if text[-1] == '\\':
            # Multi-line string
            return [(len(text) - i, formats['string'])]
-        state.parse = (State.NORMAL if state.blocks < 1 else State.IN_CONTENT)
+        state.parse = (NORMAL if state.blocks < 1 else IN_CONTENT)
        return [(len(text) - i, formats['unterminated-string'])]
-    state.parse = (State.NORMAL if state.blocks < 1 else State.IN_CONTENT)
+    state.parse = (NORMAL if state.blocks < 1 else IN_CONTENT)
    return [(pos - i + len(q), formats['string'])]
 state_map = {
-    State.NORMAL:normal,
+    NORMAL:normal,
-    State.IN_COMMENT_NORMAL: comment,
+    IN_COMMENT_NORMAL: comment,
-    State.IN_COMMENT_CONTENT: comment,
+    IN_COMMENT_CONTENT: comment,
-    State.IN_SQS: in_string,
+    IN_SQS: in_string,
-    State.IN_DQS: in_string,
+    IN_DQS: in_string,
-    State.IN_CONTENT: content,
+    IN_CONTENT: content,
 }
 def create_formats(highlighter):
@ -252,9 +276,8 @@ class CSSHighlighter(SyntaxHighlighter):
    state_map = state_map
    create_formats_func = create_formats
    user_data_factory = CSSUserData
    def create_state(self, num):
        return State(max(0, num))
 if __name__ == '__main__':
    from calibre.gui2.tweak_book.editor.widget import launch_editor
--- a/src/calibre/gui2/tweak_book/editor/syntax/html.py
+++ b/src/calibre/gui2/tweak_book/editor/syntax/html.py
@ -15,7 +15,8 @@ from PyQt4.Qt import QFont, QTextBlockUserData
 from calibre.ebooks.oeb.polish.spell import html_spell_tags, xml_spell_tags
 from calibre.gui2.tweak_book.editor import SyntaxTextCharFormat
 from calibre.gui2.tweak_book.editor.syntax.base import SyntaxHighlighter, run_loop
-from calibre.gui2.tweak_book.editor.syntax.css import create_formats as create_css_formats, state_map as css_state_map, State as CSSState
+from calibre.gui2.tweak_book.editor.syntax.css import (
    create_formats as create_css_formats, state_map as css_state_map, CSSState, CSSUserData)
 from html5lib.constants import cdataElements, rcdataElements
@ -51,41 +52,33 @@ Attr = namedtuple('Attr', 'offset type data')
 class Tag(object):
-    __slots__ = ('name', 'bold', 'italic', 'lang', 'hash')
+    __slots__ = ('name', 'bold', 'italic', 'lang')
    def __init__(self, name, bold=None, italic=None):
        self.name = name
        self.bold = name in bold_tags if bold is None else bold
        self.italic = name in italic_tags if italic is None else italic
        self.lang = None
        self.hash = 0
    def __hash__(self):
        return self.hash
    def __eq__(self, other):
        return self.name == getattr(other, 'name', None) and self.lang == getattr(other, 'lang', False)
    def copy(self):
        ans = Tag(self.name, self.bold, self.italic)
-        ans.lang, ans.hash = self.lang, self.hash
+        ans.lang = self.lang
        return ans
    def update_hash(self):
        self.hash = hash((self.name, self.lang))
 class State(object):
-    __slots__ = ('tag_being_defined', 'tags', 'is_bold', 'is_italic',
+    __slots__ = (
-                 'current_lang', 'parse', 'get_user_data', 'set_user_data',
+        'tag_being_defined', 'tags', 'is_bold', 'is_italic', 'current_lang',
-                 'css_formats', 'stack', 'sub_parser_state', 'default_lang',
+        'parse', 'css_formats', 'sub_parser_state', 'default_lang', 'attribute_name',)
                 'attribute_name',)
    def __init__(self):
        self.tags = []
        self.is_bold = self.is_italic = False
-        self.tag_being_defined = self.current_lang = self.get_user_data = self.set_user_data = \
+        self.tag_being_defined = self.current_lang =  self.css_formats = \
-            self.css_formats = self.stack = self.sub_parser_state = self.default_lang = self.attribute_name = None
+            self.sub_parser_state = self.default_lang = self.attribute_name = None
        self.parse = NORMAL
    def copy(self):
@ -95,17 +88,10 @@ class State(object):
        self.tags = [x.copy() for x in self.tags]
        if self.tag_being_defined is not None:
            self.tag_being_defined = self.tag_being_defined.copy()
        if self.sub_parser_state is not None:
            ans.sub_parser_state = self.sub_parser_state.copy()
        return ans
    @property
    def value(self):
        if self.tag_being_defined is not None:
            self.tag_being_defined.update_hash()
        return self.stack.index_for(self)
    def __hash__(self):
        return hash((self.parse, self.sub_parser_state, self.tag_being_defined, self.attribute_name, tuple(self.tags)))
    def __eq__(self, other):
        return (
            self.parse == getattr(other, 'parse', -1) and
@ -115,6 +101,9 @@ class State(object):
            self.tags == getattr(other, 'tags', None)
        )
    def __ne__(self, other):
        return not self.__eq__(other)
    def open_tag(self, name):
        self.tag_being_defined = Tag(name)
@ -128,7 +117,7 @@ class State(object):
            return  # No matching open tag found, ignore the closing tag
        # Remove all tags up to the matching open tag
        self.tags = self.tags[:-len(removed_tags)]
-        self.sub_parser_state = 0
+        self.sub_parser_state = None
        # Check if we should still be bold or italic
        if self.is_bold:
            self.is_bold = False
@ -154,71 +143,41 @@ class State(object):
        if self.tag_being_defined is None:
            return
        t, self.tag_being_defined = self.tag_being_defined, None
        t.update_hash()
        self.tags.append(t)
        self.is_bold = self.is_bold or t.bold
        self.is_italic = self.is_italic or t.italic
        self.current_lang = t.lang or self.current_lang
        if t.name in cdata_tags:
            self.parse = CSS if t.name == 'style' else CDATA
-            self.sub_parser_state = 0
+            self.sub_parser_state = None
    def __repr__(self):
        return '<State %s is_bold=%s is_italic=%s current_lang=%s>' % (
            '->'.join(x.name for x in self.tags), self.is_bold, self.is_italic, self.current_lang)
    __str__ = __repr__
 class Stack(object):
    ''' Maintain an efficient bi-directional mapping between states and index
    numbers. Ensures that if state1 == state2 then their corresponding index
    numbers are the same and vice versa. This is needed so that the state number
    passed to Qt does not change unless the underlying state has actually
    changed. '''
    def __init__(self):
        self.index_map = []  # index number -> state
        self.state_map = {}  # state -> index number
    def index_for(self, state):
        ''' Return the index number for state, assigning the next free number if the state has not been seen before. '''
        ans = self.state_map.get(state, None)
        if ans is None:
            self.state_map[state] = ans = len(self.index_map)
            self.index_map.append(state)
        return ans
    def state_for(self, index):
        ''' Return the state stored at index, or None if index is past the end of the list. '''
        try:
            return self.index_map[index]
        except IndexError:
            return None
 class HTMLUserData(QTextBlockUserData):
    def __init__(self):
        QTextBlockUserData.__init__(self)
        self.tags = []
        self.attributes = []
        self.state = State()
        self.css_user_data = None
-def add_tag_data(state, tag):
+    def clear(self, state=None):
-    ud = q = state.get_user_data()
+        self.tags, self.attributes = [], []
-    if ud is None:
+        self.state = State() if state is None else state
-        ud = HTMLUserData()
+
-    ud.tags.append(tag)
+def add_tag_data(user_data, tag):
-    if q is None:
+    user_data.tags.append(tag)
        state.set_user_data(ud)
 ATTR_NAME, ATTR_VALUE, ATTR_START, ATTR_END = object(), object(), object(), object()
-def add_attr_data(state, data_type, data, offset):
+def add_attr_data(user_data, data_type, data, offset):
-    ud = q = state.get_user_data()
+    user_data.attributes.append(Attr(offset, data_type, data))
    if ud is None:
        ud = HTMLUserData()
    ud.attributes.append(Attr(offset, data_type, data))
    if q is None:
        state.set_user_data(ud)
-def css(state, text, i, formats):
+def css(state, text, i, formats, user_data):
    ' Inside a <style> tag '
    pat = cdata_close_pats['style']
    m = pat.search(text, i)
@ -227,18 +186,18 @@ def css(state, text, i, formats):
    else:
        css_text = text[i:m.start()]
    ans = []
-    css_state = CSSState(state.sub_parser_state)
+    css_user_data = user_data.css_user_data = user_data.css_user_data or CSSUserData()
-    for j, num, fmt in run_loop(css_state, css_state_map, state.css_formats, css_text):
+    state.sub_parser_state = css_user_data.state = state.sub_parser_state or CSSState()
    for j, num, fmt in run_loop(css_user_data, css_state_map, formats['css_sub_formats'], css_text):
        ans.append((num, fmt))
    state.sub_parser_state = css_state.value
    if m is not None:
-        state.sub_parser_state = 0
+        state.sub_parser_state = None
        state.parse = IN_CLOSING_TAG
-        add_tag_data(state, TagStart(m.start(), '', 'style', True, True))
+        add_tag_data(user_data, TagStart(m.start(), '', 'style', True, True))
        ans.extend([(2, formats['end_tag']), (len(m.group()) - 2, formats['tag_name'])])
    return ans
-def cdata(state, text, i, formats):
+def cdata(state, text, i, formats, user_data):
    'CDATA inside tags like <title> or <style>'
    name = state.tags[-1].name
    pat = cdata_close_pats[name]
@ -248,7 +207,7 @@ def cdata(state, text, i, formats):
        return [(len(text) - i, fmt)]
    state.parse = IN_CLOSING_TAG
    num = m.start() - i
-    add_tag_data(state, TagStart(m.start(), '', name, True, True))
+    add_tag_data(user_data, TagStart(m.start(), '', name, True, True))
    return [(num, fmt), (2, formats['end_tag']), (len(m.group()) - 2, formats['tag_name'])]
 def mark_nbsp(state, text, nbsp_format):
@ -268,7 +227,7 @@ def mark_nbsp(state, text, nbsp_format):
        ans = [(len(text), fmt)]
    return ans
-def normal(state, text, i, formats):
+def normal(state, text, i, formats, user_data):
    ' The normal state in between tags '
    ch = text[i]
    if ch == '<':
@ -303,7 +262,7 @@ def normal(state, text, i, formats):
            ans.append((len(prefix)+1, formats['nsprefix']))
        ans.append((len(name), formats['tag_name']))
        state.parse = IN_CLOSING_TAG if closing else IN_OPENING_TAG
-        add_tag_data(state, TagStart(i, prefix, name, closing, True))
+        add_tag_data(user_data, TagStart(i, prefix, name, closing, True))
        (state.close_tag if closing else state.open_tag)(name)
        return ans
@ -319,7 +278,7 @@ def normal(state, text, i, formats):
    t = normal_pat.search(text, i).group()
    return mark_nbsp(state, t, formats['nbsp'])
-def opening_tag(cdata_tags, state, text, i, formats):
+def opening_tag(cdata_tags, state, text, i, formats, user_data):
    'An opening tag, like <a>'
    ch = text[i]
    if ch in space_chars:
@ -330,24 +289,24 @@ def opening_tag(cdata_tags, state, text, i, formats):
            return [(1, formats['/'])]
        state.parse = NORMAL
        l = len(m.group())
-        add_tag_data(state, TagEnd(i + l - 1, True, False))
+        add_tag_data(user_data, TagEnd(i + l - 1, True, False))
        return [(l, formats['tag'])]
    if ch == '>':
        state.finish_opening_tag(cdata_tags)
-        add_tag_data(state, TagEnd(i, False, False))
+        add_tag_data(user_data, TagEnd(i, False, False))
        return [(1, formats['tag'])]
    m = attribute_name_pat.match(text, i)
    if m is None:
        return [(1, formats['?'])]
    state.parse = ATTRIBUTE_NAME
    attrname = state.attribute_name = m.group()
-    add_attr_data(state, ATTR_NAME, attrname, m.start())
+    add_attr_data(user_data, ATTR_NAME, attrname, m.start())
    prefix, name = attrname.partition(':')[0::2]
    if prefix and name:
        return [(len(prefix) + 1, formats['nsprefix']), (len(name), formats['attr'])]
    return [(len(prefix), formats['attr'])]
-def attribute_name(state, text, i, formats):
+def attribute_name(state, text, i, formats, user_data):
    ' After attribute name '
    ch = text[i]
    if ch in space_chars:
@ -359,7 +318,7 @@ def attribute_name(state, text, i, formats):
    state.parse = IN_OPENING_TAG
    return [(0, None)]
-def attribute_value(state, text, i, formats):
+def attribute_value(state, text, i, formats, user_data):
    ' After attribute = '
    ch = text[i]
    if ch in space_chars:
@ -373,20 +332,20 @@ def attribute_value(state, text, i, formats):
        return [(1, formats['no-attr-value'])]
    return [(len(m.group()), formats['string'])]
-def quoted_val(state, text, i, formats):
+def quoted_val(state, text, i, formats, user_data):
    ' A quoted attribute value '
    quote = '"' if state.parse is DQ_VAL else "'"
-    add_attr_data(state, ATTR_VALUE, ATTR_START, i)
+    add_attr_data(user_data, ATTR_VALUE, ATTR_START, i)
    pos = text.find(quote, i)
    if pos == -1:
        num = len(text) - i
    else:
        num = pos - i + 1
        state.parse = IN_OPENING_TAG
-        add_attr_data(state, ATTR_VALUE, ATTR_END, i + num)
+        add_attr_data(user_data, ATTR_VALUE, ATTR_END, i + num)
    return [(num, formats['string'])]
-def closing_tag(state, text, i, formats):
+def closing_tag(state, text, i, formats, user_data):
    ' A closing tag like </a> '
    ch = text[i]
    if ch in space_chars:
@ -399,10 +358,10 @@ def closing_tag(state, text, i, formats):
    ans = [(1, formats['end_tag'])]
    if num > 1:
        ans.insert(0, (num - 1, formats['bad-closing']))
-    add_tag_data(state, TagEnd(pos, False, False))
+    add_tag_data(user_data, TagEnd(pos, False, False))
    return ans
-def in_comment(state, text, i, formats):
+def in_comment(state, text, i, formats, user_data):
    ' Comment, processing instruction or doctype '
    end = {IN_COMMENT:'-->', IN_PI:'?>'}.get(state.parse, '>')
    pos = text.find(end, i)
@ -433,7 +392,7 @@ for x in (SQ_VAL, DQ_VAL):
 xml_state_map = state_map.copy()
 xml_state_map[IN_OPENING_TAG] = partial(opening_tag, set())
-def create_formats(highlighter):
+def create_formats(highlighter, add_css=True):
    t = highlighter.theme
    formats = {
        'tag': t['Function'],
@ -463,6 +422,8 @@ def create_formats(highlighter):
        f.setToolTip(msg)
    f = formats['title'] = SyntaxTextCharFormat()
    f.setFontWeight(QFont.Bold)
    if add_css:
        formats['css_sub_formats'] = create_css_formats(highlighter)
    return formats
@ -471,18 +432,7 @@ class HTMLHighlighter(SyntaxHighlighter):
    state_map = state_map
    create_formats_func = create_formats
    spell_attributes = ('alt', 'title')
-
+    user_data_factory = HTMLUserData
    def create_formats(self):
        super(HTMLHighlighter, self).create_formats()
        self.default_state = State()
        self.default_state.css_formats = create_css_formats(self)
        self.default_state.stack = Stack()
    def create_state(self, val):
        if val < 0:
            return self.default_state.copy()
        ans = self.default_state.stack.state_for(val) or self.default_state
        return ans.copy()
    def tag_ok_for_spell(self, name):
        ' Return True if the tag name is not in html_spell_tags '
        return name not in html_spell_tags
@ -491,6 +441,7 @@ class XMLHighlighter(HTMLHighlighter):
    state_map = xml_state_map
    spell_attributes = ('opf:file-as',)
    create_formats_func = partial(create_formats, add_css=False)
    def tag_ok_for_spell(self, name):
        ' Return True if the tag name is in xml_spell_tags '
        return name in xml_spell_tags
--- a/src/calibre/gui2/tweak_book/editor/text.py
+++ b/src/calibre/gui2/tweak_book/editor/text.py
@ -135,7 +135,7 @@ class TextEdit(PlainTextEdit):
        self.smarts = NullSmarts(self)
        self.current_cursor_line = None
        self.current_search_mark = None
-        self.highlighter = SyntaxHighlighter(self)
+        self.highlighter = SyntaxHighlighter()
        self.line_number_area = LineNumbers(self)
        self.apply_settings()
        self.setMouseTracking(True)
@ -206,9 +206,9 @@ class TextEdit(PlainTextEdit):
    def load_text(self, text, syntax='html', process_template=False):
        self.syntax = syntax
-        self.highlighter = get_highlighter(syntax)(self)
+        self.highlighter = get_highlighter(syntax)()
        self.highlighter.apply_theme(self.theme)
-        self.highlighter.setDocument(self.document())
+        self.highlighter.set_document(self.document())
        sclass = {'html':HTMLSmarts, 'xml':HTMLSmarts}.get(syntax, None)
        if sclass is not None:
            self.smarts = sclass(self)