Edit book: Fix incorrect syntax highlighting on Linux if the text contains non-BMP Unicode characters.

2025-07-09 03:04:10 -04:00 · 2014-05-12 17:47:49 +05:30 · 2014-05-12 17:47:49 +05:30 · d337debc92
commit d337debc92
parent 7ee75a8775
2 changed files with 12 additions and 1 deletions
--- a/src/calibre/gui2/tweak_book/editor/syntax/base.py
+++ b/src/calibre/gui2/tweak_book/editor/syntax/base.py
@ -6,6 +6,7 @@ from __future__ import (unicode_literals, division, absolute_import,
 __license__ = 'GPL v3'
 __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
 import sys
 from collections import defaultdict
 from PyQt4.Qt import (
@ -13,10 +14,14 @@ from PyQt4.Qt import (
 from ..themes import highlight_to_char_format
 from calibre.gui2.tweak_book.widgets import BusyCursor
 from calibre.utils.icu import utf16_length
 is_wide_build = sys.maxunicode >= 0x10ffff
 def run_loop(user_data, state_map, formats, text):
    state = user_data.state
    i = 0
    fix_offsets = is_wide_build and utf16_length(text) != len(text)
    seen_states = defaultdict(set)
    while i < len(text):
        orig_i = i
@ -24,7 +29,12 @@ def run_loop(user_data, state_map, formats, text):
        fmt = state_map[state.parse](state, text, i, formats, user_data)
        for num, f in fmt:
            if num > 0:
-                yield i, num, f
+                if fix_offsets:
                    # We need to map offsets/lengths from UCS-4 to UTF-16 in
                    # which non-BMP characters are two code units (a surrogate pair) wide
                    yield utf16_length(text[:i]), utf16_length(text[i:i+num]), f
                else:
                    yield i, num, f
                i += num
        if orig_i == i and state.parse in seen_states[i]:
            # Something went wrong in the syntax highlighter
--- a/src/calibre/gui2/tweak_book/editor/syntax/html.py
+++ b/src/calibre/gui2/tweak_book/editor/syntax/html.py
@ -486,6 +486,7 @@ if __name__ == '__main__':
        <input disabled><input disabled /><span attr=<></span>
        <!-- Non-breaking spaces are rendered differently from normal spaces, so that they stand out -->
        <p>Some\xa0words\xa0separated\xa0by\xa0non\u2011breaking\xa0spaces and non\u2011breaking hyphens.</p>
        <p>Some non-BMP unicode text:\U0001f431\U0001f431\U0001f431</p>
    </body>
 </html>
 ''', path_is_raw=True)