From d337debc923bd2d4d03d8ab2a87e49a247ff52f0 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 12 May 2014 17:47:49 +0530 Subject: [PATCH] Edit book: Fix incorrect syntax highlighting on Linux if the text contains non-BMP Unicode characters. --- src/calibre/gui2/tweak_book/editor/syntax/base.py | 12 +++++++++++- src/calibre/gui2/tweak_book/editor/syntax/html.py | 1 + 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/src/calibre/gui2/tweak_book/editor/syntax/base.py b/src/calibre/gui2/tweak_book/editor/syntax/base.py index 909f1ead34..191eb06eb2 100644 --- a/src/calibre/gui2/tweak_book/editor/syntax/base.py +++ b/src/calibre/gui2/tweak_book/editor/syntax/base.py @@ -6,6 +6,7 @@ from __future__ import (unicode_literals, division, absolute_import, __license__ = 'GPL v3' __copyright__ = '2013, Kovid Goyal ' +import sys from collections import defaultdict from PyQt4.Qt import ( @@ -13,10 +14,14 @@ from PyQt4.Qt import ( from ..themes import highlight_to_char_format from calibre.gui2.tweak_book.widgets import BusyCursor +from calibre.utils.icu import utf16_length + +is_wide_build = sys.maxunicode >= 0x10ffff def run_loop(user_data, state_map, formats, text): state = user_data.state i = 0 + fix_offsets = is_wide_build and utf16_length(text) != len(text) seen_states = defaultdict(set) while i < len(text): orig_i = i @@ -24,7 +29,12 @@ def run_loop(user_data, state_map, formats, text): fmt = state_map[state.parse](state, text, i, formats, user_data) for num, f in fmt: if num > 0: - yield i, num, f + if fix_offsets: + # We need to map offsets/lengths from UCS-4 to UTF-16 in + # which non-BMP characters are two code units wide + yield utf16_length(text[:i]), utf16_length(text[i:i+num]), f + else: + yield i, num, f i += num if orig_i == i and state.parse in seen_states[i]: # Something went wrong in the syntax highlighter diff --git a/src/calibre/gui2/tweak_book/editor/syntax/html.py b/src/calibre/gui2/tweak_book/editor/syntax/html.py index 
e506d4229b..9d92394e70 100644 --- a/src/calibre/gui2/tweak_book/editor/syntax/html.py +++ b/src/calibre/gui2/tweak_book/editor/syntax/html.py @@ -486,6 +486,7 @@ if __name__ == '__main__':

Some\xa0words\xa0separated\xa0by\xa0non\u2011breaking\xa0spaces and non\u2011breaking hyphens.

+

Some non-BMP unicode text:\U0001f431\U0001f431\U0001f431

''', path_is_raw=True)