Edit book: Fix incorrect syntax highlighting on Linux if the text contains non-BMP Unicode characters.

This commit is contained in:
Kovid Goyal 2014-05-12 17:47:49 +05:30
parent 7ee75a8775
commit d337debc92
2 changed files with 12 additions and 1 deletions

View File

@ -6,6 +6,7 @@ from __future__ import (unicode_literals, division, absolute_import,
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>' __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import sys
from collections import defaultdict from collections import defaultdict
from PyQt4.Qt import ( from PyQt4.Qt import (
@ -13,10 +14,14 @@ from PyQt4.Qt import (
from ..themes import highlight_to_char_format from ..themes import highlight_to_char_format
from calibre.gui2.tweak_book.widgets import BusyCursor from calibre.gui2.tweak_book.widgets import BusyCursor
from calibre.utils.icu import utf16_length
is_wide_build = sys.maxunicode >= 0x10ffff
def run_loop(user_data, state_map, formats, text): def run_loop(user_data, state_map, formats, text):
state = user_data.state state = user_data.state
i = 0 i = 0
fix_offsets = is_wide_build and utf16_length(text) != len(text)
seen_states = defaultdict(set) seen_states = defaultdict(set)
while i < len(text): while i < len(text):
orig_i = i orig_i = i
@ -24,7 +29,12 @@ def run_loop(user_data, state_map, formats, text):
fmt = state_map[state.parse](state, text, i, formats, user_data) fmt = state_map[state.parse](state, text, i, formats, user_data)
for num, f in fmt: for num, f in fmt:
if num > 0: if num > 0:
yield i, num, f if fix_offsets:
# We need to map offsets/lengths from UCS-4 to UTF-16 in
# which non-BMP characters are two code units wide
yield utf16_length(text[:i]), utf16_length(text[i:i+num]), f
else:
yield i, num, f
i += num i += num
if orig_i == i and state.parse in seen_states[i]: if orig_i == i and state.parse in seen_states[i]:
# Something went wrong in the syntax highlighter # Something went wrong in the syntax highlighter

View File

@ -486,6 +486,7 @@ if __name__ == '__main__':
<input disabled><input disabled /><span attr=<></span> <input disabled><input disabled /><span attr=<></span>
<!-- Non-breaking spaces are rendered differently from normal spaces, so that they stand out --> <!-- Non-breaking spaces are rendered differently from normal spaces, so that they stand out -->
<p>Some\xa0words\xa0separated\xa0by\xa0non\u2011breaking\xa0spaces and non\u2011breaking hyphens.</p> <p>Some\xa0words\xa0separated\xa0by\xa0non\u2011breaking\xa0spaces and non\u2011breaking hyphens.</p>
<p>Some non-BMP unicode text:\U0001f431\U0001f431\U0001f431</p>
</body> </body>
</html> </html>
''', path_is_raw=True) ''', path_is_raw=True)