diff --git a/src/calibre/ebooks/epub/cfi/parse.py b/src/calibre/ebooks/epub/cfi/parse.py index cbcd3bf78b..5ffb9e55f9 100644 --- a/src/calibre/ebooks/epub/cfi/parse.py +++ b/src/calibre/ebooks/epub/cfi/parse.py @@ -5,11 +5,9 @@ __license__ = 'GPL v3' __copyright__ = '2014, Kovid Goyal ' -import regex, sys +import regex from polyglot.builtins import map, zip -is_narrow_build = sys.maxunicode < 0x10ffff - class Parser(object): @@ -21,10 +19,7 @@ class Parser(object): def __init__(self): # All allowed unicode characters + escaped special characters special_char = r'[\[\](),;=^]' - if is_narrow_build: - unescaped_char = '[[\t\n\r -\ud7ff\ue000-\ufffd]--%s]' % special_char - else: - unescaped_char = '[[\t\n\r -\ud7ff\ue000-\ufffd\U00010000-\U0010ffff]--%s]' % special_char + unescaped_char = '[[\t\n\r -\ud7ff\ue000-\ufffd\U00010000-\U0010ffff]--%s]' % special_char escaped_char = r'\^' + special_char chars = r'(?:%s|(?:%s))+' % (unescaped_char, escaped_char) chars_no_space = chars.replace('0020', '0021') diff --git a/src/calibre/gui2/tweak_book/editor/syntax/base.py b/src/calibre/gui2/tweak_book/editor/syntax/base.py index f8a73953b6..bae26c262c 100644 --- a/src/calibre/gui2/tweak_book/editor/syntax/base.py +++ b/src/calibre/gui2/tweak_book/editor/syntax/base.py @@ -1,11 +1,9 @@ #!/usr/bin/env python # vim:fileencoding=utf-8 - __license__ = 'GPL v3' __copyright__ = '2013, Kovid Goyal ' -import sys from collections import defaultdict, deque from PyQt5.Qt import QTextCursor, QTextBlockUserData, QTextLayout, QTimer @@ -15,13 +13,11 @@ from calibre.gui2.tweak_book.widgets import BusyCursor from calibre.utils.icu import utf16_length from polyglot.builtins import iteritems, unicode_type -is_wide_build = sys.maxunicode >= 0x10ffff - def run_loop(user_data, state_map, formats, text): state = user_data.state i = 0 - fix_offsets = is_wide_build and utf16_length(text) != len(text) + fix_offsets = utf16_length(text) != len(text) seen_states = defaultdict(set) while i < len(text): orig_i = i diff --git a/src/calibre/utils/icu.c b/src/calibre/utils/icu.c index cc7082c17a..191056b9a1 100644 --- a/src/calibre/utils/icu.c +++ b/src/calibre/utils/icu.c @@ -212,13 +212,11 @@ icu_Collator_find(icu_Collator *self, PyObject *args) { pos = usearch_first(search, &status); if (pos != USEARCH_DONE) { length = usearch_getMatchedLength(search); -#ifdef Py_UNICODE_WIDE // We have to return number of unicode characters since the string // could contain surrogate pairs which are represented as a single // character in python wide builds length = u_countChar32(b + pos, length); pos = u_countChar32(b, pos); -#endif } else pos = -1; } end: @@ -637,9 +635,7 @@ icu_BreakIterator_index(icu_BreakIterator *self, PyObject *token) { } } if (leading_hyphen && ans > -1) ans -= 1; -#ifdef Py_UNICODE_WIDE if (ans > 0) ans = u_countChar32(self->text, ans); -#endif Py_END_ALLOW_THREADS; end: @@ -723,9 +719,7 @@ do_split(icu_BreakIterator *self, int(*callback)(void*, int32_t, int32_t), void if (IS_HYPHEN_CHAR(sep)) trailing_hyphen = 1; } last_pos = p; -#if defined(Py_UNICODE_WIDE) || PY_MAJOR_VERSION > 2 unicode_code_point_count(&count_start, &last_count, &last_count32, &word_start, &sz); -#endif if (is_hyphen_sep && found_one) { sz = last_sz + sz + trailing_hyphen; last_sz = sz; @@ -1166,22 +1160,6 @@ icu_string_length(PyObject *self, PyObject *src) { static PyObject * icu_utf16_length(PyObject *self, PyObject *src) { Py_ssize_t sz = 0; -#if PY_VERSION_HEX < 0x03030000 -#ifdef Py_UNICODE_WIDE - int32_t i = 0, t = 0; - Py_UNICODE *data = NULL; -#endif - - if (!PyUnicode_Check(src)) { PyErr_SetString(PyExc_TypeError, "Must be a unicode object"); return NULL; } - sz = PyUnicode_GET_SIZE(src); -#ifdef Py_UNICODE_WIDE - data = PyUnicode_AS_UNICODE(src); - for (i = 0; i < sz; i++) { - t += (data[i] > 0xffff) ? 2 : 1; - } - sz = t; -#endif -#else Py_ssize_t unit_length, i; Py_UCS4 *data = NULL; @@ -1197,7 +1175,6 @@ icu_utf16_length(PyObject *self, PyObject *src) { } } } -#endif return Py_BuildValue("n", sz); } // }}} diff --git a/src/calibre/utils/icu.py b/src/calibre/utils/icu.py index edb6e6c662..f42448e875 100644 --- a/src/calibre/utils/icu.py +++ b/src/calibre/utils/icu.py @@ -9,8 +9,6 @@ __docformat__ = 'restructuredtext en' import sys from polyglot.builtins import filter -is_narrow_build = sys.maxunicode < 0x10ffff - # Setup code {{{ import codecs @@ -299,9 +297,7 @@ def partition_by_first_letter(items, reverse=False, key=lambda x:x): c = icu_upper(key(item) or ' ') ordnum, ordlen = collation_order(c) if last_ordnum != ordnum: - if not is_narrow_build: - ordlen = 1 - last_c = c[0:ordlen] + last_c = c[0:1] last_ordnum = ordnum try: ans[last_c].append(item) @@ -311,10 +307,10 @@ def partition_by_first_letter(items, reverse=False, key=lambda x:x): # Return the number of unicode codepoints in a string -string_length = _icu.string_length if is_narrow_build else len +string_length = len # Return the number of UTF-16 codepoints in a string -utf16_length = len if is_narrow_build else _icu.utf16_length +utf16_length = _icu.utf16_length ################################################################################ diff --git a/src/calibre/utils/icu_calibre_utils.h b/src/calibre/utils/icu_calibre_utils.h index 2834ad0327..b9fe97ed1e 100644 --- a/src/calibre/utils/icu_calibre_utils.h +++ b/src/calibre/utils/icu_calibre_utils.h @@ -22,100 +22,10 @@ #include #include -#if PY_VERSION_HEX < 0x03030000 && PY_VERSION_HEX > 0x03000000 -#error Not implemented for python 3.0 to 3.2 +#if PY_VERSION_HEX < 0x03030000 +#error Not implemented for python < 3.3 #endif -#if PY_VERSION_HEX < 0x03000000 -#define MIN(x, y) ((x)<(y)) ? (x) : (y) -#define IS_HIGH_SURROGATE(x) (0xd800 <= x && x <= 0xdbff) -#define IS_LOW_SURROGATE(x) (0xdc00 <= x && x <= 0xdfff) - -#ifndef NO_PYTHON_TO_ICU -static UChar* python_to_icu(PyObject *obj, int32_t *osz) { - UChar *ans = NULL; - Py_ssize_t sz = 0; -#ifdef Py_UNICODE_WIDE - UErrorCode status = U_ZERO_ERROR; -#endif - - if (!PyUnicode_CheckExact(obj)) { - PyErr_SetString(PyExc_TypeError, "Not a unicode string"); - goto end; - } - sz = PyUnicode_GET_SIZE(obj); - -#ifdef Py_UNICODE_WIDE -// wide build (UCS 4) - ans = (UChar*) calloc(2*(sz+1), sizeof(UChar)); // There can be no more than 2 UChars per character + ensure null termination - if (ans == NULL) { PyErr_NoMemory(); goto end; } - u_strFromUTF32WithSub(ans, (int32_t)(2*(sz+1)), osz, (UChar32*)PyUnicode_AS_UNICODE(obj), (int32_t)sz, 0xfffd, NULL, &status); - if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, u_errorName(status)); free(ans); ans = NULL; goto end; } -#else -// narrow build (UTF-16) - ans = (UChar*) malloc((sz + 1) * sizeof(UChar)); - if (ans == NULL) { PyErr_NoMemory(); goto end; } - for (Py_ssize_t i = 0; i < sz; i++) { - UChar ch = PyUnicode_AS_UNICODE(obj)[i]; - if (IS_HIGH_SURROGATE(ch)) { - if (i >= sz - 1 || !IS_LOW_SURROGATE(PyUnicode_AS_UNICODE(obj)[i+1])) ans[i] = 0xfffd; - else { ans[i] = ch; ans[i+1] = PyUnicode_AS_UNICODE(obj)[i+1]; i++; } - } else if (IS_LOW_SURROGATE(ch)) { - ans[i] = 0xfffd; - } else ans[i] = ch; - } - ans[sz] = 0; // Ensure null termination - if (osz != NULL) *osz = (int32_t)sz; -#endif -end: - return ans; -} - -#ifndef NO_PYTHON_TO_ICU32 -static UChar32* python_to_icu32(PyObject *obj, int32_t *osz) { - UChar32 *ans = NULL; - Py_ssize_t sz = 0; -#ifndef Py_UNICODE_WIDE - UErrorCode status = U_ZERO_ERROR; -#endif - - if (!PyUnicode_CheckExact(obj)) { - PyErr_SetString(PyExc_TypeError, "Not a unicode string"); - goto end; - } - - sz = PyUnicode_GET_SIZE(obj); // number of UCS2 code-points in narrow build and UCS4 code-points in wide build - ans = (UChar32*) calloc(sz+1, sizeof(UChar32)); // Ensure null termination - if (ans == NULL) { PyErr_NoMemory(); goto end; } - -#ifdef Py_UNICODE_WIDE -// wide build (UCS 4) - memcpy(ans, PyUnicode_AS_DATA(obj), MIN((sizeof(UChar32)*(sz+1)),PyUnicode_GET_DATA_SIZE(obj))); - if (osz != NULL) *osz = (int32_t)PyUnicode_GET_SIZE(obj); -#else -// narrow build (UTF-16) - u_strToUTF32(ans, (int32_t)sz + 1, osz, (UChar*)PyUnicode_AS_UNICODE(obj), (int32_t)sz, &status); - if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, u_errorName(status)); free(ans); ans = NULL; goto end; } -#endif -end: - return ans; -} -#endif - -#endif - -#ifndef NO_ICU_TO_PYTHON -static PyObject* icu_to_python(UChar *src, int32_t sz) { -#ifdef Py_UNICODE_WIDE - return PyUnicode_DecodeUTF16((char*)src, sz*sizeof(UChar), "strict", NULL); -#else - return PyUnicode_FromUnicode((Py_UNICODE*)src, sz); -#endif -} -#endif - -#else // end PY2; start PY3.3+ - static UChar* python_to_icu(PyObject *obj, int32_t *osz) { UChar *ans = NULL; Py_ssize_t sz = 0; @@ -226,5 +136,3 @@ static PyObject* icu_to_python(UChar *src, int32_t sz) { return PyUnicode_DecodeUTF16((char*) src, sz * sizeof(UChar), "replace", NULL); } #endif - -#endif // end PY3.3+ diff --git a/src/calibre/utils/icu_test.py b/src/calibre/utils/icu_test.py index ba87e155f7..2a4862b1d6 100644 --- a/src/calibre/utils/icu_test.py +++ b/src/calibre/utils/icu_test.py @@ -96,8 +96,8 @@ class TestICU(unittest.TestCase): def test_find(self): ' Test searching for substrings ' self.ae((1, 1), icu.find(b'a', b'1ab')) - self.ae((1, 1 if sys.maxunicode >= 0x10ffff else 2), icu.find('\U0001f431', 'x\U0001f431x')) - self.ae((1 if sys.maxunicode >= 0x10ffff else 2, 1), icu.find('y', '\U0001f431y')) + self.ae((1, 1), icu.find('\U0001f431', 'x\U0001f431x')) + self.ae((1, 1), icu.find('y', '\U0001f431y')) self.ae((0, 4), icu.primary_find('pena', 'peña')) for k, v in iteritems({u'pèché': u'peche', u'flüße':u'Flusse', u'Štepánek':u'ŠtepaneK'}): self.ae((1, len(k)), icu.primary_find(v, ' ' + k), 'Failed to find %s in %s' % (v, k)) @@ -170,7 +170,7 @@ class TestICU(unittest.TestCase): self.ae(split(u'-one two-'), ['-one', 'two-']) self.ae(split(u'-one a-b-c-d e'), ['-one', 'a-b-c-d', 'e']) self.ae(split(u'-one -a-b-c-d- e'), ['-one', '-a-b-c-d-', 'e']) - self.ae(split_into_words_and_positions('one \U0001f431 three'), [(0, 3), (7 if icu.is_narrow_build else 6, 5)]) + self.ae(split_into_words_and_positions('one \U0001f431 three'), [(0, 3), (6, 5)]) self.ae(count_words('a b c d e f'), 6) for needle, haystack, pos in ( ('word', 'a word b', 2), @@ -189,7 +189,7 @@ class TestICU(unittest.TestCase): ('i', 'six i', 4), ('i', '', -1), ('', '', -1), ('', 'i', -1), ('i', 'six clicks', -1), - ('i', '\U0001f431 i', (3 if icu.is_narrow_build else 2)), + ('i', '\U0001f431 i', 2), ('-a', 'b -a', 2), ('a-', 'a-b a- d', 4), ('-a-', 'b -a -a-', 5), diff --git a/src/calibre/utils/matcher.py b/src/calibre/utils/matcher.py index a91d5a6a39..dfca519ab7 100644 --- a/src/calibre/utils/matcher.py +++ b/src/calibre/utils/matcher.py @@ -310,7 +310,7 @@ def test(return_tests=False): m = Matcher([raw], scorer=CScorer) positions = next(itervalues(m(raw))) self.assertEqual( - positions, (0, 1, (2 if sys.maxunicode >= 0x10ffff else 3)) + positions, (0, 1, 2) ) if return_tests: @@ -325,14 +325,8 @@ def test(return_tests=False): TestRunner(verbosity=4) -if sys.maxunicode >= 0x10ffff: - get_char = lambda string, pos: string[pos] -else: - - def get_char(string, pos): - chs = 2 if ('\ud800' <= string[pos] <= '\udbff' - ) else 1 # UTF-16 surrogate pair in python narrow builds - return string[pos:pos + chs] +def get_char(string, pos): + return string[pos] def input_unicode(prompt):