Get rid of narrow build codepaths

2025-07-09 03:04:10 -04:00 · 2019-12-03 12:02:57 +05:30 · 2019-12-03 12:02:57 +05:30 · b2efbafc18
commit b2efbafc18
parent e52848671a
7 changed files with 15 additions and 149 deletions
--- a/src/calibre/ebooks/epub/cfi/parse.py
+++ b/src/calibre/ebooks/epub/cfi/parse.py
@@ -5,11 +5,9 @@
 __license__ = 'GPL v3'
 __copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'

-import regex, sys
+import regex
 from polyglot.builtins import map, zip

-is_narrow_build = sys.maxunicode < 0x10ffff
-

 class Parser(object):

@@ -21,10 +19,7 @@ class Parser(object):
    def __init__(self):
        # All allowed unicode characters + escaped special characters
        special_char = r'[\[\](),;=^]'
-        if is_narrow_build:
-            unescaped_char = '[[\t\n\r -\ud7ff\ue000-\ufffd]--%s]' % special_char
-        else:
-            unescaped_char = '[[\t\n\r -\ud7ff\ue000-\ufffd\U00010000-\U0010ffff]--%s]' % special_char
+        unescaped_char = '[[\t\n\r -\ud7ff\ue000-\ufffd\U00010000-\U0010ffff]--%s]' % special_char
        escaped_char = r'\^' + special_char
        chars = r'(?:%s|(?:%s))+' % (unescaped_char, escaped_char)
        chars_no_space = chars.replace('0020', '0021')
--- a/src/calibre/gui2/tweak_book/editor/syntax/base.py
+++ b/src/calibre/gui2/tweak_book/editor/syntax/base.py
@@ -1,11 +1,9 @@
 #!/usr/bin/env python
 # vim:fileencoding=utf-8

-
 __license__ = 'GPL v3'
 __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'

-import sys
 from collections import defaultdict, deque

 from PyQt5.Qt import QTextCursor, QTextBlockUserData, QTextLayout, QTimer
@@ -15,13 +13,11 @@ from calibre.gui2.tweak_book.widgets import BusyCursor
 from calibre.utils.icu import utf16_length
 from polyglot.builtins import iteritems, unicode_type

-is_wide_build = sys.maxunicode >= 0x10ffff
-

 def run_loop(user_data, state_map, formats, text):
    state = user_data.state
    i = 0
-    fix_offsets = is_wide_build and utf16_length(text) != len(text)
+    fix_offsets = utf16_length(text) != len(text)
    seen_states = defaultdict(set)
    while i < len(text):
        orig_i = i
--- a/src/calibre/utils/icu.c
+++ b/src/calibre/utils/icu.c
@@ -212,13 +212,11 @@ icu_Collator_find(icu_Collator *self, PyObject *args) {
        pos = usearch_first(search, &status);
        if (pos != USEARCH_DONE) {
            length = usearch_getMatchedLength(search);
-#ifdef Py_UNICODE_WIDE
            // We have to return number of unicode characters since the string
            // could contain surrogate pairs which are represented as a single
            // character in python wide builds
            length = u_countChar32(b + pos, length);
            pos = u_countChar32(b, pos);
-#endif
        } else pos = -1;
    }
 end:
@@ -637,9 +635,7 @@ icu_BreakIterator_index(icu_BreakIterator *self, PyObject *token) {
        }
    }
    if (leading_hyphen && ans > -1) ans -= 1;
-#ifdef Py_UNICODE_WIDE
    if (ans > 0) ans = u_countChar32(self->text, ans);
-#endif
    Py_END_ALLOW_THREADS;

 end:
@@ -723,9 +719,7 @@ do_split(icu_BreakIterator *self, int(*callback)(void*, int32_t, int32_t), void
                if (IS_HYPHEN_CHAR(sep)) trailing_hyphen = 1;
            }
            last_pos = p;
-#if defined(Py_UNICODE_WIDE) || PY_MAJOR_VERSION > 2
 			unicode_code_point_count(&count_start, &last_count, &last_count32, &word_start, &sz);
-#endif
            if (is_hyphen_sep && found_one) {
                sz = last_sz + sz + trailing_hyphen;
                last_sz = sz;
@@ -1166,22 +1160,6 @@ icu_string_length(PyObject *self, PyObject *src) {
 static PyObject *
 icu_utf16_length(PyObject *self, PyObject *src) {
    Py_ssize_t sz = 0;
-#if PY_VERSION_HEX < 0x03030000
-#ifdef Py_UNICODE_WIDE
-    int32_t i = 0, t = 0;
-    Py_UNICODE *data = NULL;
-#endif
-
-    if (!PyUnicode_Check(src)) { PyErr_SetString(PyExc_TypeError, "Must be a unicode object"); return NULL; }
-    sz = PyUnicode_GET_SIZE(src);
-#ifdef Py_UNICODE_WIDE
-    data = PyUnicode_AS_UNICODE(src);
-    for (i = 0; i < sz; i++) {
-        t += (data[i] > 0xffff) ? 2 : 1;
-    }
-    sz = t;
-#endif
-#else
    Py_ssize_t unit_length, i;
    Py_UCS4 *data = NULL;

@@ -1197,7 +1175,6 @@ icu_utf16_length(PyObject *self, PyObject *src) {
            }
        }
    }
-#endif

    return Py_BuildValue("n", sz);
 } // }}}
--- a/src/calibre/utils/icu.py
+++ b/src/calibre/utils/icu.py
@@ -9,8 +9,6 @@ __docformat__ = 'restructuredtext en'
 import sys
 from polyglot.builtins import filter

-is_narrow_build = sys.maxunicode < 0x10ffff
-
 # Setup code {{{
 import codecs

@@ -299,9 +297,7 @@ def partition_by_first_letter(items, reverse=False, key=lambda x:x):
        c = icu_upper(key(item) or ' ')
        ordnum, ordlen = collation_order(c)
        if last_ordnum != ordnum:
-            if not is_narrow_build:
-                ordlen = 1
-            last_c = c[0:ordlen]
+            last_c = c[0:1]
            last_ordnum = ordnum
        try:
            ans[last_c].append(item)
@@ -311,10 +307,10 @@ def partition_by_first_letter(items, reverse=False, key=lambda x:x):


 # Return the number of unicode codepoints in a string
-string_length = _icu.string_length if is_narrow_build else len
+string_length = len

 # Return the number of UTF-16 codepoints in a string
-utf16_length = len if is_narrow_build else _icu.utf16_length
+utf16_length = _icu.utf16_length

 ################################################################################

--- a/src/calibre/utils/icu_calibre_utils.h
+++ b/src/calibre/utils/icu_calibre_utils.h
@@ -22,100 +22,10 @@
 #include <unicode/unorm2.h>
 #include <unicode/ubrk.h>

-#if PY_VERSION_HEX < 0x03030000 && PY_VERSION_HEX > 0x03000000
-#error Not implemented for python 3.0 to 3.2
+#if PY_VERSION_HEX < 0x03030000
+#error Not implemented for python < 3.3
 #endif

-#if PY_VERSION_HEX < 0x03000000
-#define MIN(x, y) ((x)<(y)) ? (x) : (y)
-#define IS_HIGH_SURROGATE(x) (0xd800 <= x && x <= 0xdbff)
-#define IS_LOW_SURROGATE(x) (0xdc00 <= x && x <= 0xdfff)
-
-#ifndef NO_PYTHON_TO_ICU
-static UChar* python_to_icu(PyObject *obj, int32_t *osz) {
-    UChar *ans = NULL;
-    Py_ssize_t sz = 0;
-#ifdef Py_UNICODE_WIDE
-    UErrorCode status = U_ZERO_ERROR;
-#endif
-
-    if (!PyUnicode_CheckExact(obj)) {
-        PyErr_SetString(PyExc_TypeError, "Not a unicode string");
-        goto end;
-    }
-    sz = PyUnicode_GET_SIZE(obj);
-
-#ifdef Py_UNICODE_WIDE
-// wide build (UCS 4)
-    ans = (UChar*) calloc(2*(sz+1), sizeof(UChar)); // There can be no more than 2 UChars per character + ensure null termination
-    if (ans == NULL) { PyErr_NoMemory(); goto end; }
-    u_strFromUTF32WithSub(ans, (int32_t)(2*(sz+1)), osz, (UChar32*)PyUnicode_AS_UNICODE(obj), (int32_t)sz, 0xfffd, NULL, &status);
-    if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, u_errorName(status)); free(ans); ans = NULL; goto end; }
-#else
-// narrow build (UTF-16)
-    ans = (UChar*) malloc((sz + 1) * sizeof(UChar));
-    if (ans == NULL) { PyErr_NoMemory(); goto end; }
-    for (Py_ssize_t i = 0; i < sz; i++) {
-        UChar ch = PyUnicode_AS_UNICODE(obj)[i];
-        if (IS_HIGH_SURROGATE(ch)) {
-            if (i >= sz - 1 || !IS_LOW_SURROGATE(PyUnicode_AS_UNICODE(obj)[i+1])) ans[i] = 0xfffd;
-            else { ans[i] = ch; ans[i+1] = PyUnicode_AS_UNICODE(obj)[i+1]; i++; }
-        } else if (IS_LOW_SURROGATE(ch)) {
-            ans[i] = 0xfffd;
-        } else ans[i] = ch;
-    }
-    ans[sz] = 0; // Ensure null termination
-    if (osz != NULL) *osz = (int32_t)sz;
-#endif
-end:
-    return ans;
-}
-
-#ifndef NO_PYTHON_TO_ICU32
-static UChar32* python_to_icu32(PyObject *obj, int32_t *osz) {
-    UChar32 *ans = NULL;
-    Py_ssize_t sz = 0;
-#ifndef Py_UNICODE_WIDE
-    UErrorCode status = U_ZERO_ERROR;
-#endif
-
-    if (!PyUnicode_CheckExact(obj)) {
-        PyErr_SetString(PyExc_TypeError, "Not a unicode string");
-        goto end;
-    }
-
-    sz = PyUnicode_GET_SIZE(obj);  // number of UCS2 code-points in narrow build and UCS4 code-points in wide build
-    ans = (UChar32*) calloc(sz+1, sizeof(UChar32));  // Ensure null termination
-    if (ans == NULL) { PyErr_NoMemory(); goto end; }
-
-#ifdef Py_UNICODE_WIDE
-// wide build (UCS 4)
-    memcpy(ans, PyUnicode_AS_DATA(obj), MIN((sizeof(UChar32)*(sz+1)),PyUnicode_GET_DATA_SIZE(obj)));
-    if (osz != NULL) *osz = (int32_t)PyUnicode_GET_SIZE(obj);
-#else
-// narrow build (UTF-16)
-    u_strToUTF32(ans, (int32_t)sz + 1, osz, (UChar*)PyUnicode_AS_UNICODE(obj), (int32_t)sz, &status);
-    if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, u_errorName(status)); free(ans); ans = NULL; goto end; }
-#endif
-end:
-    return ans;
-}
-#endif
-
-#endif
-
-#ifndef NO_ICU_TO_PYTHON
-static PyObject* icu_to_python(UChar *src, int32_t sz) {
-#ifdef Py_UNICODE_WIDE
-    return PyUnicode_DecodeUTF16((char*)src, sz*sizeof(UChar), "strict", NULL);
-#else
-    return PyUnicode_FromUnicode((Py_UNICODE*)src, sz);
-#endif
-}
-#endif
-
-#else  // end PY2; start PY3.3+
-
 static UChar* python_to_icu(PyObject *obj, int32_t *osz) {
    UChar *ans = NULL;
    Py_ssize_t sz = 0;
@@ -226,5 +136,3 @@ static PyObject* icu_to_python(UChar *src, int32_t sz) {
    return PyUnicode_DecodeUTF16((char*) src, sz * sizeof(UChar), "replace", NULL);
 }
 #endif
-
-#endif  // end PY3.3+
--- a/src/calibre/utils/icu_test.py
+++ b/src/calibre/utils/icu_test.py
@@ -96,8 +96,8 @@ class TestICU(unittest.TestCase):
    def test_find(self):
        ' Test searching for substrings '
        self.ae((1, 1), icu.find(b'a', b'1ab'))
-        self.ae((1, 1 if sys.maxunicode >= 0x10ffff else 2), icu.find('\U0001f431', 'x\U0001f431x'))
-        self.ae((1 if sys.maxunicode >= 0x10ffff else 2, 1), icu.find('y', '\U0001f431y'))
+        self.ae((1, 1), icu.find('\U0001f431', 'x\U0001f431x'))
+        self.ae((1, 1), icu.find('y', '\U0001f431y'))
        self.ae((0, 4), icu.primary_find('pena', 'peña'))
        for k, v in iteritems({u'pèché': u'peche', u'flüße':u'Flusse', u'Štepánek':u'ŠtepaneK'}):
            self.ae((1, len(k)), icu.primary_find(v, ' ' + k), 'Failed to find %s in %s' % (v, k))
@@ -170,7 +170,7 @@ class TestICU(unittest.TestCase):
        self.ae(split(u'-one two-'), ['-one', 'two-'])
        self.ae(split(u'-one a-b-c-d e'), ['-one', 'a-b-c-d', 'e'])
        self.ae(split(u'-one -a-b-c-d- e'), ['-one', '-a-b-c-d-', 'e'])
-        self.ae(split_into_words_and_positions('one \U0001f431 three'), [(0, 3), (7 if icu.is_narrow_build else 6, 5)])
+        self.ae(split_into_words_and_positions('one \U0001f431 three'), [(0, 3), (6, 5)])
        self.ae(count_words('a b c d e f'), 6)
        for needle, haystack, pos in (
                ('word', 'a word b', 2),
@@ -189,7 +189,7 @@ class TestICU(unittest.TestCase):
                ('i', 'six i', 4),
                ('i', '', -1), ('', '', -1), ('', 'i', -1),
                ('i', 'six clicks', -1),
-                ('i', '\U0001f431 i', (3 if icu.is_narrow_build else 2)),
+                ('i', '\U0001f431 i', 2),
                ('-a', 'b -a', 2),
                ('a-', 'a-b a- d', 4),
                ('-a-', 'b -a -a-', 5),
--- a/src/calibre/utils/matcher.py
+++ b/src/calibre/utils/matcher.py
@@ -310,7 +310,7 @@ def test(return_tests=False):
            m = Matcher([raw], scorer=CScorer)
            positions = next(itervalues(m(raw)))
            self.assertEqual(
-                positions, (0, 1, (2 if sys.maxunicode >= 0x10ffff else 3))
+                positions, (0, 1, 2)
            )

    if return_tests:
@@ -325,14 +325,8 @@ def test(return_tests=False):
    TestRunner(verbosity=4)


-if sys.maxunicode >= 0x10ffff:
-    get_char = lambda string, pos: string[pos]
-else:
-
-    def get_char(string, pos):
-        chs = 2 if ('\ud800' <= string[pos] <= '\udbff'
-                    ) else 1  # UTF-16 surrogate pair in python narrow builds
-        return string[pos:pos + chs]
+def get_char(string, pos):
+    return string[pos]


 def input_unicode(prompt):