mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Get rid of narrow build codepaths
This commit is contained in:
parent
e52848671a
commit
b2efbafc18
@ -5,11 +5,9 @@
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
import regex, sys
|
||||
import regex
|
||||
from polyglot.builtins import map, zip
|
||||
|
||||
is_narrow_build = sys.maxunicode < 0x10ffff
|
||||
|
||||
|
||||
class Parser(object):
|
||||
|
||||
@ -21,10 +19,7 @@ class Parser(object):
|
||||
def __init__(self):
|
||||
# All allowed unicode characters + escaped special characters
|
||||
special_char = r'[\[\](),;=^]'
|
||||
if is_narrow_build:
|
||||
unescaped_char = '[[\t\n\r -\ud7ff\ue000-\ufffd]--%s]' % special_char
|
||||
else:
|
||||
unescaped_char = '[[\t\n\r -\ud7ff\ue000-\ufffd\U00010000-\U0010ffff]--%s]' % special_char
|
||||
unescaped_char = '[[\t\n\r -\ud7ff\ue000-\ufffd\U00010000-\U0010ffff]--%s]' % special_char
|
||||
escaped_char = r'\^' + special_char
|
||||
chars = r'(?:%s|(?:%s))+' % (unescaped_char, escaped_char)
|
||||
chars_no_space = chars.replace('0020', '0021')
|
||||
|
@ -1,11 +1,9 @@
|
||||
#!/usr/bin/env python
|
||||
# vim:fileencoding=utf-8
|
||||
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
import sys
|
||||
from collections import defaultdict, deque
|
||||
|
||||
from PyQt5.Qt import QTextCursor, QTextBlockUserData, QTextLayout, QTimer
|
||||
@ -15,13 +13,11 @@ from calibre.gui2.tweak_book.widgets import BusyCursor
|
||||
from calibre.utils.icu import utf16_length
|
||||
from polyglot.builtins import iteritems, unicode_type
|
||||
|
||||
is_wide_build = sys.maxunicode >= 0x10ffff
|
||||
|
||||
|
||||
def run_loop(user_data, state_map, formats, text):
|
||||
state = user_data.state
|
||||
i = 0
|
||||
fix_offsets = is_wide_build and utf16_length(text) != len(text)
|
||||
fix_offsets = utf16_length(text) != len(text)
|
||||
seen_states = defaultdict(set)
|
||||
while i < len(text):
|
||||
orig_i = i
|
||||
|
@ -212,13 +212,11 @@ icu_Collator_find(icu_Collator *self, PyObject *args) {
|
||||
pos = usearch_first(search, &status);
|
||||
if (pos != USEARCH_DONE) {
|
||||
length = usearch_getMatchedLength(search);
|
||||
#ifdef Py_UNICODE_WIDE
|
||||
// We have to return number of unicode characters since the string
|
||||
// could contain surrogate pairs which are represented as a single
|
||||
// character in python wide builds
|
||||
length = u_countChar32(b + pos, length);
|
||||
pos = u_countChar32(b, pos);
|
||||
#endif
|
||||
} else pos = -1;
|
||||
}
|
||||
end:
|
||||
@ -637,9 +635,7 @@ icu_BreakIterator_index(icu_BreakIterator *self, PyObject *token) {
|
||||
}
|
||||
}
|
||||
if (leading_hyphen && ans > -1) ans -= 1;
|
||||
#ifdef Py_UNICODE_WIDE
|
||||
if (ans > 0) ans = u_countChar32(self->text, ans);
|
||||
#endif
|
||||
Py_END_ALLOW_THREADS;
|
||||
|
||||
end:
|
||||
@ -723,9 +719,7 @@ do_split(icu_BreakIterator *self, int(*callback)(void*, int32_t, int32_t), void
|
||||
if (IS_HYPHEN_CHAR(sep)) trailing_hyphen = 1;
|
||||
}
|
||||
last_pos = p;
|
||||
#if defined(Py_UNICODE_WIDE) || PY_MAJOR_VERSION > 2
|
||||
unicode_code_point_count(&count_start, &last_count, &last_count32, &word_start, &sz);
|
||||
#endif
|
||||
if (is_hyphen_sep && found_one) {
|
||||
sz = last_sz + sz + trailing_hyphen;
|
||||
last_sz = sz;
|
||||
@ -1166,22 +1160,6 @@ icu_string_length(PyObject *self, PyObject *src) {
|
||||
static PyObject *
|
||||
icu_utf16_length(PyObject *self, PyObject *src) {
|
||||
Py_ssize_t sz = 0;
|
||||
#if PY_VERSION_HEX < 0x03030000
|
||||
#ifdef Py_UNICODE_WIDE
|
||||
int32_t i = 0, t = 0;
|
||||
Py_UNICODE *data = NULL;
|
||||
#endif
|
||||
|
||||
if (!PyUnicode_Check(src)) { PyErr_SetString(PyExc_TypeError, "Must be a unicode object"); return NULL; }
|
||||
sz = PyUnicode_GET_SIZE(src);
|
||||
#ifdef Py_UNICODE_WIDE
|
||||
data = PyUnicode_AS_UNICODE(src);
|
||||
for (i = 0; i < sz; i++) {
|
||||
t += (data[i] > 0xffff) ? 2 : 1;
|
||||
}
|
||||
sz = t;
|
||||
#endif
|
||||
#else
|
||||
Py_ssize_t unit_length, i;
|
||||
Py_UCS4 *data = NULL;
|
||||
|
||||
@ -1197,7 +1175,6 @@ icu_utf16_length(PyObject *self, PyObject *src) {
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
return Py_BuildValue("n", sz);
|
||||
} // }}}
|
||||
|
@ -9,8 +9,6 @@ __docformat__ = 'restructuredtext en'
|
||||
import sys
|
||||
from polyglot.builtins import filter
|
||||
|
||||
is_narrow_build = sys.maxunicode < 0x10ffff
|
||||
|
||||
# Setup code {{{
|
||||
import codecs
|
||||
|
||||
@ -299,9 +297,7 @@ def partition_by_first_letter(items, reverse=False, key=lambda x:x):
|
||||
c = icu_upper(key(item) or ' ')
|
||||
ordnum, ordlen = collation_order(c)
|
||||
if last_ordnum != ordnum:
|
||||
if not is_narrow_build:
|
||||
ordlen = 1
|
||||
last_c = c[0:ordlen]
|
||||
last_c = c[0:1]
|
||||
last_ordnum = ordnum
|
||||
try:
|
||||
ans[last_c].append(item)
|
||||
@ -311,10 +307,10 @@ def partition_by_first_letter(items, reverse=False, key=lambda x:x):
|
||||
|
||||
|
||||
# Return the number of unicode codepoints in a string
|
||||
string_length = _icu.string_length if is_narrow_build else len
|
||||
string_length = len
|
||||
|
||||
# Return the number of UTF-16 code units in a string (non-BMP characters count as two)
|
||||
utf16_length = len if is_narrow_build else _icu.utf16_length
|
||||
utf16_length = _icu.utf16_length
|
||||
|
||||
################################################################################
|
||||
|
||||
|
@ -22,100 +22,10 @@
|
||||
#include <unicode/unorm2.h>
|
||||
#include <unicode/ubrk.h>
|
||||
|
||||
#if PY_VERSION_HEX < 0x03030000 && PY_VERSION_HEX > 0x03000000
|
||||
#error Not implemented for python 3.0 to 3.2
|
||||
#if PY_VERSION_HEX < 0x03030000
|
||||
#error Not implemented for python < 3.3
|
||||
#endif
|
||||
|
||||
#if PY_VERSION_HEX < 0x03000000
|
||||
#define MIN(x, y) ((x)<(y)) ? (x) : (y)
|
||||
#define IS_HIGH_SURROGATE(x) (0xd800 <= x && x <= 0xdbff)
|
||||
#define IS_LOW_SURROGATE(x) (0xdc00 <= x && x <= 0xdfff)
|
||||
|
||||
#ifndef NO_PYTHON_TO_ICU
|
||||
static UChar* python_to_icu(PyObject *obj, int32_t *osz) {
|
||||
UChar *ans = NULL;
|
||||
Py_ssize_t sz = 0;
|
||||
#ifdef Py_UNICODE_WIDE
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
#endif
|
||||
|
||||
if (!PyUnicode_CheckExact(obj)) {
|
||||
PyErr_SetString(PyExc_TypeError, "Not a unicode string");
|
||||
goto end;
|
||||
}
|
||||
sz = PyUnicode_GET_SIZE(obj);
|
||||
|
||||
#ifdef Py_UNICODE_WIDE
|
||||
// wide build (UCS 4)
|
||||
ans = (UChar*) calloc(2*(sz+1), sizeof(UChar)); // There can be no more than 2 UChars per character + ensure null termination
|
||||
if (ans == NULL) { PyErr_NoMemory(); goto end; }
|
||||
u_strFromUTF32WithSub(ans, (int32_t)(2*(sz+1)), osz, (UChar32*)PyUnicode_AS_UNICODE(obj), (int32_t)sz, 0xfffd, NULL, &status);
|
||||
if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, u_errorName(status)); free(ans); ans = NULL; goto end; }
|
||||
#else
|
||||
// narrow build (UTF-16)
|
||||
ans = (UChar*) malloc((sz + 1) * sizeof(UChar));
|
||||
if (ans == NULL) { PyErr_NoMemory(); goto end; }
|
||||
for (Py_ssize_t i = 0; i < sz; i++) {
|
||||
UChar ch = PyUnicode_AS_UNICODE(obj)[i];
|
||||
if (IS_HIGH_SURROGATE(ch)) {
|
||||
if (i >= sz - 1 || !IS_LOW_SURROGATE(PyUnicode_AS_UNICODE(obj)[i+1])) ans[i] = 0xfffd;
|
||||
else { ans[i] = ch; ans[i+1] = PyUnicode_AS_UNICODE(obj)[i+1]; i++; }
|
||||
} else if (IS_LOW_SURROGATE(ch)) {
|
||||
ans[i] = 0xfffd;
|
||||
} else ans[i] = ch;
|
||||
}
|
||||
ans[sz] = 0; // Ensure null termination
|
||||
if (osz != NULL) *osz = (int32_t)sz;
|
||||
#endif
|
||||
end:
|
||||
return ans;
|
||||
}
|
||||
|
||||
#ifndef NO_PYTHON_TO_ICU32
|
||||
static UChar32* python_to_icu32(PyObject *obj, int32_t *osz) {
|
||||
UChar32 *ans = NULL;
|
||||
Py_ssize_t sz = 0;
|
||||
#ifndef Py_UNICODE_WIDE
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
#endif
|
||||
|
||||
if (!PyUnicode_CheckExact(obj)) {
|
||||
PyErr_SetString(PyExc_TypeError, "Not a unicode string");
|
||||
goto end;
|
||||
}
|
||||
|
||||
sz = PyUnicode_GET_SIZE(obj); // number of UCS2 code-points in narrow build and UCS4 code-points in wide build
|
||||
ans = (UChar32*) calloc(sz+1, sizeof(UChar32)); // Ensure null termination
|
||||
if (ans == NULL) { PyErr_NoMemory(); goto end; }
|
||||
|
||||
#ifdef Py_UNICODE_WIDE
|
||||
// wide build (UCS 4)
|
||||
memcpy(ans, PyUnicode_AS_DATA(obj), MIN((sizeof(UChar32)*(sz+1)),PyUnicode_GET_DATA_SIZE(obj)));
|
||||
if (osz != NULL) *osz = (int32_t)PyUnicode_GET_SIZE(obj);
|
||||
#else
|
||||
// narrow build (UTF-16)
|
||||
u_strToUTF32(ans, (int32_t)sz + 1, osz, (UChar*)PyUnicode_AS_UNICODE(obj), (int32_t)sz, &status);
|
||||
if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, u_errorName(status)); free(ans); ans = NULL; goto end; }
|
||||
#endif
|
||||
end:
|
||||
return ans;
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#ifndef NO_ICU_TO_PYTHON
|
||||
static PyObject* icu_to_python(UChar *src, int32_t sz) {
|
||||
#ifdef Py_UNICODE_WIDE
|
||||
return PyUnicode_DecodeUTF16((char*)src, sz*sizeof(UChar), "strict", NULL);
|
||||
#else
|
||||
return PyUnicode_FromUnicode((Py_UNICODE*)src, sz);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
#else // end PY2; start PY3.3+
|
||||
|
||||
static UChar* python_to_icu(PyObject *obj, int32_t *osz) {
|
||||
UChar *ans = NULL;
|
||||
Py_ssize_t sz = 0;
|
||||
@ -226,5 +136,3 @@ static PyObject* icu_to_python(UChar *src, int32_t sz) {
|
||||
return PyUnicode_DecodeUTF16((char*) src, sz * sizeof(UChar), "replace", NULL);
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // end PY3.3+
|
||||
|
@ -96,8 +96,8 @@ class TestICU(unittest.TestCase):
|
||||
def test_find(self):
|
||||
' Test searching for substrings '
|
||||
self.ae((1, 1), icu.find(b'a', b'1ab'))
|
||||
self.ae((1, 1 if sys.maxunicode >= 0x10ffff else 2), icu.find('\U0001f431', 'x\U0001f431x'))
|
||||
self.ae((1 if sys.maxunicode >= 0x10ffff else 2, 1), icu.find('y', '\U0001f431y'))
|
||||
self.ae((1, 1), icu.find('\U0001f431', 'x\U0001f431x'))
|
||||
self.ae((1, 1), icu.find('y', '\U0001f431y'))
|
||||
self.ae((0, 4), icu.primary_find('pena', 'peña'))
|
||||
for k, v in iteritems({u'pèché': u'peche', u'flüße':u'Flusse', u'Štepánek':u'ŠtepaneK'}):
|
||||
self.ae((1, len(k)), icu.primary_find(v, ' ' + k), 'Failed to find %s in %s' % (v, k))
|
||||
@ -170,7 +170,7 @@ class TestICU(unittest.TestCase):
|
||||
self.ae(split(u'-one two-'), ['-one', 'two-'])
|
||||
self.ae(split(u'-one a-b-c-d e'), ['-one', 'a-b-c-d', 'e'])
|
||||
self.ae(split(u'-one -a-b-c-d- e'), ['-one', '-a-b-c-d-', 'e'])
|
||||
self.ae(split_into_words_and_positions('one \U0001f431 three'), [(0, 3), (7 if icu.is_narrow_build else 6, 5)])
|
||||
self.ae(split_into_words_and_positions('one \U0001f431 three'), [(0, 3), (6, 5)])
|
||||
self.ae(count_words('a b c d e f'), 6)
|
||||
for needle, haystack, pos in (
|
||||
('word', 'a word b', 2),
|
||||
@ -189,7 +189,7 @@ class TestICU(unittest.TestCase):
|
||||
('i', 'six i', 4),
|
||||
('i', '', -1), ('', '', -1), ('', 'i', -1),
|
||||
('i', 'six clicks', -1),
|
||||
('i', '\U0001f431 i', (3 if icu.is_narrow_build else 2)),
|
||||
('i', '\U0001f431 i', 2),
|
||||
('-a', 'b -a', 2),
|
||||
('a-', 'a-b a- d', 4),
|
||||
('-a-', 'b -a -a-', 5),
|
||||
|
@ -310,7 +310,7 @@ def test(return_tests=False):
|
||||
m = Matcher([raw], scorer=CScorer)
|
||||
positions = next(itervalues(m(raw)))
|
||||
self.assertEqual(
|
||||
positions, (0, 1, (2 if sys.maxunicode >= 0x10ffff else 3))
|
||||
positions, (0, 1, 2)
|
||||
)
|
||||
|
||||
if return_tests:
|
||||
@ -325,14 +325,8 @@ def test(return_tests=False):
|
||||
TestRunner(verbosity=4)
|
||||
|
||||
|
||||
if sys.maxunicode >= 0x10ffff:
|
||||
get_char = lambda string, pos: string[pos]
|
||||
else:
|
||||
|
||||
def get_char(string, pos):
|
||||
chs = 2 if ('\ud800' <= string[pos] <= '\udbff'
|
||||
) else 1 # UTF-16 surrogate pair in python narrow builds
|
||||
return string[pos:pos + chs]
|
||||
def get_char(string, pos):
|
||||
return string[pos]
|
||||
|
||||
|
||||
def input_unicode(prompt):
|
||||
|
Loading…
x
Reference in New Issue
Block a user