diff --git a/src/calibre/utils/icu.c b/src/calibre/utils/icu.c
index d556115c45..34649afa3f 100644
--- a/src/calibre/utils/icu.c
+++ b/src/calibre/utils/icu.c
@@ -191,6 +191,9 @@ end:
 // Collator.find {{{
 static PyObject *
 icu_Collator_find(icu_Collator *self, PyObject *args, PyObject *kwargs) {
+#if PY_VERSION_HEX >= 0x03030000
+#error Not implemented for python >= 3.3
+#endif
     PyObject *a_ = NULL, *b_ = NULL;
     UChar *a = NULL, *b = NULL;
     int32_t asz = 0, bsz = 0, pos = -1, length = -1;
@@ -207,10 +210,16 @@ icu_Collator_find(icu_Collator *self, PyObject *args, PyObject *kwargs) {
     search = usearch_openFromCollator(a, asz, b, bsz, self->collator, NULL, &status);
     if (U_SUCCESS(status)) {
         pos = usearch_first(search, &status);
-        if (pos != USEARCH_DONE)
+        if (pos != USEARCH_DONE) {
             length = usearch_getMatchedLength(search);
-        else
-            pos = -1;
+#ifdef Py_UNICODE_WIDE
+            // usearch gives pos/length in UTF-16 code units, but python wide
+            // builds count a surrogate pair as one character, so convert both to
+            // code point counts (length first, while pos is still a UChar offset)
+            length = u_countChar32(b + pos, length);
+            pos = u_countChar32(b, pos);
+#endif
+        } else pos = -1;
     }
 end:
     if (search != NULL) usearch_close(search);
diff --git a/src/calibre/utils/icu_test.py b/src/calibre/utils/icu_test.py
index e96397e86a..d6d5f557f4 100644
--- a/src/calibre/utils/icu_test.py
+++ b/src/calibre/utils/icu_test.py
@@ -92,7 +92,8 @@ class TestICU(unittest.TestCase):
     def test_find(self):
         ' Test searching for substrings '
         self.ae((1, 1), icu.find(b'a', b'1ab'))
-        self.ae((1, 2), icu.find('\U0001f431', 'x\U0001f431x'))
+        self.ae((1, 1 if sys.maxunicode >= 0x10ffff else 2), icu.find('\U0001f431', 'x\U0001f431x'))
+        self.ae((1 if sys.maxunicode >= 0x10ffff else 2, 1), icu.find('y', '\U0001f431y'))
         self.ae((0, 4), icu.primary_find('pena', 'peña'))
         for k, v in {u'pèché': u'peche', u'flüße':u'Flusse', u'Štepánek':u'ŠtepaneK'}.iteritems():
             self.ae((1, len(k)), icu.primary_find(v, ' ' + k), 'Failed to find %s in %s' % (v, k))