Fix ICU find returning an incorrect position and length when non-BMP characters are present on wide Python builds

2025-07-07 10:14:46 -04:00 · 2014-03-08 21:41:05 +05:30 · 2014-03-08 21:41:05 +05:30 · 4eaee89487
commit 4eaee89487
parent 27327e811b
2 changed files with 14 additions and 4 deletions
--- a/src/calibre/utils/icu.c
+++ b/src/calibre/utils/icu.c
@ -191,6 +191,9 @@ end:
 // Collator.find {{{
 static PyObject *
 icu_Collator_find(icu_Collator *self, PyObject *args, PyObject *kwargs) {
+#if PY_VERSION_HEX >= 0x03030000 
+#error Not implemented for python >= 3.3
+#endif
    PyObject *a_ = NULL, *b_ = NULL;
    UChar *a = NULL, *b = NULL;
    int32_t asz = 0, bsz = 0, pos = -1, length = -1;
@ -207,10 +210,16 @@ icu_Collator_find(icu_Collator *self, PyObject *args, PyObject *kwargs) {
    search = usearch_openFromCollator(a, asz, b, bsz, self->collator, NULL, &status);
    if (U_SUCCESS(status)) {
        pos = usearch_first(search, &status);
-        if (pos != USEARCH_DONE) 
+        if (pos != USEARCH_DONE) {
            length = usearch_getMatchedLength(search);
-        else
-            pos = -1;
+#ifdef Py_UNICODE_WIDE
+            // ICU reports pos and length in UTF-16 code units. Convert both to
+            // counts of Unicode code points, since a surrogate pair is a single
+            // character in Python wide builds.
+            length = u_countChar32(b + pos, length);
+            pos = u_countChar32(b, pos);
+#endif
+        } else pos = -1;
    }
 end:
    if (search != NULL) usearch_close(search);
--- a/src/calibre/utils/icu_test.py
+++ b/src/calibre/utils/icu_test.py
@ -92,7 +92,8 @@ class TestICU(unittest.TestCase):
    def test_find(self):
        ' Test searching for substrings '
        self.ae((1, 1), icu.find(b'a', b'1ab'))
-        self.ae((1, 2), icu.find('\U0001f431', 'x\U0001f431x'))
+        self.ae((1, 1 if sys.maxunicode >= 0x10ffff else 2), icu.find('\U0001f431', 'x\U0001f431x'))
+        self.ae((1 if sys.maxunicode >= 0x10ffff else 2, 1), icu.find('y', '\U0001f431y'))
        self.ae((0, 4), icu.primary_find('pena', 'peña'))
        for k, v in {u'pèché': u'peche', u'flüße':u'Flusse', u'Štepánek':u'ŠtepaneK'}.iteritems():
            self.ae((1, len(k)), icu.primary_find(v, ' ' + k), 'Failed to find %s in %s' % (v, k))