Fix ICU find returning incorrect position and length values when non-BMP characters are present on wide Python builds

This commit is contained in:
Kovid Goyal 2014-03-08 21:41:05 +05:30
parent 27327e811b
commit 4eaee89487
2 changed files with 14 additions and 4 deletions

View File

@ -191,6 +191,9 @@ end:
// Collator.find {{{
static PyObject *
icu_Collator_find(icu_Collator *self, PyObject *args, PyObject *kwargs) {
#if PY_VERSION_HEX >= 0x03030000
#error Not implemented for python >= 3.3
#endif
PyObject *a_ = NULL, *b_ = NULL;
UChar *a = NULL, *b = NULL;
int32_t asz = 0, bsz = 0, pos = -1, length = -1;
@ -207,10 +210,16 @@ icu_Collator_find(icu_Collator *self, PyObject *args, PyObject *kwargs) {
search = usearch_openFromCollator(a, asz, b, bsz, self->collator, NULL, &status);
if (U_SUCCESS(status)) {
pos = usearch_first(search, &status);
if (pos != USEARCH_DONE)
if (pos != USEARCH_DONE) {
length = usearch_getMatchedLength(search);
else
pos = -1;
#ifdef Py_UNICODE_WIDE
// We have to return number of unicode characters since the string
// could contain surrogate pairs which are represented as a single
// character in python wide builds
length = u_countChar32(b + pos, length);
pos = u_countChar32(b, pos);
#endif
} else pos = -1;
}
end:
if (search != NULL) usearch_close(search);

View File

@ -92,7 +92,8 @@ class TestICU(unittest.TestCase):
def test_find(self):
' Test searching for substrings '
self.ae((1, 1), icu.find(b'a', b'1ab'))
self.ae((1, 2), icu.find('\U0001f431', 'x\U0001f431x'))
self.ae((1, 1 if sys.maxunicode >= 0x10ffff else 2), icu.find('\U0001f431', 'x\U0001f431x'))
self.ae((1 if sys.maxunicode >= 0x10ffff else 2, 1), icu.find('y', '\U0001f431y'))
self.ae((0, 4), icu.primary_find('pena', 'peña'))
for k, v in {u'pèché': u'peche', u'flüße':u'Flusse', u'Štepánek':u'ŠtepaneK'}.iteritems():
self.ae((1, len(k)), icu.primary_find(v, ' ' + k), 'Failed to find %s in %s' % (v, k))