mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
When converting Python strings to ICU strings, replace invalid UTF-16 surrogate codepoints with the replacement character. Needed because various ICU functions fail if invalid surrogates are present. Fixes #1713892 [calibredb add cannot avoid duplicates](https://bugs.launchpad.net/calibre/+bug/1713892)
This commit is contained in:
parent
684bbe6dab
commit
2723d9f2cf
@ -27,6 +27,8 @@
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define MIN(x, y) ((x)<(y)) ? (x) : (y)
|
#define MIN(x, y) ((x)<(y)) ? (x) : (y)
|
||||||
|
// Non-zero when x lies in the UTF-16 high (lead) surrogate range 0xd800..0xdbff.
// The argument is parenthesized so that expression arguments (e.g. a | b) expand
// correctly; note that x is evaluated twice, so avoid arguments with side effects.
#define IS_HIGH_SURROGATE(x) (0xd800 <= (x) && (x) <= 0xdbff)
|
||||||
|
// Non-zero when x lies in the UTF-16 low (trail) surrogate range 0xdc00..0xdfff.
// The argument is parenthesized so that expression arguments (e.g. a | b) expand
// correctly; note that x is evaluated twice, so avoid arguments with side effects.
#define IS_LOW_SURROGATE(x) (0xdc00 <= (x) && (x) <= 0xdfff)
|
||||||
|
|
||||||
// Roundtripping will need to be implemented differently for python 3.3+ where strings are stored with variable widths
|
// Roundtripping will need to be implemented differently for python 3.3+ where strings are stored with variable widths
|
||||||
|
|
||||||
@ -42,21 +44,29 @@ static UChar* python_to_icu(PyObject *obj, int32_t *osz, uint8_t do_check) {
|
|||||||
PyErr_SetString(PyExc_TypeError, "Not a unicode string");
|
PyErr_SetString(PyExc_TypeError, "Not a unicode string");
|
||||||
goto end;
|
goto end;
|
||||||
}
|
}
|
||||||
|
sz = PyUnicode_GET_SIZE(obj);
|
||||||
|
|
||||||
#ifdef Py_UNICODE_WIDE
|
#ifdef Py_UNICODE_WIDE
|
||||||
// wide build (UCS 4)
|
// wide build (UCS 4)
|
||||||
sz = PyUnicode_GET_SIZE(obj);
|
|
||||||
ans = (UChar*) calloc(2*(sz+1), sizeof(UChar)); // There can be no more than 2 UChars per character + ensure null termination
|
ans = (UChar*) calloc(2*(sz+1), sizeof(UChar)); // There can be no more than 2 UChars per character + ensure null termination
|
||||||
if (ans == NULL) { PyErr_NoMemory(); goto end; }
|
if (ans == NULL) { PyErr_NoMemory(); goto end; }
|
||||||
u_strFromUTF32(ans, (int32_t)(2*(sz+1)), osz, (UChar32*)PyUnicode_AS_UNICODE(obj), (int32_t)sz, &status);
|
u_strFromUTF32WithSub(ans, (int32_t)(2*(sz+1)), osz, (UChar32*)PyUnicode_AS_UNICODE(obj), (int32_t)sz, 0xfffd, NULL, &status);
|
||||||
if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, u_errorName(status)); free(ans); ans = NULL; goto end; }
|
if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, u_errorName(status)); free(ans); ans = NULL; goto end; }
|
||||||
#else
|
#else
|
||||||
// narrow build (UTF-16)
|
// narrow build (UTF-16)
|
||||||
sz = PyUnicode_GET_DATA_SIZE(obj);
|
ans = (UChar*) malloc((sz + 1) * sizeof(UChar));
|
||||||
ans = (UChar*) calloc(sz+2, 1); // Ensure null termination
|
|
||||||
if (ans == NULL) { PyErr_NoMemory(); goto end; }
|
if (ans == NULL) { PyErr_NoMemory(); goto end; }
|
||||||
memcpy(ans, PyUnicode_AS_UNICODE(obj), sz);
|
for (Py_ssize_t i = 0; i < sz; i++) {
|
||||||
if (osz != NULL) *osz = (int32_t)PyUnicode_GET_SIZE(obj);
|
UChar ch = PyUnicode_AS_UNICODE(obj)[i];
|
||||||
|
if (IS_HIGH_SURROGATE(ch)) {
|
||||||
|
if (i >= sz - 1 || !IS_LOW_SURROGATE(PyUnicode_AS_UNICODE(obj)[i+1])) ans[i] = 0xfffd;
|
||||||
|
else { ans[i] = ch; ans[i+1] = PyUnicode_AS_UNICODE(obj)[i+1]; i++; }
|
||||||
|
} else if (IS_LOW_SURROGATE(ch)) {
|
||||||
|
ans[i] = 0xfffd;
|
||||||
|
} else ans[i] = ch;
|
||||||
|
}
|
||||||
|
ans[sz] = 0; // Ensure null termination
|
||||||
|
if (osz != NULL) *osz = (int32_t)sz;
|
||||||
#endif
|
#endif
|
||||||
end:
|
end:
|
||||||
return ans;
|
return ans;
|
||||||
@ -104,5 +114,3 @@ static PyObject* icu_to_python(UChar *src, int32_t sz) {
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
@ -133,6 +133,8 @@ class TestICU(unittest.TestCase):
|
|||||||
' Test roundtripping '
|
' Test roundtripping '
|
||||||
for r in (u'xxx\0\u2219\U0001f431xxx', u'\0', u'', u'simple'):
|
for r in (u'xxx\0\u2219\U0001f431xxx', u'\0', u'', u'simple'):
|
||||||
self.ae(r, icu._icu.roundtrip(r))
|
self.ae(r, icu._icu.roundtrip(r))
|
||||||
|
self.ae(icu._icu.roundtrip('\ud8e81'), '\ufffd1')
|
||||||
|
self.ae(icu._icu.roundtrip('\udc01\ud8e8'), '\ufffd\ufffd')
|
||||||
for x, l in [('', 0), ('a', 1), ('\U0001f431', 1)]:
|
for x, l in [('', 0), ('a', 1), ('\U0001f431', 1)]:
|
||||||
self.ae(icu._icu.string_length(x), l)
|
self.ae(icu._icu.string_length(x), l)
|
||||||
for x, l in [('', 0), ('a', 1), ('\U0001f431', 2)]:
|
for x, l in [('', 0), ('a', 1), ('\U0001f431', 2)]:
|
||||||
@ -218,6 +220,6 @@ def test_build():
|
|||||||
if not result.wasSuccessful():
|
if not result.wasSuccessful():
|
||||||
raise SystemExit(1)
|
raise SystemExit(1)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
run(verbosity=4)
|
run(verbosity=4)
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user