From a3d62043041f753f4b3a93fc5db359103eb7a7ce Mon Sep 17 00:00:00 2001
From: Flaviu Tamas
Date: Wed, 10 Oct 2018 22:19:36 -0400
Subject: [PATCH] Build matcher in py3

I'm not sure how this ever worked without PyTypeObject.tp_new, but
adding a generic constructor seems to work fine.
---
Review notes (this section is ignored by git am):
 * python_to_icu (py3): the 1-byte kind is Latin-1, not UTF-8, so it is
   now copied unit by unit instead of going through u_strFromUTF8; the
   malloc() result in the 2-byte branch is now checked.
 * python_to_icu32 (py3): "goto end" jumped to a label that does not
   exist in that function; both sites now "return NULL".
 * icu_to_python (py3): PyUnicode_DecodeUTF16 takes its size in bytes,
   so the UChar count is multiplied by sizeof(UChar).
 * convert_positions (py3): str indices are code points on 3.3+, so the
   UTF-16 offsets are converted with u_countChar32.
 * Indentation, blank context lines and the targets of the two #include
   context lines were lost when this patch was extracted and have been
   restored on a best-effort basis; verify against the tree before
   applying. The index hashes still describe the original post-image.

 src/calibre/utils/icu.c               |  38 +++----
 src/calibre/utils/icu_calibre_utils.h | 127 ++++++++++++++++++++--
 src/calibre/utils/matcher.c           | 149 ++++++++++++++------------
 3 files changed, 222 insertions(+), 92 deletions(-)

diff --git a/src/calibre/utils/icu.c b/src/calibre/utils/icu.c
index 5e7ae07ce2..81141a6e18 100644
--- a/src/calibre/utils/icu.c
+++ b/src/calibre/utils/icu.c
@@ -145,7 +145,7 @@ icu_Collator_sort_key(icu_Collator *self, PyObject *input) {
     uint8_t *buf2 = NULL;
     PyObject *ans = NULL;
 
-    buf = python_to_icu(input, &sz, 1);
+    buf = python_to_icu(input, &sz);
     if (buf == NULL) return NULL;
 
     bsz = 7 * sz + 1;
@@ -176,9 +176,9 @@ icu_Collator_strcmp(icu_Collator *self, PyObject *args) {
 
     if (!PyArg_ParseTuple(args, "OO", &a_, &b_)) return NULL;
 
-    a = python_to_icu(a_, &asz, 1);
+    a = python_to_icu(a_, &asz);
     if (a == NULL) goto end;
-    b = python_to_icu(b_, &bsz, 1);
+    b = python_to_icu(b_, &bsz);
     if (b == NULL) goto end;
     res = ucol_strcoll(self->collator, a, asz, b, bsz);
 end:
@@ -202,9 +202,9 @@ icu_Collator_find(icu_Collator *self, PyObject *args) {
 
     if (!PyArg_ParseTuple(args, "OO", &a_, &b_)) return NULL;
 
-    a = python_to_icu(a_, &asz, 1);
+    a = python_to_icu(a_, &asz);
     if (a == NULL) goto end;
-    b = python_to_icu(b_, &bsz, 1);
+    b = python_to_icu(b_, &bsz);
     if (b == NULL) goto end;
 
     search = usearch_openFromCollator(a, asz, b, bsz, self->collator, NULL, &status);
@@ -241,10 +241,10 @@ icu_Collator_contains(icu_Collator *self, PyObject *args) {
 
     if (!PyArg_ParseTuple(args, "OO", &a_, &b_)) return NULL;
 
-    a = python_to_icu(a_, &asz, 1);
+    a = python_to_icu(a_, &asz);
     if (a == NULL) goto end;
     if (asz == 0) { found = TRUE; goto end; }
-    b = python_to_icu(b_, &bsz, 1);
+    b = python_to_icu(b_, &bsz);
     if (b == NULL) goto end;
 
     search = usearch_openFromCollator(a, asz, b, bsz, self->collator, NULL, &status);
@@ -313,9 +313,9 @@ icu_Collator_startswith(icu_Collator *self, PyObject *args) {
 
     if (!PyArg_ParseTuple(args, "OO", &a_, &b_)) return NULL;
 
-    a = python_to_icu(a_, &asz, 1);
+    a = python_to_icu(a_, &asz);
     if (a == NULL) goto end;
-    b = python_to_icu(b_, &bsz, 1);
+    b = python_to_icu(b_, &bsz);
     if (b == NULL) goto end;
 
     if (asz < bsz) goto end;
@@ -341,7 +341,7 @@ icu_Collator_collation_order(icu_Collator *self, PyObject *a_) {
     UCollationElements *iter = NULL;
     int order = 0, len = -1;
 
-    a = python_to_icu(a_, &asz, 1);
+    a = python_to_icu(a_, &asz);
     if (a == NULL) goto end;
 
     iter = ucol_openElements(self->collator, a, asz, &status);
@@ -578,7 +578,7 @@ icu_BreakIterator_set_text(icu_BreakIterator *self, PyObject *input) {
     UChar *buf = NULL;
     UErrorCode status = U_ZERO_ERROR;
 
-    buf = python_to_icu(input, &sz, 1);
+    buf = python_to_icu(input, &sz);
     if (buf == NULL) return NULL;
     ubrk_setText(self->break_iterator, buf, sz, &status);
     if (U_FAILURE(status)) {
@@ -602,7 +602,7 @@ icu_BreakIterator_index(icu_BreakIterator *self, PyObject *token) {
     UChar *buf = NULL, *needle = NULL;
     int32_t word_start = 0, p = 0, sz = 0, ans = -1, leading_hyphen = 0, trailing_hyphen = 0;
 
-    buf = python_to_icu(token, &sz, 1);
+    buf = python_to_icu(token, &sz);
     if (buf == NULL) return NULL;
     if (sz < 1) goto end;
     needle = buf;
@@ -796,7 +796,7 @@ static PyObject* icu_change_case(PyObject *self, PyObject *args) {
         return NULL;
     }
 
-    input_buf = python_to_icu(input, &sz, 1);
+    input_buf = python_to_icu(input, &sz);
     if (input_buf == NULL) goto end;
     output_buf = (UChar*) calloc(3 * sz, sizeof(UChar));
     if (output_buf == NULL) { PyErr_NoMemory(); goto end; }
@@ -830,7 +830,7 @@ static PyObject* icu_swap_case(PyObject *self, PyObject *input) {
     UChar32 *buf = NULL;
     int32_t sz = 0, sz32 = 0, i = 0;
 
-    input_buf = python_to_icu(input, &sz, 1);
+    input_buf = python_to_icu(input, &sz);
     if (input_buf == NULL) goto end;
     output_buf = (UChar*) calloc(3 * sz, sizeof(UChar));
     buf = (UChar32*) calloc(2 * sz, sizeof(UChar32));
@@ -922,7 +922,7 @@ icu_character_name(PyObject *self, PyObject *args) {
     if (!PyArg_ParseTuple(args, "O|O", &input, &palias)) return NULL;
 
     if (palias != NULL && PyObject_IsTrue(palias)) alias = 1;
-    buf = python_to_icu(input, &sz, 1);
+    buf = python_to_icu(input, &sz);
     if (buf == NULL) goto end;
     U16_GET(buf, 0, 0, sz, code);
     if (alias) {
@@ -984,7 +984,7 @@ icu_ord_string(PyObject *self, PyObject *input) {
     int32_t sz = 0, i = 0;
     PyObject *ans = NULL, *temp = NULL;
 
-    input_buf = python_to_icu32(input, &sz, 1);
+    input_buf = python_to_icu32(input, &sz);
     if (input_buf == NULL) goto end;
     ans = PyTuple_New(sz);
     if (ans == NULL) goto end;
@@ -1031,7 +1031,7 @@ icu_normalize(PyObject *self, PyObject *args) {
         goto end;
     }
 
-    source = python_to_icu(src, &sz, 1);
+    source = python_to_icu(src, &sz);
     if (source == NULL) goto end;
     cap = 2 * sz;
     dest = (UChar*) calloc(cap, sizeof(UChar));
@@ -1069,7 +1069,7 @@ icu_roundtrip(PyObject *self, PyObject *src) {
     UChar *icu = NULL;
     PyObject *ret = NULL;
 
-    icu = python_to_icu(src, &sz, 1);
+    icu = python_to_icu(src, &sz);
     if (icu != NULL) {
         ret = icu_to_python(icu, sz);
         free(icu);
@@ -1105,7 +1105,7 @@ icu_string_length(PyObject *self, PyObject *src) {
     int32_t sz = 0;
     UChar *icu = NULL;
 
-    icu = python_to_icu(src, &sz, 1);
+    icu = python_to_icu(src, &sz);
     if (icu == NULL) return NULL;
     sz = u_countChar32(icu, sz);
     free(icu);
diff --git a/src/calibre/utils/icu_calibre_utils.h b/src/calibre/utils/icu_calibre_utils.h
index 25164283fc..01fe999073 100644
--- a/src/calibre/utils/icu_calibre_utils.h
+++ b/src/calibre/utils/icu_calibre_utils.h
@@ -22,10 +22,11 @@
 #include <unicode/unorm.h>
 #include <unicode/ubrk.h>
 
-#if PY_VERSION_HEX >= 0x03030000
-#error Not implemented for python >= 3.3
+#if PY_VERSION_HEX < 0x03030000 && PY_VERSION_HEX > 0x03000000
+#error Not implemented for python 3.0 to 3.2
 #endif
 
+#if PY_VERSION_HEX < 0x03000000
 #define MIN(x, y) ((x)<(y)) ? (x) : (y)
 #define IS_HIGH_SURROGATE(x) (0xd800 <= x && x <= 0xdbff)
 #define IS_LOW_SURROGATE(x) (0xdc00 <= x && x <= 0xdfff)
@@ -33,14 +34,14 @@
 // Roundtripping will need to be implemented differently for python 3.3+ where strings are stored with variable widths
 
 #ifndef NO_PYTHON_TO_ICU
-static UChar* python_to_icu(PyObject *obj, int32_t *osz, uint8_t do_check) {
+static UChar* python_to_icu(PyObject *obj, int32_t *osz) {
     UChar *ans = NULL;
     Py_ssize_t sz = 0;
 #ifdef Py_UNICODE_WIDE
     UErrorCode status = U_ZERO_ERROR;
 #endif
 
-    if (do_check && !PyUnicode_CheckExact(obj)) {
+    if (!PyUnicode_CheckExact(obj)) {
         PyErr_SetString(PyExc_TypeError, "Not a unicode string");
         goto end;
     }
@@ -73,14 +74,14 @@ end:
 }
 
 #ifndef NO_PYTHON_TO_ICU32
-static UChar32* python_to_icu32(PyObject *obj, int32_t *osz, uint8_t do_check) {
+static UChar32* python_to_icu32(PyObject *obj, int32_t *osz) {
     UChar32 *ans = NULL;
     Py_ssize_t sz = 0;
 #ifndef Py_UNICODE_WIDE
     UErrorCode status = U_ZERO_ERROR;
 #endif
 
-    if (do_check && !PyUnicode_CheckExact(obj)) {
+    if (!PyUnicode_CheckExact(obj)) {
         PyErr_SetString(PyExc_TypeError, "Not a unicode string");
         goto end;
     }
@@ -114,3 +115,117 @@ static PyObject* icu_to_python(UChar *src, int32_t sz) {
 #endif
 }
 #endif
+
+#else // end PY2; start PY3.3+
+
+static UChar* python_to_icu(PyObject *obj, int32_t *osz) {
+    UChar *ans = NULL;
+    Py_ssize_t sz = 0;
+    UErrorCode status = U_ZERO_ERROR;
+    int i;
+
+    if (!PyUnicode_CheckExact(obj)) {
+        PyErr_SetString(PyExc_TypeError, "Not a unicode string");
+        return NULL;
+    }
+    if(PyUnicode_READY(obj) == -1) {
+        return NULL;
+    }
+    sz = PyUnicode_GET_LENGTH(obj);
+
+
+    switch(PyUnicode_KIND(obj)) {
+    case PyUnicode_1BYTE_KIND:
+        ans = (UChar*) malloc((sz+1) * sizeof(UChar));
+        if (ans == NULL) {
+            PyErr_NoMemory();
+            return NULL;
+        }
+        // The 1-byte kind holds Latin-1 code points (U+0000-U+00FF), not
+        // UTF-8, so each byte maps directly onto one UTF-16 code unit.
+        for(i = 0; i < sz; i++) {
+            ans[i] = PyUnicode_1BYTE_DATA(obj)[i];
+        }
+        // add null terminator
+        ans[sz] = 0;
+        if (osz != NULL) *osz = sz;
+        break;
+    case PyUnicode_2BYTE_KIND:
+        ans = (UChar*) malloc((sz+1) * sizeof(UChar));
+        if (ans == NULL) {
+            PyErr_NoMemory();
+            return NULL;
+        }
+        // UChar might be more than 2 bytes, so we need to copy manually.
+        // Hopefully this will be optimized as memcpy where possible.
+        for(i = 0; i < sz; i++) {
+            ans[i] = PyUnicode_2BYTE_DATA(obj)[i];
+        }
+        // add null terminator
+        ans[sz] = 0;
+        if (osz != NULL) *osz = sz;
+        break;
+    case PyUnicode_4BYTE_KIND:
+        // +1 for null terminator
+        ans = (UChar*) malloc(2 * (sz+1) * sizeof(UChar));
+        if (ans == NULL) {
+            PyErr_NoMemory();
+            return NULL;
+        }
+        u_strFromUTF32(
+                ans, 2 * (sz+1),
+                (int32_t*) osz,
+                (UChar32*) PyUnicode_4BYTE_DATA(obj),
+                (int32_t) sz,
+                &status);
+        break;
+    }
+
+    if (U_FAILURE(status)) {
+        PyErr_SetString(PyExc_ValueError, u_errorName(status));
+        free(ans);
+        ans = NULL;
+        return NULL;
+    }
+
+    return ans;
+}
+
+#ifndef NO_PYTHON_TO_ICU32
+static UChar32* python_to_icu32(PyObject *obj, int32_t *osz) {
+    UChar32 *ans = NULL;
+    Py_ssize_t sz = 0;
+    int i;
+
+    if (!PyUnicode_CheckExact(obj)) {
+        PyErr_SetString(PyExc_TypeError, "Not a unicode string");
+        return NULL;
+    }
+    if(PyUnicode_READY(obj) == -1) {
+        return NULL;
+    }
+    sz = PyUnicode_GET_LENGTH(obj);
+    ans = (UChar32*) malloc((sz+1) * sizeof(UChar32));
+    if (ans == NULL) { PyErr_NoMemory(); return NULL; }
+
+    for(i = 0; i < sz; i++) {
+        // PyUnicode_READ_CHAR yields each code point as a Py_UCS4 whatever
+        // the string's internal kind is, so copy element by element.
+        ans[i] = PyUnicode_READ_CHAR(obj, i);
+    }
+    ans[sz] = 0;
+
+    if (osz != NULL) *osz = sz;
+
+    return ans;
+}
+#endif
+
+#ifndef NO_ICU_TO_PYTHON
+static PyObject* icu_to_python(UChar *src, int32_t sz) {
+    // PyUnicode_DecodeUTF16 takes the buffer size in bytes, not in UChars.
+    return PyUnicode_DecodeUTF16((char*) src, sz * sizeof(UChar), NULL, NULL);
+}
+#endif
+
+#endif // end PY3.3+
\ No newline at end of file
diff --git a/src/calibre/utils/matcher.c b/src/calibre/utils/matcher.c
index 94ddd9ac35..a42cebaf5c 100644
--- a/src/calibre/utils/matcher.c
+++ b/src/calibre/utils/matcher.c
@@ -155,10 +155,6 @@ static double calc_score_for_char(MatchInfo *m, UChar32 last, UChar32 current, i
 }
 
 static void convert_positions(int32_t *positions, int32_t *final_positions, UChar *string, int32_t char_len, int32_t byte_len, double score) {
-#if PY_VERSION_HEX >= 0x03030000
-#error Not implemented for python >= 3.3
-#endif
-
     // The positions array stores character positions as byte offsets in string, convert them into character offsets
     int32_t i, *end;
 
@@ -167,10 +163,14 @@ static void convert_positions(int32_t *positions, int32_t *final_positions, UCha
     end = final_positions + char_len;
     for (i = 0; i < byte_len && final_positions < end; i++) {
         if (positions[i] == -1) continue;
+#if PY_VERSION_HEX >= 0x03030000
+        *final_positions = u_countChar32(string, positions[i]);
+#else
 #ifdef Py_UNICODE_WIDE
         *final_positions = u_countChar32(string, positions[i]);
 #else
         *final_positions = positions[i];
+#endif
 #endif
         final_positions += 1;
     }
@@ -331,7 +331,6 @@ typedef struct {
     UChar *level2;
     UChar *level3;
     UCollator *collator;
-
 } Matcher;
 
 // Matcher.__init__() {{{
@@ -349,7 +348,7 @@ static void
 Matcher_dealloc(Matcher* self)
 {
     free_matcher(self);
-    self->ob_type->tp_free((PyObject*)self);
+    Py_TYPE(self)->tp_free((PyObject*)self);
 }
 
 #define alloc_uchar(x) (x * 3 + 1)
@@ -377,16 +376,16 @@ Matcher_init(Matcher *self, PyObject *args, PyObject *kwds)
 
     self->items = (UChar**)calloc(self->item_count, sizeof(UChar*));
     self->item_lengths = (int32_t*)calloc(self->item_count, sizeof(uint32_t));
-    self->level1 = python_to_icu(level1, NULL, 1);
-    self->level2 = python_to_icu(level2, NULL, 1);
-    self->level3 = python_to_icu(level3, NULL, 1);
+    self->level1 = python_to_icu(level1, NULL);
+    self->level2 = python_to_icu(level2, NULL);
+    self->level3 = python_to_icu(level3, NULL);
 
     if (self->items == NULL || self->item_lengths == NULL ) { PyErr_NoMemory(); goto end; }
     if (self->level1 == NULL || self->level2 == NULL || self->level3 == NULL) goto end;
 
     for (i = 0; i < (int32_t)self->item_count; i++) {
         p = PySequence_Fast_GET_ITEM(py_items, i);
-        self->items[i] = python_to_icu(p, self->item_lengths + i, 1);
+        self->items[i] = python_to_icu(p, self->item_lengths + i);
         if (self->items[i] == NULL) { PyErr_NoMemory(); goto end; }
     }
 
@@ -409,7 +408,7 @@ Matcher_calculate_scores(Matcher *self, PyObject *args) {
 
     if (!PyArg_ParseTuple(args, "O", &pneedle)) return NULL;
 
-    needle = python_to_icu(pneedle, NULL, 1);
+    needle = python_to_icu(pneedle, NULL);
     if (needle == NULL) return NULL;
     needle_char_len = u_countChar32(needle, -1);
     items = PyTuple_New(self->item_count);
@@ -435,7 +434,7 @@ Matcher_calculate_scores(Matcher *self, PyObject *args) {
         PyTuple_SET_ITEM(items, (Py_ssize_t)i, score);
         p = final_positions + (i * needle_char_len);
         for (j = 0; j < needle_char_len; j++) {
-            score = PyInt_FromLong((long)p[j]);
+            score = PyLong_FromLong((long)p[j]);
             if (score == NULL) { PyErr_NoMemory(); goto end; }
             PyTuple_SET_ITEM(PyTuple_GET_ITEM(positions, (Py_ssize_t)i), (Py_ssize_t)j, score);
         }
@@ -455,72 +454,88 @@ static PyMethodDef Matcher_methods[] = {
      "calculate_scores(query) -> Return the scores for all items given query as a tuple."
     },
 
-    {NULL}  /* Sentinel */
+    {NULL, NULL}  /* Sentinel */
 };
 
 
 // }}}
 
 
 static PyTypeObject MatcherType = { // {{{
-    PyObject_HEAD_INIT(NULL)
-    0,                                      /*ob_size*/
-    "matcher.Matcher",                      /*tp_name*/
-    sizeof(Matcher),                        /*tp_basicsize*/
-    0,                                      /*tp_itemsize*/
-    (destructor)Matcher_dealloc,            /*tp_dealloc*/
-    0,                                      /*tp_print*/
-    0,                                      /*tp_getattr*/
-    0,                                      /*tp_setattr*/
-    0,                                      /*tp_compare*/
-    0,                                      /*tp_repr*/
-    0,                                      /*tp_as_number*/
-    0,                                      /*tp_as_sequence*/
-    0,                                      /*tp_as_mapping*/
-    0,                                      /*tp_hash */
-    0,                                      /*tp_call*/
-    0,                                      /*tp_str*/
-    0,                                      /*tp_getattro*/
-    0,                                      /*tp_setattro*/
-    0,                                      /*tp_as_buffer*/
-    Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE, /*tp_flags*/
-    "Matcher",                              /* tp_doc */
-    0,                                      /* tp_traverse */
-    0,                                      /* tp_clear */
-    0,                                      /* tp_richcompare */
-    0,                                      /* tp_weaklistoffset */
-    0,                                      /* tp_iter */
-    0,                                      /* tp_iternext */
-    Matcher_methods,                        /* tp_methods */
-    0,                                      /* tp_members */
-    0,                                      /* tp_getset */
-    0,                                      /* tp_base */
-    0,                                      /* tp_dict */
-    0,                                      /* tp_descr_get */
-    0,                                      /* tp_descr_set */
-    0,                                      /* tp_dictoffset */
-    (initproc)Matcher_init,                 /* tp_init */
-    0,                                      /* tp_alloc */
-    0,                                      /* tp_new */
+    PyVarObject_HEAD_INIT(NULL, 0)
+    /* tp_name           */ "matcher.Matcher",
+    /* tp_basicsize      */ sizeof(Matcher),
+    /* tp_itemsize       */ 0,
+    /* tp_dealloc        */ (destructor)Matcher_dealloc,
+    /* tp_print          */ 0,
+    /* tp_getattr        */ 0,
+    /* tp_setattr        */ 0,
+    /* tp_as_async       */ 0,
+    /* tp_repr           */ 0,
+    /* tp_as_number      */ 0,
+    /* tp_as_sequence    */ 0,
+    /* tp_as_mapping     */ 0,
+    /* tp_hash           */ 0,
+    /* tp_call           */ 0,
+    /* tp_str            */ 0,
+    /* tp_getattro       */ 0,
+    /* tp_setattro       */ 0,
+    /* tp_as_buffer      */ 0,
+    /* tp_flags          */ Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE,
+    /* tp_doc            */ "Matcher",
+    /* tp_traverse       */ 0,
+    /* tp_clear          */ 0,
+    /* tp_richcompare    */ 0,
+    /* tp_weaklistoffset */ 0,
+    /* tp_iter           */ 0,
+    /* tp_iternext       */ 0,
+    /* tp_methods        */ Matcher_methods,
+    /* tp_members        */ 0,
+    /* tp_getset         */ 0,
+    /* tp_base           */ 0,
+    /* tp_dict           */ 0,
+    /* tp_descr_get      */ 0,
+    /* tp_descr_set      */ 0,
+    /* tp_dictoffset     */ 0,
+    /* tp_init           */ (initproc)Matcher_init,
+    /* tp_alloc          */ 0,
+    /* tp_new            */ PyType_GenericNew,
 }; // }}}
 
 
-static PyMethodDef matcher_methods[] = {
-    {NULL, NULL, 0, NULL}
+#if PY_MAJOR_VERSION >= 3
+#define INITERROR return NULL
+static struct PyModuleDef matcher_module = {
+    /* m_base     */ PyModuleDef_HEAD_INIT,
+    /* m_name     */ "matcher",
+    /* m_doc      */ "Find subsequence matches.",
+    /* m_size     */ -1,
+    /* m_methods  */ 0,
+    /* m_slots    */ 0,
+    /* m_traverse */ 0,
+    /* m_clear    */ 0,
+    /* m_free     */ 0,
 };
 
+CALIBRE_MODINIT_FUNC PyInit_matcher(void) {
+    PyObject *mod = PyModule_Create(&matcher_module);
+#else
+#define INITERROR return
+CALIBRE_MODINIT_FUNC initmatcher(void) {
+    PyObject *mod = Py_InitModule3("matcher", NULL, "Find subsequence matches");
+#endif
 
-CALIBRE_MODINIT_FUNC
-initmatcher(void) {
-    PyObject *m;
-    MatcherType.tp_new = PyType_GenericNew;
-    if (PyType_Ready(&MatcherType) < 0)
-        return;
-    m = Py_InitModule3("matcher", matcher_methods, "Find subsequence matches");
-    if (m == NULL) return;
+    if (mod == NULL) INITERROR;
+
+    if (PyType_Ready(&MatcherType) < 0) {
+        INITERROR;
+    }
 
     Py_INCREF(&MatcherType);
-    PyModule_AddObject(m, "Matcher", (PyObject *)&MatcherType);
-
-}
-
-
+    if(PyModule_AddObject(mod, "Matcher", (PyObject *)&MatcherType) < 0) {
+        Py_DECREF(&MatcherType);
+        INITERROR;
+    }
+#if PY_MAJOR_VERSION >= 3
+    return mod;
+#endif
+}
\ No newline at end of file