Build matcher in py3

I'm not sure how this ever worked without PyTypeObject.tp_new, but
adding a generic constructor seems to work fine.
This commit is contained in:
Flaviu Tamas 2018-10-10 22:19:36 -04:00
parent a78682093a
commit a3d6204304
3 changed files with 215 additions and 92 deletions

View File

@ -145,7 +145,7 @@ icu_Collator_sort_key(icu_Collator *self, PyObject *input) {
uint8_t *buf2 = NULL;
PyObject *ans = NULL;
buf = python_to_icu(input, &sz, 1);
buf = python_to_icu(input, &sz);
if (buf == NULL) return NULL;
bsz = 7 * sz + 1;
@ -176,9 +176,9 @@ icu_Collator_strcmp(icu_Collator *self, PyObject *args) {
if (!PyArg_ParseTuple(args, "OO", &a_, &b_)) return NULL;
a = python_to_icu(a_, &asz, 1);
a = python_to_icu(a_, &asz);
if (a == NULL) goto end;
b = python_to_icu(b_, &bsz, 1);
b = python_to_icu(b_, &bsz);
if (b == NULL) goto end;
res = ucol_strcoll(self->collator, a, asz, b, bsz);
end:
@ -202,9 +202,9 @@ icu_Collator_find(icu_Collator *self, PyObject *args) {
if (!PyArg_ParseTuple(args, "OO", &a_, &b_)) return NULL;
a = python_to_icu(a_, &asz, 1);
a = python_to_icu(a_, &asz);
if (a == NULL) goto end;
b = python_to_icu(b_, &bsz, 1);
b = python_to_icu(b_, &bsz);
if (b == NULL) goto end;
search = usearch_openFromCollator(a, asz, b, bsz, self->collator, NULL, &status);
@ -241,10 +241,10 @@ icu_Collator_contains(icu_Collator *self, PyObject *args) {
if (!PyArg_ParseTuple(args, "OO", &a_, &b_)) return NULL;
a = python_to_icu(a_, &asz, 1);
a = python_to_icu(a_, &asz);
if (a == NULL) goto end;
if (asz == 0) { found = TRUE; goto end; }
b = python_to_icu(b_, &bsz, 1);
b = python_to_icu(b_, &bsz);
if (b == NULL) goto end;
search = usearch_openFromCollator(a, asz, b, bsz, self->collator, NULL, &status);
@ -313,9 +313,9 @@ icu_Collator_startswith(icu_Collator *self, PyObject *args) {
if (!PyArg_ParseTuple(args, "OO", &a_, &b_)) return NULL;
a = python_to_icu(a_, &asz, 1);
a = python_to_icu(a_, &asz);
if (a == NULL) goto end;
b = python_to_icu(b_, &bsz, 1);
b = python_to_icu(b_, &bsz);
if (b == NULL) goto end;
if (asz < bsz) goto end;
@ -341,7 +341,7 @@ icu_Collator_collation_order(icu_Collator *self, PyObject *a_) {
UCollationElements *iter = NULL;
int order = 0, len = -1;
a = python_to_icu(a_, &asz, 1);
a = python_to_icu(a_, &asz);
if (a == NULL) goto end;
iter = ucol_openElements(self->collator, a, asz, &status);
@ -578,7 +578,7 @@ icu_BreakIterator_set_text(icu_BreakIterator *self, PyObject *input) {
UChar *buf = NULL;
UErrorCode status = U_ZERO_ERROR;
buf = python_to_icu(input, &sz, 1);
buf = python_to_icu(input, &sz);
if (buf == NULL) return NULL;
ubrk_setText(self->break_iterator, buf, sz, &status);
if (U_FAILURE(status)) {
@ -602,7 +602,7 @@ icu_BreakIterator_index(icu_BreakIterator *self, PyObject *token) {
UChar *buf = NULL, *needle = NULL;
int32_t word_start = 0, p = 0, sz = 0, ans = -1, leading_hyphen = 0, trailing_hyphen = 0;
buf = python_to_icu(token, &sz, 1);
buf = python_to_icu(token, &sz);
if (buf == NULL) return NULL;
if (sz < 1) goto end;
needle = buf;
@ -796,7 +796,7 @@ static PyObject* icu_change_case(PyObject *self, PyObject *args) {
return NULL;
}
input_buf = python_to_icu(input, &sz, 1);
input_buf = python_to_icu(input, &sz);
if (input_buf == NULL) goto end;
output_buf = (UChar*) calloc(3 * sz, sizeof(UChar));
if (output_buf == NULL) { PyErr_NoMemory(); goto end; }
@ -830,7 +830,7 @@ static PyObject* icu_swap_case(PyObject *self, PyObject *input) {
UChar32 *buf = NULL;
int32_t sz = 0, sz32 = 0, i = 0;
input_buf = python_to_icu(input, &sz, 1);
input_buf = python_to_icu(input, &sz);
if (input_buf == NULL) goto end;
output_buf = (UChar*) calloc(3 * sz, sizeof(UChar));
buf = (UChar32*) calloc(2 * sz, sizeof(UChar32));
@ -922,7 +922,7 @@ icu_character_name(PyObject *self, PyObject *args) {
if (!PyArg_ParseTuple(args, "O|O", &input, &palias)) return NULL;
if (palias != NULL && PyObject_IsTrue(palias)) alias = 1;
buf = python_to_icu(input, &sz, 1);
buf = python_to_icu(input, &sz);
if (buf == NULL) goto end;
U16_GET(buf, 0, 0, sz, code);
if (alias) {
@ -984,7 +984,7 @@ icu_ord_string(PyObject *self, PyObject *input) {
int32_t sz = 0, i = 0;
PyObject *ans = NULL, *temp = NULL;
input_buf = python_to_icu32(input, &sz, 1);
input_buf = python_to_icu32(input, &sz);
if (input_buf == NULL) goto end;
ans = PyTuple_New(sz);
if (ans == NULL) goto end;
@ -1031,7 +1031,7 @@ icu_normalize(PyObject *self, PyObject *args) {
goto end;
}
source = python_to_icu(src, &sz, 1);
source = python_to_icu(src, &sz);
if (source == NULL) goto end;
cap = 2 * sz;
dest = (UChar*) calloc(cap, sizeof(UChar));
@ -1069,7 +1069,7 @@ icu_roundtrip(PyObject *self, PyObject *src) {
UChar *icu = NULL;
PyObject *ret = NULL;
icu = python_to_icu(src, &sz, 1);
icu = python_to_icu(src, &sz);
if (icu != NULL) {
ret = icu_to_python(icu, sz);
free(icu);
@ -1105,7 +1105,7 @@ icu_string_length(PyObject *self, PyObject *src) {
int32_t sz = 0;
UChar *icu = NULL;
icu = python_to_icu(src, &sz, 1);
icu = python_to_icu(src, &sz);
if (icu == NULL) return NULL;
sz = u_countChar32(icu, sz);
free(icu);

View File

@ -22,10 +22,11 @@
#include <unicode/unorm2.h>
#include <unicode/ubrk.h>
#if PY_VERSION_HEX >= 0x03030000
#error Not implemented for python >= 3.3
#if PY_VERSION_HEX < 0x03030000 && PY_VERSION_HEX > 0x03000000
#error Not implemented for python 3.0 to 3.2
#endif
#if PY_VERSION_HEX < 0x03000000
// Smaller of x and y. Function-like macro: each argument may be evaluated
// twice, so do not pass expressions with side effects. The whole expansion is
// parenthesized so it can be embedded in a larger expression safely.
#define MIN(x, y) (((x) < (y)) ? (x) : (y))
// True when x lies in the UTF-16 high (lead) surrogate range 0xD800..0xDBFF.
#define IS_HIGH_SURROGATE(x) (0xd800 <= (x) && (x) <= 0xdbff)
// True when x lies in the UTF-16 low (trail) surrogate range 0xDC00..0xDFFF.
#define IS_LOW_SURROGATE(x) (0xdc00 <= (x) && (x) <= 0xdfff)
@ -33,14 +34,14 @@
// Roundtripping will need to be implemented differently for python 3.3+ where strings are stored with variable widths
#ifndef NO_PYTHON_TO_ICU
static UChar* python_to_icu(PyObject *obj, int32_t *osz, uint8_t do_check) {
static UChar* python_to_icu(PyObject *obj, int32_t *osz) {
UChar *ans = NULL;
Py_ssize_t sz = 0;
#ifdef Py_UNICODE_WIDE
UErrorCode status = U_ZERO_ERROR;
#endif
if (do_check && !PyUnicode_CheckExact(obj)) {
if (!PyUnicode_CheckExact(obj)) {
PyErr_SetString(PyExc_TypeError, "Not a unicode string");
goto end;
}
@ -73,14 +74,14 @@ end:
}
#ifndef NO_PYTHON_TO_ICU32
static UChar32* python_to_icu32(PyObject *obj, int32_t *osz, uint8_t do_check) {
static UChar32* python_to_icu32(PyObject *obj, int32_t *osz) {
UChar32 *ans = NULL;
Py_ssize_t sz = 0;
#ifndef Py_UNICODE_WIDE
UErrorCode status = U_ZERO_ERROR;
#endif
if (do_check && !PyUnicode_CheckExact(obj)) {
if (!PyUnicode_CheckExact(obj)) {
PyErr_SetString(PyExc_TypeError, "Not a unicode string");
goto end;
}
@ -114,3 +115,110 @@ static PyObject* icu_to_python(UChar *src, int32_t sz) {
#endif
}
#endif
#else // end PY2; start PY3.3+
/*
 * Convert an exact Python unicode object into a newly malloc()ed,
 * NUL-terminated UTF-16 buffer suitable for ICU (Python 3.3+ variable-width
 * string storage).
 *
 * obj: the Python object to convert; anything that is not an exact unicode
 *      object sets TypeError.
 * osz: if not NULL, receives the number of UChar units written, not counting
 *      the terminating NUL.
 *
 * Returns the buffer (the caller releases it with free()), or NULL with a
 * Python exception set.
 */
static UChar* python_to_icu(PyObject *obj, int32_t *osz) {
    UChar *ans = NULL;
    Py_ssize_t sz = 0, i = 0;
    UErrorCode status = U_ZERO_ERROR;

    if (!PyUnicode_CheckExact(obj)) {
        PyErr_SetString(PyExc_TypeError, "Not a unicode string");
        return NULL;
    }
    if (PyUnicode_READY(obj) == -1) {
        return NULL;
    }
    sz = PyUnicode_GET_LENGTH(obj);
    // Lengths and capacities are passed around as int32_t, and the widest case
    // below needs 2 * (sz + 1) units, so refuse anything that would not fit.
    if (sz >= INT32_MAX / 2) {
        PyErr_SetString(PyExc_ValueError, "Unicode string is too long");
        return NULL;
    }

    switch (PyUnicode_KIND(obj)) {
    case PyUnicode_1BYTE_KIND:
        ans = (UChar*) malloc((sz + 1) * sizeof(UChar));
        if (ans == NULL) {
            PyErr_NoMemory();
            return NULL;
        }
        // 1-byte storage holds Latin-1 code points (U+0000..U+00FF), not
        // UTF-8, so each byte widens directly to one UTF-16 unit. Decoding it
        // as UTF-8 would reject every character above U+007F.
        for (i = 0; i < sz; i++) {
            ans[i] = (UChar) PyUnicode_1BYTE_DATA(obj)[i];
        }
        ans[sz] = 0;
        if (osz != NULL) *osz = (int32_t) sz;
        break;
    case PyUnicode_2BYTE_KIND:
        ans = (UChar*) malloc((sz + 1) * sizeof(UChar));
        if (ans == NULL) {
            PyErr_NoMemory();
            return NULL;
        }
        // UChar might be more than 2 bytes, so copy element by element rather
        // than with memcpy.
        for (i = 0; i < sz; i++) {
            ans[i] = (UChar) PyUnicode_2BYTE_DATA(obj)[i];
        }
        ans[sz] = 0;
        if (osz != NULL) *osz = (int32_t) sz;
        break;
    case PyUnicode_4BYTE_KIND:
        // Every code point may need a surrogate pair; +1 for the NUL.
        ans = (UChar*) malloc(2 * (sz + 1) * sizeof(UChar));
        if (ans == NULL) {
            PyErr_NoMemory();
            return NULL;
        }
        u_strFromUTF32(
            ans, (int32_t) (2 * (sz + 1)),
            osz,
            (UChar32*) PyUnicode_4BYTE_DATA(obj),
            (int32_t) sz,
            &status);
        break;
    default:
        // Never return NULL without an exception set.
        PyErr_SetString(PyExc_TypeError, "Unsupported unicode string kind");
        return NULL;
    }

    if (U_FAILURE(status)) {
        PyErr_SetString(PyExc_ValueError, u_errorName(status));
        free(ans);
        return NULL;
    }
    return ans;
}
#ifndef NO_PYTHON_TO_ICU32
/*
 * Convert an exact Python unicode object into a newly malloc()ed,
 * zero-terminated array of UChar32 code points (Python 3.3+).
 *
 * obj: the Python object to convert; anything that is not an exact unicode
 *      object sets TypeError.
 * osz: if not NULL, receives the number of code points written, not counting
 *      the terminating zero.
 *
 * Returns the buffer (the caller is responsible for free()ing it), or NULL
 * with a Python exception set.
 */
static UChar32* python_to_icu32(PyObject *obj, int32_t *osz) {
    UChar32 *ans = NULL;
    Py_ssize_t sz = 0, i = 0;

    if (!PyUnicode_CheckExact(obj)) {
        PyErr_SetString(PyExc_TypeError, "Not a unicode string");
        return NULL;
    }
    if (PyUnicode_READY(obj) == -1) {
        return NULL;
    }
    sz = PyUnicode_GET_LENGTH(obj);
    // The length is reported through an int32_t, so it has to fit in one.
    if (sz >= INT32_MAX) {
        PyErr_SetString(PyExc_ValueError, "Unicode string is too long");
        return NULL;
    }
    ans = (UChar32*) malloc((sz + 1) * sizeof(UChar32));
    if (ans == NULL) {
        PyErr_NoMemory();
        return NULL;
    }
    // PyUnicode_READ_CHAR is used for every index so this loop does not need
    // to branch on the string's storage kind.
    for (i = 0; i < sz; i++) {
        ans[i] = (UChar32) PyUnicode_READ_CHAR(obj, i);
    }
    ans[sz] = 0;
    if (osz != NULL) *osz = (int32_t) sz;
    return ans;
}
#endif
#ifndef NO_ICU_TO_PYTHON
/*
 * Build a Python unicode object from a UTF-16 ICU buffer (Python 3.3+).
 *
 * src: the UTF-16 code units to decode.
 * sz:  number of UChar units in src.
 *
 * Returns whatever PyUnicode_DecodeUTF16 returns for that buffer.
 */
static PyObject* icu_to_python(UChar *src, int32_t sz) {
    // PyUnicode_DecodeUTF16 takes its length in bytes, not in UChar units;
    // passing sz unscaled would decode only the first half of the buffer.
    return PyUnicode_DecodeUTF16((char*) src, sz * sizeof(UChar), NULL, NULL);
}
#endif
#endif // end PY3.3+

View File

@ -155,10 +155,6 @@ static double calc_score_for_char(MatchInfo *m, UChar32 last, UChar32 current, i
}
static void convert_positions(int32_t *positions, int32_t *final_positions, UChar *string, int32_t char_len, int32_t byte_len, double score) {
#if PY_VERSION_HEX >= 0x03030000
#error Not implemented for python >= 3.3
#endif
// The positions array stores character positions as byte offsets in string, convert them into character offsets
int32_t i, *end;
@ -167,10 +163,14 @@ static void convert_positions(int32_t *positions, int32_t *final_positions, UCha
end = final_positions + char_len;
for (i = 0; i < byte_len && final_positions < end; i++) {
if (positions[i] == -1) continue;
#if PY_VERSION_HEX >= 0x03030000
*final_positions = positions[i];
#else
#ifdef Py_UNICODE_WIDE
*final_positions = u_countChar32(string, positions[i]);
#else
*final_positions = positions[i];
#endif
#endif
final_positions += 1;
}
@ -331,7 +331,6 @@ typedef struct {
UChar *level2;
UChar *level3;
UCollator *collator;
} Matcher;
// Matcher.__init__() {{{
@ -349,7 +348,7 @@ static void
Matcher_dealloc(Matcher* self)
{
free_matcher(self);
self->ob_type->tp_free((PyObject*)self);
Py_TYPE(self)->tp_free((PyObject*)self);
}
// Expands to 3 * x + 1. The argument is parenthesized so that an expression
// such as alloc_uchar(a + b) scales the whole sum instead of only `b`.
// NOTE(review): presumably a UChar buffer capacity for x input units plus a
// terminator -- confirm against the call sites.
#define alloc_uchar(x) ((x) * 3 + 1)
@ -377,16 +376,16 @@ Matcher_init(Matcher *self, PyObject *args, PyObject *kwds)
self->items = (UChar**)calloc(self->item_count, sizeof(UChar*));
self->item_lengths = (int32_t*)calloc(self->item_count, sizeof(uint32_t));
self->level1 = python_to_icu(level1, NULL, 1);
self->level2 = python_to_icu(level2, NULL, 1);
self->level3 = python_to_icu(level3, NULL, 1);
self->level1 = python_to_icu(level1, NULL);
self->level2 = python_to_icu(level2, NULL);
self->level3 = python_to_icu(level3, NULL);
if (self->items == NULL || self->item_lengths == NULL ) { PyErr_NoMemory(); goto end; }
if (self->level1 == NULL || self->level2 == NULL || self->level3 == NULL) goto end;
for (i = 0; i < (int32_t)self->item_count; i++) {
p = PySequence_Fast_GET_ITEM(py_items, i);
self->items[i] = python_to_icu(p, self->item_lengths + i, 1);
self->items[i] = python_to_icu(p, self->item_lengths + i);
if (self->items[i] == NULL) { PyErr_NoMemory(); goto end; }
}
@ -409,7 +408,7 @@ Matcher_calculate_scores(Matcher *self, PyObject *args) {
if (!PyArg_ParseTuple(args, "O", &pneedle)) return NULL;
needle = python_to_icu(pneedle, NULL, 1);
needle = python_to_icu(pneedle, NULL);
if (needle == NULL) return NULL;
needle_char_len = u_countChar32(needle, -1);
items = PyTuple_New(self->item_count);
@ -435,7 +434,7 @@ Matcher_calculate_scores(Matcher *self, PyObject *args) {
PyTuple_SET_ITEM(items, (Py_ssize_t)i, score);
p = final_positions + (i * needle_char_len);
for (j = 0; j < needle_char_len; j++) {
score = PyInt_FromLong((long)p[j]);
score = PyLong_FromLong((long)p[j]);
if (score == NULL) { PyErr_NoMemory(); goto end; }
PyTuple_SET_ITEM(PyTuple_GET_ITEM(positions, (Py_ssize_t)i), (Py_ssize_t)j, score);
}
@ -455,72 +454,88 @@ static PyMethodDef Matcher_methods[] = {
"calculate_scores(query) -> Return the scores for all items given query as a tuple."
},
{NULL} /* Sentinel */
{NULL, NULL} /* Sentinel */
};
// }}}
static PyTypeObject MatcherType = { // {{{
PyObject_HEAD_INIT(NULL)
0, /*ob_size*/
"matcher.Matcher", /*tp_name*/
sizeof(Matcher), /*tp_basicsize*/
0, /*tp_itemsize*/
(destructor)Matcher_dealloc, /*tp_dealloc*/
0, /*tp_print*/
0, /*tp_getattr*/
0, /*tp_setattr*/
0, /*tp_compare*/
0, /*tp_repr*/
0, /*tp_as_number*/
0, /*tp_as_sequence*/
0, /*tp_as_mapping*/
0, /*tp_hash */
0, /*tp_call*/
0, /*tp_str*/
0, /*tp_getattro*/
0, /*tp_setattro*/
0, /*tp_as_buffer*/
Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE, /*tp_flags*/
"Matcher", /* tp_doc */
0, /* tp_traverse */
0, /* tp_clear */
0, /* tp_richcompare */
0, /* tp_weaklistoffset */
0, /* tp_iter */
0, /* tp_iternext */
Matcher_methods, /* tp_methods */
0, /* tp_members */
0, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
(initproc)Matcher_init, /* tp_init */
0, /* tp_alloc */
0, /* tp_new */
PyVarObject_HEAD_INIT(NULL, 0)
/* tp_name */ "matcher.Matcher",
/* tp_basicsiz */ sizeof(Matcher),
/* tp_itemsize */ 0,
/* tp_dealloc */ (destructor)Matcher_dealloc,
/* tp_print */ 0,
/* tp_getattr */ 0,
/* tp_setattr */ 0,
/* tp_as_async */ 0,
/* tp_repr */ 0,
/* tp_as_number */ 0,
/* tp_as_sequence */ 0,
/* tp_as_mapping */ 0,
/* tp_hash */ 0,
/* tp_call */ 0,
/* tp_str */ 0,
/* tp_getattro */ 0,
/* tp_setattro */ 0,
/* tp_as_buffer */ 0,
/* tp_flags */ Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE,
/* tp_doc */ "Matcher",
/* tp_traverse */ 0,
/* tp_clear */ 0,
/* tp_richcompare */ 0,
/* tp_weaklistoffset */ 0,
/* tp_iter */ 0,
/* tp_iternext */ 0,
/* tp_methods */ Matcher_methods,
/* tp_members */ 0,
/* tp_getset */ 0,
/* tp_base */ 0,
/* tp_dict */ 0,
/* tp_descr_get */ 0,
/* tp_descr_set */ 0,
/* tp_dictoffset */ 0,
/* tp_init */ (initproc)Matcher_init,
/* tp_alloc */ 0,
/* tp_new */ PyType_GenericNew,
}; // }}}
static PyMethodDef matcher_methods[] = {
{NULL, NULL, 0, NULL}
#if PY_MAJOR_VERSION >= 3
#define INITERROR return NULL
/*
 * Module definition for the `matcher` extension, used only when
 * PY_MAJOR_VERSION >= 3 (it is handed to PyModule_Create in PyInit_matcher).
 *
 * The initializers are positional; the comment on each line names the
 * PyModuleDef field that value is intended to fill. m_methods is 0, so no
 * module-level functions are registered through this table.
 * NOTE(review): m_size of -1 presumably means the module keeps no
 * per-interpreter state -- confirm against the PyModuleDef documentation.
 */
static struct PyModuleDef matcher_module = {
/* m_base */ PyModuleDef_HEAD_INIT,
/* m_name */ "matcher",
/* m_doc */ "Find subsequence matches.",
/* m_size */ -1,
/* m_methods */ 0,
/* m_slots */ 0,
/* m_traverse */ 0,
/* m_clear */ 0,
/* m_free */ 0,
};
CALIBRE_MODINIT_FUNC PyInit_matcher(void) {
PyObject *mod = PyModule_Create(&matcher_module);
#else
#define INITERROR return
CALIBRE_MODINIT_FUNC initmatcher(void) {
PyObject *mod = Py_InitModule3("matcher", NULL, "Find subsequence matches");
#endif
CALIBRE_MODINIT_FUNC
initmatcher(void) {
PyObject *m;
MatcherType.tp_new = PyType_GenericNew;
if (PyType_Ready(&MatcherType) < 0)
return;
m = Py_InitModule3("matcher", matcher_methods, "Find subsequence matches");
if (m == NULL) return;
if (mod == NULL) INITERROR;
if (PyType_Ready(&MatcherType) < 0) {
INITERROR;
}
Py_INCREF(&MatcherType);
PyModule_AddObject(m, "Matcher", (PyObject *)&MatcherType);
if(PyModule_AddObject(mod, "Matcher", (PyObject *)&MatcherType) < 0) {
Py_DECREF(&MatcherType);
INITERROR;
}
#if PY_MAJOR_VERSION >= 3
return mod;
#endif
}