Build tokenizer on py3

- The actual unicode manipulation wasn't converted to the new 3.3+
style, since it would lead to lots of ifdefs and become quite ugly. This
can be done when py2 support is dropped. The drawback will be
temporarily slower code (extra copying will be required).
This commit is contained in:
Flaviu Tamas 2018-11-27 00:29:40 -05:00
parent 526835e555
commit cf576342e1
No known key found for this signature in database
GPG Key ID: D6BF32C876496756

View File

@ -34,7 +34,7 @@ tokenizer_Token_dealloc(tokenizer_Token* self)
Py_XDECREF(self->unit); self->unit = NULL; Py_XDECREF(self->unit); self->unit = NULL;
Py_XDECREF(self->line); self->line = NULL; Py_XDECREF(self->line); self->line = NULL;
Py_XDECREF(self->column); self->column = NULL; Py_XDECREF(self->column); self->column = NULL;
self->ob_type->tp_free((PyObject*)self); Py_TYPE(self)->tp_free((PyObject*)self);
} }
@ -46,7 +46,8 @@ tokenizer_Token_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
if (self == NULL) return PyErr_NoMemory(); if (self == NULL) return PyErr_NoMemory();
if (!PyArg_ParseTuple(args, "OOOOOO", &(self->type), &(self->_as_css), &(self->value), &(self->unit), &(self->line), &(self->column))) { if (!PyArg_ParseTuple(args, "OOOOOO", &(self->type), &(self->_as_css), &(self->value), &(self->unit), &(self->line), &(self->column))) {
self->ob_type->tp_free((PyObject*)self); return NULL; Py_TYPE(self)->tp_free((PyObject *) self);
return NULL;
} }
Py_INCREF(self->type); Py_INCREF(self->_as_css); Py_INCREF(self->value); Py_INCREF(self->unit); Py_INCREF(self->line); Py_INCREF(self->column); Py_INCREF(self->type); Py_INCREF(self->_as_css); Py_INCREF(self->value); Py_INCREF(self->unit); Py_INCREF(self->line); Py_INCREF(self->column);
self->is_container = Py_False; Py_INCREF(self->is_container); self->is_container = Py_False; Py_INCREF(self->is_container);
@ -54,15 +55,24 @@ tokenizer_Token_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
return (PyObject *)self; return (PyObject *)self;
} }
/* PyObject_Unicode() does not exist on Python 3, where PyObject_Str()
 * already returns a unicode object.  This compat macro lets the repr
 * code below use a single name on both Python 2 and Python 3. */
#if PY_MAJOR_VERSION >= 3
#define PyObject_Unicode_Compat(arg) PyObject_Str(arg)
#else
#define PyObject_Unicode_Compat(arg) PyObject_Unicode(arg)
#endif
static PyObject * static PyObject *
tokenizer_Token_repr(tokenizer_Token *self) { tokenizer_Token_repr(tokenizer_Token *self) {
PyObject *type = NULL, *line = NULL, *column = NULL, *value = NULL, *ans = NULL, *unit = NULL; PyObject *type = NULL, *line = NULL, *column = NULL, *value = NULL, *ans = NULL, *unit = NULL;
if (!self->type || !self->line || !self->column || !self->value) if (!self->type || !self->line || !self->column || !self->value)
return PyBytes_FromString("<Token NULL fields>"); return PyBytes_FromString("<Token NULL fields>");
type = PyObject_Unicode(self->type); line = PyObject_Unicode(self->line); column = PyObject_Unicode(self->column); value = PyObject_Unicode(self->value); type = PyObject_Unicode_Compat(self->type);
line = PyObject_Unicode_Compat(self->line);
column = PyObject_Unicode_Compat(self->column);
value = PyObject_Unicode_Compat(self->value);
if (type && line && column && value) { if (type && line && column && value) {
if (self->unit != NULL && PyObject_IsTrue(self->unit)) { if (self->unit != NULL && PyObject_IsTrue(self->unit)) {
unit = PyObject_Unicode(self->unit); unit = PyObject_Unicode_Compat(self->unit);
if (unit != NULL) if (unit != NULL)
ans = PyUnicode_FromFormat("<Token %U at %U:%U %U%U>", type, line, column, value, unit); ans = PyUnicode_FromFormat("<Token %U at %U:%U %U%U>", type, line, column, value, unit);
else else
@ -103,45 +113,44 @@ static PyMethodDef tokenizer_Token_methods[] = {
}; };
static PyTypeObject tokenizer_TokenType = { // {{{ static PyTypeObject tokenizer_TokenType = { // {{{
PyObject_HEAD_INIT(NULL) PyVarObject_HEAD_INIT(NULL, 0)
0, /*ob_size*/ /* tp_name */ "tokenizer.Token",
"tokenizer.Token", /*tp_name*/ /* tp_basicsize */ sizeof(tokenizer_Token),
sizeof(tokenizer_Token), /*tp_basicsize*/ /* tp_itemsize */ 0,
0, /*tp_itemsize*/ /* tp_dealloc */ (destructor) tokenizer_Token_dealloc,
(destructor)tokenizer_Token_dealloc, /*tp_dealloc*/ /* tp_print */ 0,
0, /*tp_print*/ /* tp_getattr */ 0,
0, /*tp_getattr*/ /* tp_setattr */ 0,
0, /*tp_setattr*/ /* tp_compare */ 0,
0, /*tp_compare*/ /* tp_repr */ (reprfunc) tokenizer_Token_repr,
(reprfunc)tokenizer_Token_repr, /*tp_repr*/ /* tp_as_number */ 0,
0, /*tp_as_number*/ /* tp_as_sequence */ 0,
0, /*tp_as_sequence*/ /* tp_as_mapping */ 0,
0, /*tp_as_mapping*/ /* tp_hash */ 0,
0, /*tp_hash */ /* tp_call */ 0,
0, /*tp_call*/ /* tp_str */ 0,
0, /*tp_str*/ /* tp_getattro */ 0,
0, /*tp_getattro*/ /* tp_setattro */ 0,
0, /*tp_setattro*/ /* tp_as_buffer */ 0,
0, /*tp_as_buffer*/ /* tp_flags */ Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE,
Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE, /*tp_flags*/ /* tp_doc */ "Token",
"Token", /* tp_doc */ /* tp_traverse */ 0,
0, /* tp_traverse */ /* tp_clear */ 0,
0, /* tp_clear */ /* tp_richcompare */ 0,
0, /* tp_richcompare */ /* tp_weaklistoffset */ 0,
0, /* tp_weaklistoffset */ /* tp_iter */ 0,
0, /* tp_iter */ /* tp_iternext */ 0,
0, /* tp_iternext */ /* tp_methods */ tokenizer_Token_methods,
tokenizer_Token_methods, /* tp_methods */ /* tp_members */ tokenizer_Token_members,
tokenizer_Token_members, /* tp_members */ /* tp_getset */ 0,
0, /* tp_getset */ /* tp_base */ 0,
0, /* tp_base */ /* tp_dict */ 0,
0, /* tp_dict */ /* tp_descr_get */ 0,
0, /* tp_descr_get */ /* tp_descr_set */ 0,
0, /* tp_descr_set */ /* tp_dictoffset */ 0,
0, /* tp_dictoffset */ /* tp_init */ 0,
0, /* tp_init */ /* tp_alloc */ 0,
0, /* tp_alloc */ /* tp_new */ tokenizer_Token_new,
tokenizer_Token_new, /* tp_new */
}; // }}} }; // }}}
// }}} // }}}
@ -170,7 +179,7 @@ tokenize_init(PyObject *self, PyObject *args) {
Py_INCREF(COMPILED_TOKEN_REGEXPS); Py_INCREF(UNICODE_UNESCAPE); Py_INCREF(NEWLINE_UNESCAPE); Py_INCREF(SIMPLE_UNESCAPE); Py_INCREF(FIND_NEWLINES); Py_INCREF(TOKEN_DISPATCH); Py_INCREF(COMPILED_TOKEN_REGEXPS); Py_INCREF(UNICODE_UNESCAPE); Py_INCREF(NEWLINE_UNESCAPE); Py_INCREF(SIMPLE_UNESCAPE); Py_INCREF(FIND_NEWLINES); Py_INCREF(TOKEN_DISPATCH);
Py_INCREF(COLON); Py_INCREF(SCOLON); Py_INCREF(LPAR); Py_INCREF(RPAR); Py_INCREF(LBRACE); Py_INCREF(RBRACE); Py_INCREF(LBOX); Py_INCREF(RBOX); Py_INCREF(DELIM_TOK); Py_INCREF(INTEGER); Py_INCREF(STRING_TOK); Py_INCREF(COLON); Py_INCREF(SCOLON); Py_INCREF(LPAR); Py_INCREF(RPAR); Py_INCREF(LBRACE); Py_INCREF(RBRACE); Py_INCREF(LBOX); Py_INCREF(RBOX); Py_INCREF(DELIM_TOK); Py_INCREF(INTEGER); Py_INCREF(STRING_TOK);
#define SETCONST(x) x = PyInt_AsSsize_t(PyDict_GetItemString(cti, #x)) #define SETCONST(x) x = PyLong_AsSsize_t(PyDict_GetItemString(cti, #x))
SETCONST(BAD_COMMENT); SETCONST(BAD_STRING); SETCONST(PERCENTAGE); SETCONST(DIMENSION); SETCONST(ATKEYWORD); SETCONST(FUNCTION); SETCONST(COMMENT); SETCONST(NUMBER); SETCONST(STRING); SETCONST(IDENT); SETCONST(HASH); SETCONST(URI); SETCONST(BAD_COMMENT); SETCONST(BAD_STRING); SETCONST(PERCENTAGE); SETCONST(DIMENSION); SETCONST(ATKEYWORD); SETCONST(FUNCTION); SETCONST(COMMENT); SETCONST(NUMBER); SETCONST(STRING); SETCONST(IDENT); SETCONST(HASH); SETCONST(URI);
Py_RETURN_NONE; Py_RETURN_NONE;
@ -178,9 +187,6 @@ tokenize_init(PyObject *self, PyObject *args) {
static int static int
contains_char(PyObject *haystack, Py_UNICODE c) { contains_char(PyObject *haystack, Py_UNICODE c) {
#if PY_VERSION_HEX >= 0x03030000
#error Not implemented for python >= 3.3
#endif
Py_ssize_t i = 0; Py_ssize_t i = 0;
Py_UNICODE *data = PyUnicode_AS_UNICODE(haystack); Py_UNICODE *data = PyUnicode_AS_UNICODE(haystack);
for (i = 0; i < PyUnicode_GET_SIZE(haystack); i++) { for (i = 0; i < PyUnicode_GET_SIZE(haystack); i++) {
@ -190,25 +196,31 @@ contains_char(PyObject *haystack, Py_UNICODE c) {
} }
/* Parse a unicode object holding an ASCII numeric literal.
 *
 * Returns a new reference to a float (if the text contains '.') or an
 * int/long otherwise; returns NULL with an exception set on failure.
 *
 * Reconstructed from the flattened diff, with one bug fix: the py3
 * integer branch called PyLong_FromUnicodeObject(raw, 10), but `raw` is
 * the *bytes* object produced by PyUnicode_AsASCIIString, and
 * PyLong_FromUnicodeObject requires a unicode object — every integer
 * token would raise TypeError on py3.  Pass the original `src` instead. */
static PyObject *unicode_to_number(PyObject *src) {
    PyObject *raw = NULL, *ans = NULL;
    /* ASCII-encode once; also validates the text is pure ASCII. */
    raw = PyUnicode_AsASCIIString(src);
    if (raw == NULL) { return NULL; }
    if (contains_char(src, '.')) {
#if PY_MAJOR_VERSION >= 3
        ans = PyFloat_FromString(raw);          /* py3: single-argument form */
#else
        ans = PyFloat_FromString(raw, NULL);
#endif
    } else {
#if PY_MAJOR_VERSION >= 3
        /* BUGFIX: must parse the unicode source, not the bytes object. */
        ans = PyLong_FromUnicodeObject(src, 10);
#else
        ans = PyInt_FromString(PyString_AS_STRING(raw), NULL, 10);
#endif
    }
    Py_DECREF(raw);
    return ans;
}
// TODO Convert this to use the 3.3+ unicode API
// doing so while preserving py2 compat would lead to a giant mess of #ifs, so
// it's better to do it when calibre is all migrated to py3
static void lowercase(PyObject *x) { static void lowercase(PyObject *x) {
#if PY_VERSION_HEX >= 0x03030000
#error Not implemented for python >= 3.3
#endif
Py_ssize_t i = 0; Py_ssize_t i = 0;
Py_UNICODE *data = PyUnicode_AS_UNICODE(x); Py_UNICODE *data = PyUnicode_AS_UNICODE(x);
for (i = 0; i < PyUnicode_GET_SIZE(x); i++) for (i = 0; i < PyUnicode_GET_SIZE(x); i++)
@ -216,9 +228,6 @@ static void lowercase(PyObject *x) {
} }
static PyObject *clone_unicode(Py_UNICODE *x, Py_ssize_t sz) { static PyObject *clone_unicode(Py_UNICODE *x, Py_ssize_t sz) {
#if PY_VERSION_HEX >= 0x03030000
#error Not implemented for python >= 3.3
#endif
PyObject *ans = PyUnicode_FromUnicode(NULL, sz); PyObject *ans = PyUnicode_FromUnicode(NULL, sz);
if (ans == NULL) return PyErr_NoMemory(); if (ans == NULL) return PyErr_NoMemory();
memcpy(PyUnicode_AS_UNICODE(ans), x, sz * sizeof(Py_UNICODE)); memcpy(PyUnicode_AS_UNICODE(ans), x, sz * sizeof(Py_UNICODE));
@ -227,9 +236,6 @@ static PyObject* clone_unicode(Py_UNICODE *x, Py_ssize_t sz) {
static PyObject* static PyObject*
tokenize_flat(PyObject *self, PyObject *args) { tokenize_flat(PyObject *self, PyObject *args) {
#if PY_VERSION_HEX >= 0x03030000
#error Not implemented for python >= 3.3
#endif
Py_UNICODE *css_source = NULL, c = 0, codepoint = 0; Py_UNICODE *css_source = NULL, c = 0, codepoint = 0;
PyObject *ic = NULL, *token = NULL, *tokens = NULL, *type_name = NULL, *css_value = NULL, *value = NULL, *unit = NULL, *tries = NULL, *match = NULL, *match_func = NULL, *py_source = NULL, *item = NULL, *newlines = NULL; PyObject *ic = NULL, *token = NULL, *tokens = NULL, *type_name = NULL, *css_value = NULL, *value = NULL, *unit = NULL, *tries = NULL, *match = NULL, *match_func = NULL, *py_source = NULL, *item = NULL, *newlines = NULL;
int ignore_comments = 0; int ignore_comments = 0;
@ -272,7 +278,7 @@ tokenize_flat(PyObject *self, PyObject *args) {
if (match != Py_None) { if (match != Py_None) {
css_value = PyObject_CallMethod(match, "group", NULL); css_value = PyObject_CallMethod(match, "group", NULL);
if (css_value == NULL) { goto error; } if (css_value == NULL) { goto error; }
type_ = PyInt_AsSsize_t(PyTuple_GET_ITEM(item, 0)); type_ = PyLong_AsSsize_t(PyTuple_GET_ITEM(item, 0));
type_name = PyTuple_GET_ITEM(item, 1); type_name = PyTuple_GET_ITEM(item, 1);
Py_INCREF(type_name); Py_INCREF(type_name);
break; break;
@ -415,17 +421,41 @@ static PyMethodDef tokenizer_methods[] = {
{NULL, NULL, 0, NULL} {NULL, NULL, 0, NULL}
}; };
#if PY_MAJOR_VERSION >= 3
#define INITERROR return NULL
static struct PyModuleDef tokenizer_module = {
/* m_base */ PyModuleDef_HEAD_INIT,
/* m_name */ "tokenizer",
/* m_doc */ "Implementation of tokenizer in C for speed.",
/* m_size */ -1,
/* m_methods */ tokenizer_methods,
/* m_slots */ 0,
/* m_traverse */ 0,
/* m_clear */ 0,
/* m_free */ 0,
};
CALIBRE_MODINIT_FUNC CALIBRE_MODINIT_FUNC PyInit_tokenizer(void) {
inittokenizer(void) {
PyObject *m;
if (PyType_Ready(&tokenizer_TokenType) < 0) if (PyType_Ready(&tokenizer_TokenType) < 0)
return; INITERROR;
m = Py_InitModule3("tokenizer", tokenizer_methods, PyObject *mod = PyModule_Create(&tokenizer_module);
"Implementation of tokenizer in C for speed." #else
); #define INITERROR return
if (m == NULL) return; CALIBRE_MODINIT_FUNC inittokenizer(void) {
if (PyType_Ready(&tokenizer_TokenType) < 0)
INITERROR;
PyObject *mod = Py_InitModule3("tokenizer", tokenizer_methods,
"Implementation of tokenizer in C for speed.");
#endif
if (mod == NULL) INITERROR;
Py_INCREF(&tokenizer_TokenType); Py_INCREF(&tokenizer_TokenType);
PyModule_AddObject(m, "Token", (PyObject *)&tokenizer_TokenType); PyModule_AddObject(mod, "Token", (PyObject *) &tokenizer_TokenType);
#if PY_MAJOR_VERSION >= 3
return mod;
#endif
} }