/* * tokenizer.c * Copyright (C) 2014 Kovid Goyal * * Distributed under terms of the GPL3 license. */ #define UNICODE #define PY_SSIZE_T_CLEAN #include #include // Token type definition {{{ typedef struct { PyObject_HEAD // Type-specific fields go here. PyObject *is_container; PyObject *type; PyObject *_as_css; PyObject *value; PyObject *unit; PyObject *line; PyObject *column; } tokenizer_Token; static void tokenizer_Token_dealloc(tokenizer_Token* self) { Py_XDECREF(self->is_container); self->is_container = NULL; Py_XDECREF(self->type); self->type = NULL; Py_XDECREF(self->_as_css); self->_as_css = NULL; Py_XDECREF(self->value); self->value = NULL; Py_XDECREF(self->unit); self->unit = NULL; Py_XDECREF(self->line); self->line = NULL; Py_XDECREF(self->column); self->column = NULL; Py_TYPE(self)->tp_free((PyObject*)self); } static PyObject * tokenizer_Token_new(PyTypeObject *type, PyObject *args, PyObject *kwds) { tokenizer_Token *self = NULL; self = (tokenizer_Token *)type->tp_alloc(type, 0); if (self == NULL) return PyErr_NoMemory(); if (!PyArg_ParseTuple(args, "OOOOOO", &(self->type), &(self->_as_css), &(self->value), &(self->unit), &(self->line), &(self->column))) { Py_TYPE(self)->tp_free((PyObject *) self); return NULL; } Py_INCREF(self->type); Py_INCREF(self->_as_css); Py_INCREF(self->value); Py_INCREF(self->unit); Py_INCREF(self->line); Py_INCREF(self->column); self->is_container = Py_False; Py_INCREF(self->is_container); return (PyObject *)self; } static PyObject * tokenizer_Token_repr(tokenizer_Token *self) { PyObject *type = NULL, *line = NULL, *column = NULL, *value = NULL, *ans = NULL, *unit = NULL; if (!self->type || !self->line || !self->column || !self->value) return PyBytes_FromString(""); type = PyObject_Str(self->type); line = PyObject_Str(self->line); column = PyObject_Str(self->column); value = PyObject_Str(self->value); if (type && line && column && value) { if (self->unit != NULL && PyObject_IsTrue(self->unit)) { unit = PyObject_Str(self->unit); if (unit != NULL) ans = PyUnicode_FromFormat("", type, line, column, value, unit); else PyErr_NoMemory(); } else ans = PyUnicode_FromFormat("", type, line, column, value); } else PyErr_NoMemory(); Py_XDECREF(type); Py_XDECREF(line); Py_XDECREF(column); Py_XDECREF(value); Py_XDECREF(unit); return ans; } static PyObject * tokenizer_Token_as_css(tokenizer_Token *self, PyObject *args, PyObject *kwargs) { if (!self->_as_css) { Py_RETURN_NONE; } Py_INCREF(self->_as_css); return self->_as_css; } static PyMemberDef tokenizer_Token_members[] = { {"is_container", T_OBJECT_EX, offsetof(tokenizer_Token, is_container), 0, "False unless this token is a container for other tokens"}, {"type", T_OBJECT_EX, offsetof(tokenizer_Token, type), 0, "The token type"}, {"_as_css", T_OBJECT_EX, offsetof(tokenizer_Token, _as_css), 0, "Internal variable, use as_css() method instead."}, {"value", T_OBJECT_EX, offsetof(tokenizer_Token, value), 0, "The token value"}, {"unit", T_OBJECT_EX, offsetof(tokenizer_Token, unit), 0, "The token unit"}, {"line", T_OBJECT_EX, offsetof(tokenizer_Token, line), 0, "The token line number"}, {"column", T_OBJECT_EX, offsetof(tokenizer_Token, column), 0, "The token column number"}, {NULL} /* Sentinel */ }; static PyMethodDef tokenizer_Token_methods[] = { {"as_css", (PyCFunction)tokenizer_Token_as_css, METH_VARARGS, "as_css() -> Return the CSS representation of this token" }, {NULL} /* Sentinel */ }; static PyTypeObject tokenizer_TokenType = { // {{{ PyVarObject_HEAD_INIT(NULL, 0) /* tp_name */ "tokenizer.Token", /* tp_basicsize */ sizeof(tokenizer_Token), /* tp_itemsize */ 0, /* tp_dealloc */ (destructor) tokenizer_Token_dealloc, /* tp_print */ 0, /* tp_getattr */ 0, /* tp_setattr */ 0, /* tp_compare */ 0, /* tp_repr */ (reprfunc) tokenizer_Token_repr, /* tp_as_number */ 0, /* tp_as_sequence */ 0, /* tp_as_mapping */ 0, /* tp_hash */ 0, /* tp_call */ 0, /* tp_str */ 0, /* tp_getattro */ 0, /* tp_setattro */ 0, /* tp_as_buffer */ 0, /* tp_flags */ Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_doc */ "Token", /* tp_traverse */ 0, /* tp_clear */ 0, /* tp_richcompare */ 0, /* tp_weaklistoffset */ 0, /* tp_iter */ 0, /* tp_iternext */ 0, /* tp_methods */ tokenizer_Token_methods, /* tp_members */ tokenizer_Token_members, /* tp_getset */ 0, /* tp_base */ 0, /* tp_dict */ 0, /* tp_descr_get */ 0, /* tp_descr_set */ 0, /* tp_dictoffset */ 0, /* tp_init */ 0, /* tp_alloc */ 0, /* tp_new */ tokenizer_Token_new, }; // }}} // }}} static PyObject *COMPILED_TOKEN_REGEXPS = NULL, *UNICODE_UNESCAPE = NULL, *NEWLINE_UNESCAPE = NULL, *SIMPLE_UNESCAPE = NULL, *FIND_NEWLINES = NULL, *TOKEN_DISPATCH = NULL; static PyObject *COLON = NULL, *SCOLON = NULL, *LPAR = NULL, *RPAR = NULL, *LBRACE = NULL, *RBRACE = NULL, *LBOX = NULL, *RBOX = NULL, *DELIM_TOK = NULL, *INTEGER = NULL, *STRING_TOK = NULL; static Py_ssize_t BAD_COMMENT, BAD_STRING, PERCENTAGE, DIMENSION, ATKEYWORD, FUNCTION, COMMENT, NUMBER, STRING, IDENT, HASH, URI, DELIM = -1; #define CLEANUP(x) Py_XDECREF((x)); x = NULL; static PyObject* tokenize_cleanup(PyObject *self, PyObject *args) { CLEANUP(COMPILED_TOKEN_REGEXPS); CLEANUP(UNICODE_UNESCAPE); CLEANUP(NEWLINE_UNESCAPE); CLEANUP(SIMPLE_UNESCAPE); CLEANUP(FIND_NEWLINES); CLEANUP(TOKEN_DISPATCH); CLEANUP(COLON); CLEANUP(SCOLON); CLEANUP(LPAR); CLEANUP(RPAR); CLEANUP(LBRACE); CLEANUP(RBRACE); CLEANUP(LBOX); CLEANUP(RBOX); CLEANUP(DELIM_TOK); CLEANUP(INTEGER); CLEANUP(STRING_TOK); Py_RETURN_NONE; } static PyObject* tokenize_init(PyObject *self, PyObject *args) { PyObject *cti = NULL; if (COMPILED_TOKEN_REGEXPS != NULL) { tokenize_cleanup(NULL, NULL); } if (!PyArg_ParseTuple(args, "OOOOOOOOOOOOOOOOOO", &COMPILED_TOKEN_REGEXPS, &UNICODE_UNESCAPE, &NEWLINE_UNESCAPE, &SIMPLE_UNESCAPE, &FIND_NEWLINES, &TOKEN_DISPATCH, &cti, &COLON, &SCOLON, &LPAR, &RPAR, &LBRACE, &RBRACE, &LBOX, &RBOX, &DELIM_TOK, &INTEGER, &STRING_TOK)) return NULL; Py_INCREF(COMPILED_TOKEN_REGEXPS); Py_INCREF(UNICODE_UNESCAPE); Py_INCREF(NEWLINE_UNESCAPE); Py_INCREF(SIMPLE_UNESCAPE); Py_INCREF(FIND_NEWLINES); Py_INCREF(TOKEN_DISPATCH); Py_INCREF(COLON); Py_INCREF(SCOLON); Py_INCREF(LPAR); Py_INCREF(RPAR); Py_INCREF(LBRACE); Py_INCREF(RBRACE); Py_INCREF(LBOX); Py_INCREF(RBOX); Py_INCREF(DELIM_TOK); Py_INCREF(INTEGER); Py_INCREF(STRING_TOK); #define SETCONST(x) do { (x) = PyNumber_AsSsize_t(PyDict_GetItemString(cti, #x), PyExc_OverflowError); \ if((x) == -1 && PyErr_Occurred() != NULL) { return NULL; } \ } while(0) SETCONST(BAD_COMMENT); SETCONST(BAD_STRING); SETCONST(PERCENTAGE); SETCONST(DIMENSION); SETCONST(ATKEYWORD); SETCONST(FUNCTION); SETCONST(COMMENT); SETCONST(NUMBER); SETCONST(STRING); SETCONST(IDENT); SETCONST(HASH); SETCONST(URI); Py_RETURN_NONE; } #if PY_VERSION_HEX >= 0x03030000 #define ITER_CODE_PTS(unicode_object) { \ int _kind = PyUnicode_KIND(unicode_object); \ void *_data = PyUnicode_DATA(unicode_object); \ for (Py_ssize_t iteridx = 0; iteridx < PyUnicode_GET_LENGTH(unicode_object); iteridx++) { \ Py_UCS4 ch = PyUnicode_READ(_kind, _data, iteridx); #else #define PyUnicode_GET_LENGTH PyUnicode_GET_SIZE #define ITER_CODE_PTS(unicode_object) { \ Py_UNICODE *_data = PyUnicode_AS_UNICODE(unicode_object); \ Py_ssize_t iteridx; \ for (iteridx = 0; iteridx < PyUnicode_GET_LENGTH(unicode_object); iteridx++) { \ Py_UNICODE ch = _data[iteridx]; #endif #define END_ITER_CODE_PTS }} static PyObject *unicode_to_number(PyObject *src) { PyObject* ans = PyFloat_FromString(src); double val = PyFloat_AsDouble(ans); long lval = (long)val; if (val - lval != 0) return ans; Py_DECREF(ans); return PyLong_FromLong(lval); } static void lowercase(PyObject *x) { ITER_CODE_PTS(x) if ('A' <= ch && ch <= 'Z') { #if PY_VERSION_HEX >= 0x03030000 PyUnicode_WRITE(_kind, _data, iteridx, ch + 32); #else _data[iteridx] += 32; #endif } END_ITER_CODE_PTS } static PyObject* clone_unicode(const PyObject* src, Py_ssize_t start_offset, Py_ssize_t end_offset) { #if PY_VERSION_HEX >= 0x03030000 int kind = PyUnicode_KIND(src); void *data; switch(kind) { case PyUnicode_1BYTE_KIND: data = PyUnicode_1BYTE_DATA(src) + start_offset; break; case PyUnicode_2BYTE_KIND: data = PyUnicode_2BYTE_DATA(src) + start_offset; break; case PyUnicode_4BYTE_KIND: data = PyUnicode_4BYTE_DATA(src) + start_offset; break; default: PyErr_SetString(PyExc_RuntimeError, "Invalid byte kind for unicode object"); return NULL; } return PyUnicode_FromKindAndData(kind, data, PyUnicode_GET_LENGTH(src) - start_offset - end_offset); #else return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(src) + start_offset, PyUnicode_GET_LENGTH(src) - start_offset - end_offset); #endif } static PyObject* tokenize_flat(PyObject *self, PyObject *args) { #if PY_VERSION_HEX >= 0x03030000 void *css_source = NULL; int css_kind; Py_UCS4 c = 0, codepoint = 0; #define first_char(string) PyUnicode_READ_CHAR(string, 0) #define unicode_from_data(data, sz) PyUnicode_FromKindAndData(css_kind, data, sz) #else Py_UNICODE *css_source = NULL, c = 0, codepoint = 0; #define first_char(string) PyUnicode_AS_UNICODE(string)[0] #define unicode_from_data(data, sz) PyUnicode_FromUnicode(data, sz) #endif PyObject *ic = NULL, *token = NULL, *tokens = NULL, *type_name = NULL, *css_value = NULL, *value = NULL, *unit = NULL, *tries = NULL, *match = NULL, *match_func = NULL, *py_source = NULL, *item = NULL, *newlines = NULL; int ignore_comments = 0; Py_ssize_t pos = 0, line = 1, column = 1, i = 0; Py_ssize_t length = 0, next_pos = 0, type_ = -1, source_len = 0; if (COMPILED_TOKEN_REGEXPS == NULL) { PyErr_SetString(PyExc_RuntimeError, "tokenizer module not initialized. You must call init() first."); return NULL; } if (!PyArg_ParseTuple(args, "UO", &py_source, &ic)) return NULL; if (PyObject_IsTrue(ic)) ignore_comments = 1; #if PY_VERSION_HEX >= 0x03030000 if (PyUnicode_READY(py_source) != 0) return NULL; css_source = PyUnicode_DATA(py_source); css_kind = PyUnicode_KIND(py_source); #else css_source = PyUnicode_AS_UNICODE(py_source); #endif source_len = PyUnicode_GET_LENGTH(py_source); tokens = PyList_New(0); if (tokens == NULL) return PyErr_NoMemory(); #define UNESCAPE(x, func) item = PyObject_CallFunctionObjArgs(func, x, NULL); if (item == NULL) { goto error; } Py_DECREF(x); x = item; item = NULL; #define TONUMBER(x) item = unicode_to_number(x); if (item == NULL) goto error; Py_DECREF(x); x = item; item = NULL; #define SINGLE(x) { type_ = -1; type_name = x; Py_INCREF(type_name); css_value = x; Py_INCREF(css_value); } while (pos < source_len) { #if PY_VERSION_HEX >= 0x03030000 c = PyUnicode_READ(css_kind, css_source, pos); #else c = css_source[pos]; #endif css_value = NULL; type_name = NULL; value = NULL; unit = NULL; match = NULL; if (c == ':') SINGLE(COLON) else if (c == ';') SINGLE(SCOLON) else if (c == '(') SINGLE(LPAR) else if (c == ')') SINGLE(RPAR) else if (c == '{') SINGLE(LBRACE) else if (c == '}') SINGLE(RBRACE) else if (c == '[') SINGLE(LBOX) else if (c == ']') SINGLE(RBOX) else { codepoint = (c > 160) ? 160: c; tries = PyList_GET_ITEM(TOKEN_DISPATCH, codepoint); for (i = 0; i < PyList_Size(tries); i++) { item = PyList_GET_ITEM(tries, i); match_func = PyTuple_GET_ITEM(item, 2); match = PyObject_CallFunction(match_func, "On", py_source, pos); if (match == NULL) { goto error; } if (match != Py_None) { css_value = PyObject_CallMethod(match, "group", NULL); if (css_value == NULL) { goto error; } type_ = PyNumber_AsSsize_t(PyTuple_GET_ITEM(item, 0), PyExc_OverflowError); if(type_ == -1 && PyErr_Occurred() != NULL) { goto error; } type_name = PyTuple_GET_ITEM(item, 1); Py_INCREF(type_name); break; } } if (css_value == NULL) { // No match type_ = DELIM; type_name = DELIM_TOK; Py_INCREF(type_name); css_value = unicode_from_data(&c, 1); if (css_value == NULL) { goto error; } } } length = PyUnicode_GET_LENGTH(css_value); next_pos = pos + length; // Now calculate the value and unit for this token (if any) if (! (ignore_comments && (type_ == COMMENT || type_ == BAD_COMMENT))) { if (type_ == DIMENSION) { value = PyObject_CallMethod(match, "group", "I", 1); if (value == NULL) { goto error; } TONUMBER(value); unit = PyObject_CallMethod(match, "group", "I", 2); if (unit == NULL) { goto error; } UNESCAPE(unit, SIMPLE_UNESCAPE); UNESCAPE(unit, UNICODE_UNESCAPE); lowercase(unit); } else if (type_ == PERCENTAGE) { if (PyUnicode_GET_LENGTH(css_value) > 0) { value = clone_unicode(css_value, 0, 1); if (value == NULL) goto error; } else { value = css_value; Py_INCREF(value); } if (value == NULL) goto error; TONUMBER(value); unit = PyUnicode_FromString("%"); if (unit == NULL) goto error; } else if (type_ == NUMBER) { value = css_value; Py_INCREF(value); TONUMBER(value); if (!PyFloat_Check(value)) { Py_XDECREF(type_name); type_name = INTEGER; Py_INCREF(type_name); } } else if (type_ == IDENT || type_ == ATKEYWORD || type_ == HASH || type_ == FUNCTION) { value = PyObject_CallFunctionObjArgs(SIMPLE_UNESCAPE, css_value, NULL); if (value == NULL) goto error; UNESCAPE(value, UNICODE_UNESCAPE); } else if (type_ == URI) { value = PyObject_CallMethod(match, "group", "I", 1); if (value == NULL) { goto error; } if (PyObject_IsTrue(value) && PyUnicode_GET_LENGTH(value) > 1 && (first_char(value) == '"' || first_char(value) == '\'')) { item = clone_unicode(value, 1, 1); if (item == NULL) goto error; Py_DECREF(value); value = item; item = NULL; UNESCAPE(value, NEWLINE_UNESCAPE); } UNESCAPE(value, SIMPLE_UNESCAPE); UNESCAPE(value, UNICODE_UNESCAPE); } else if (type_ == STRING) { if (PyObject_IsTrue(css_value) && PyUnicode_GET_LENGTH(css_value) > 1) { // remove quotes value = clone_unicode(css_value, 1, 1); } else { value = css_value; Py_INCREF(value); } UNESCAPE(value, NEWLINE_UNESCAPE); UNESCAPE(value, SIMPLE_UNESCAPE); UNESCAPE(value, UNICODE_UNESCAPE); } else if (type_ == BAD_STRING && next_pos == source_len) { Py_XDECREF(type_name); type_name = STRING_TOK; Py_INCREF(type_name); if (PyObject_IsTrue(css_value) && PyUnicode_GET_LENGTH(css_value) > 0) { // remove quote value = clone_unicode(css_value, 1, 0); } else { value = css_value; Py_INCREF(value); } UNESCAPE(value, NEWLINE_UNESCAPE); UNESCAPE(value, SIMPLE_UNESCAPE); UNESCAPE(value, UNICODE_UNESCAPE); } else { value = css_value; Py_INCREF(value); } // if(type_ == ...) if (unit == NULL) { unit = Py_None; Py_INCREF(unit); } item = Py_BuildValue("OOOOnn", type_name, css_value, value, unit, line, column); if (item == NULL) goto error; token = PyObject_CallObject((PyObject *) &tokenizer_TokenType, item); Py_DECREF(item); item = NULL; if (token == NULL) goto error; if (PyList_Append(tokens, token) != 0) { Py_DECREF(token); token = NULL; goto error; } Py_DECREF(token); } // if(!(ignore_comments... Py_XDECREF(match); match = NULL; pos = next_pos; newlines = PyObject_CallFunctionObjArgs(FIND_NEWLINES, css_value, NULL); if (newlines == NULL) goto error; Py_XDECREF(css_value); css_value = NULL; Py_XDECREF(type_name); type_name = NULL; Py_XDECREF(value); value = NULL; Py_XDECREF(unit); unit = NULL; if (PyObject_IsTrue(newlines)) { line += PyList_Size(newlines); item = PyObject_CallMethod(PyList_GET_ITEM(newlines, PyList_Size(newlines) - 1), "end", NULL); if (item == NULL) { Py_DECREF(newlines); newlines = NULL; goto error; } column = PyNumber_AsSsize_t(item, PyExc_OverflowError); if(column == -1 && PyErr_Occurred()) { Py_DECREF(newlines); newlines = NULL; goto error; } column = length - column + 1; Py_DECREF(item); item = NULL; } else column += length; Py_DECREF(newlines); newlines = NULL; } // while (pos < ...) return tokens; error: Py_XDECREF(tokens); Py_XDECREF(css_value); Py_XDECREF(type_name); Py_XDECREF(value); Py_XDECREF(unit); Py_XDECREF(match); return NULL; #undef unicode_from_data #undef first_char } static PyMethodDef tokenizer_methods[] = { {"tokenize_flat", tokenize_flat, METH_VARARGS, "tokenize_flat(css_source, ignore_comments)\n\n Convert CSS source into a flat list of tokens" }, {"init", tokenize_init, METH_VARARGS, "init()\n\nInitialize the module." }, {"cleanup", tokenize_cleanup, METH_VARARGS, "cleanup()\n\nRelease resources allocated by init(). Safe to call multiple times." }, {NULL, NULL, 0, NULL} }; static int exec_module(PyObject *mod) { if (PyType_Ready(&tokenizer_TokenType) < 0) return -1; Py_INCREF(&tokenizer_TokenType); PyModule_AddObject(mod, "Token", (PyObject *) &tokenizer_TokenType); return 0; } static PyModuleDef_Slot slots[] = { {Py_mod_exec, exec_module}, {0, NULL} }; static struct PyModuleDef module_def = { .m_base = PyModuleDef_HEAD_INIT, .m_name = "tokenizer", .m_doc = "Implementation of tokenizer in C for speed.", .m_methods = tokenizer_methods, .m_slots = slots, }; CALIBRE_MODINIT_FUNC PyInit_tokenizer(void) { return PyModuleDef_Init(&module_def); }