diff --git a/setup/extensions.py b/setup/extensions.py
index b9d92f6399..43c13d3aa1 100644
--- a/setup/extensions.py
+++ b/setup/extensions.py
@@ -89,6 +89,10 @@ extensions = [
         ['calibre/utils/speedup.c'],
         ),
 
+    Extension('tokenizer',
+        ['tinycss/tokenizer.c'],
+        ),
+
     Extension('_patiencediff_c',
         ['calibre/gui2/tweak_book/diff/_patiencediff_c.c'],
         ),
@@ -343,8 +347,8 @@ if iswindows:
     cc = cxx = msvc.cc
     cflags = '/c /nologo /MD /W3 /EHsc /DNDEBUG'.split()
     ldflags = '/DLL /nologo /INCREMENTAL:NO /NODEFAULTLIB:libcmt.lib'.split()
-    #cflags = '/c /nologo /Ox /MD /W3 /EHsc /Zi'.split()
-    #ldflags = '/DLL /nologo /INCREMENTAL:NO /DEBUG'.split()
+    # cflags = '/c /nologo /Ox /MD /W3 /EHsc /Zi'.split()
+    # ldflags = '/DLL /nologo /INCREMENTAL:NO /DEBUG'.split()
 
     if is64bit:
         cflags.append('/GS-')
@@ -469,8 +473,8 @@ class Build(Command):
             self.info('\n\n', ' '.join(cmd), '\n\n')
             self.check_call(cmd)
             if iswindows:
-                #manifest = dest+'.manifest'
-                #cmd = [MT, '-manifest', manifest, '-outputresource:%s;2'%dest]
+                # manifest = dest+'.manifest'
+                # cmd = [MT, '-manifest', manifest, '-outputresource:%s;2'%dest]
                 # self.info(*cmd)
                 # self.check_call(cmd)
                 # os.remove(manifest)
diff --git a/src/calibre/constants.py b/src/calibre/constants.py
index 86830e91ce..e9ac8200dc 100644
--- a/src/calibre/constants.py
+++ b/src/calibre/constants.py
@@ -144,6 +144,7 @@ class Plugins(collections.Mapping):
         '_patiencediff_c',
         'bzzdec',
         'matcher',
+        'tokenizer',
     ]
     if iswindows:
         plugins.extend(['winutil', 'wpd', 'winfonts'])
diff --git a/src/tinycss/token_data.py b/src/tinycss/token_data.py
index dcd9232299..40bff86e77 100644
--- a/src/tinycss/token_data.py
+++ b/src/tinycss/token_data.py
@@ -210,7 +210,7 @@ SIMPLE_UNESCAPE = functools.partial(
     # Same as r'\1', but faster on CPython
     operator.methodcaller('group', 1))
 
-FIND_NEWLINES = re.compile(COMPILED_MACROS['nl']).finditer
+FIND_NEWLINES = lambda x: list(re.compile(COMPILED_MACROS['nl']).finditer(x))
 
 
 class Token(object):
@@ -439,3 +439,12 @@ class TokenList(list):
         as parsed in the source.
         """
         return ''.join(token.as_css() for token in self)
+
+def load_c_tokenizer():
+    from calibre.constants import plugins
+    tokenizer, err = plugins['tokenizer']
+    if err:
+        raise RuntimeError('Failed to load module tokenizer: %s' % err)
+    tokens = list(':;(){}[]') + ['DELIM', 'INTEGER', 'STRING']
+    tokenizer.init(COMPILED_TOKEN_REGEXPS, UNICODE_UNESCAPE, NEWLINE_UNESCAPE, SIMPLE_UNESCAPE, FIND_NEWLINES, TOKEN_DISPATCH, COMPILED_TOKEN_INDEXES, *tokens)
+    return tokenizer
diff --git a/src/tinycss/tokenizer.c b/src/tinycss/tokenizer.c
new file mode 100644
index 0000000000..71eeb65e19
--- /dev/null
+++ b/src/tinycss/tokenizer.c
@@ -0,0 +1,424 @@
+/*
+ * tokenizer.c
+ * Copyright (C) 2014 Kovid Goyal <kovid at kovidgoyal.net>
+ *
+ * Distributed under terms of the GPL3 license.
+ */
+
+#define UNICODE
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+#include <structmember.h>
+
+// Token type definition {{{
+typedef struct {
+    PyObject_HEAD
+    // Type-specific fields go here.
+    PyObject *type;
+    PyObject *_as_css;
+    PyObject *value;
+    PyObject *unit;
+    PyObject *line;
+    PyObject *column;
+
+} tokenizer_Token;
+
+static void
+tokenizer_Token_dealloc(tokenizer_Token* self)
+{
+    Py_XDECREF(self->type); self->type = NULL;
+    Py_XDECREF(self->_as_css); self->_as_css = NULL;
+    Py_XDECREF(self->value); self->value = NULL;
+    Py_XDECREF(self->unit); self->unit = NULL;
+    Py_XDECREF(self->line); self->line = NULL;
+    Py_XDECREF(self->column); self->column = NULL;
+    self->ob_type->tp_free((PyObject*)self);
+}
+
+static PyObject *
+tokenizer_Token_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
+{
+    tokenizer_Token *self = NULL;
+    self = (tokenizer_Token *)type->tp_alloc(type, 0);
+    if (self == NULL) return PyErr_NoMemory();
+
+    if (!PyArg_ParseTuple(args, "OOOOOO", &(self->type), &(self->_as_css), &(self->value), &(self->unit), &(self->line), &(self->column))) {
+        self->ob_type->tp_free((PyObject*)self); return NULL;
+    }
+    Py_INCREF(self->type); Py_INCREF(self->_as_css); Py_INCREF(self->value); Py_INCREF(self->unit); Py_INCREF(self->line); Py_INCREF(self->column);
+
+    return (PyObject *)self;
+}
+
+static PyObject *
+tokenizer_Token_repr(tokenizer_Token *self) {
+    PyObject *type = NULL, *line = NULL, *column = NULL, *value = NULL, *ans = NULL, *unit = NULL;
+    if (!self->type || !self->line || !self->column || !self->value)
+        return PyBytes_FromString("<Token NULL fields>");
+    type = PyObject_Unicode(self->type); line = PyObject_Unicode(self->line); column = PyObject_Unicode(self->column); value = PyObject_Unicode(self->value);
+    if (type && line && column && value) {
+        if (self->unit != NULL && PyObject_IsTrue(self->unit)) {
+            unit = PyObject_Unicode(self->unit);
+            if (unit != NULL)
+                ans = PyUnicode_FromFormat("<Token %U at %U:%U %U%U>", type, line, column, value, unit);
+            else
+                PyErr_NoMemory();
+        } else
+            ans = PyUnicode_FromFormat("<Token %U at %U:%U %U>", type, line, column, value);
+    } else PyErr_NoMemory();
+    Py_XDECREF(type); Py_XDECREF(line); Py_XDECREF(column); Py_XDECREF(value); Py_XDECREF(unit);
+    return ans;
+}
+
+static PyObject *
+tokenizer_Token_as_css(tokenizer_Token *self, PyObject *args, PyObject *kwargs) {
+    if (!self->_as_css) {
+        Py_RETURN_NONE;
+    }
+    Py_INCREF(self->_as_css);
+    return self->_as_css;
+}
+
+static PyMemberDef tokenizer_Token_members[] = {
+    {"type", T_OBJECT_EX, offsetof(tokenizer_Token, type), 0, "The token type"},
+    {"_as_css", T_OBJECT_EX, offsetof(tokenizer_Token, _as_css), 0, "Internal variable, use as_css() method instead."},
+    {"value", T_OBJECT_EX, offsetof(tokenizer_Token, value), 0, "The token value"},
+    {"unit", T_OBJECT_EX, offsetof(tokenizer_Token, unit), 0, "The token unit"},
+    {"line", T_OBJECT_EX, offsetof(tokenizer_Token, line), 0, "The token line number"},
+    {"column", T_OBJECT_EX, offsetof(tokenizer_Token, column), 0, "The token column number"},
+    {NULL}  /* Sentinel */
+};
+
+static PyMethodDef tokenizer_Token_methods[] = {
+    {"as_css", (PyCFunction)tokenizer_Token_as_css, METH_VARARGS,
+     "as_css() -> Return the CSS representation of this token"
+    },
+
+    {NULL}  /* Sentinel */
+};
+
+static PyTypeObject tokenizer_TokenType = { // {{{
+    PyObject_HEAD_INIT(NULL)
+    0,                         /*ob_size*/
+    "tokenizer.Token",         /*tp_name*/
+    sizeof(tokenizer_Token),   /*tp_basicsize*/
+    0,                         /*tp_itemsize*/
+    (destructor)tokenizer_Token_dealloc, /*tp_dealloc*/
+    0,                         /*tp_print*/
+    0,                         /*tp_getattr*/
+    0,                         /*tp_setattr*/
+    0,                         /*tp_compare*/
+    (reprfunc)tokenizer_Token_repr, /*tp_repr*/
+    0,                         /*tp_as_number*/
+    0,                         /*tp_as_sequence*/
+    0,                         /*tp_as_mapping*/
+    0,                         /*tp_hash */
+    0,                         /*tp_call*/
+    0,                         /*tp_str*/
+    0,                         /*tp_getattro*/
+    0,                         /*tp_setattro*/
+    0,                         /*tp_as_buffer*/
+    Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE, /*tp_flags*/
+    "Token",                   /* tp_doc */
+    0,                         /* tp_traverse */
+    0,                         /* tp_clear */
+    0,                         /* tp_richcompare */
+    0,                         /* tp_weaklistoffset */
+    0,                         /* tp_iter */
+    0,                         /* tp_iternext */
+    tokenizer_Token_methods,   /* tp_methods */
+    tokenizer_Token_members,   /* tp_members */
+    0,                         /* tp_getset */
+    0,                         /* tp_base */
+    0,                         /* tp_dict */
+    0,                         /* tp_descr_get */
+    0,                         /* tp_descr_set */
+    0,                         /* tp_dictoffset */
+    0,                         /* tp_init */
+    0,                         /* tp_alloc */
+    tokenizer_Token_new,       /* tp_new */
+}; // }}}
+// }}}
+
+static PyObject *COMPILED_TOKEN_REGEXPS = NULL, *UNICODE_UNESCAPE = NULL, *NEWLINE_UNESCAPE = NULL, *SIMPLE_UNESCAPE = NULL, *FIND_NEWLINES = NULL, *TOKEN_DISPATCH = NULL;
+static PyObject *COLON = NULL, *SCOLON = NULL, *LPAR = NULL, *RPAR = NULL, *LBRACE = NULL, *RBRACE = NULL, *LBOX = NULL, *RBOX = NULL, *DELIM_TOK = NULL, *INTEGER = NULL, *STRING_TOK = NULL;
+
+static Py_ssize_t BAD_COMMENT, BAD_STRING, PERCENTAGE, DIMENSION, ATKEYWORD, FUNCTION, COMMENT, NUMBER, STRING, IDENT, HASH, URI, DELIM = -1;
+
+#define CLEANUP(x) Py_XDECREF((x)); x = NULL;
+
+static PyObject*
+tokenize_cleanup(PyObject *self, PyObject *args) {
+    CLEANUP(COMPILED_TOKEN_REGEXPS); CLEANUP(UNICODE_UNESCAPE); CLEANUP(NEWLINE_UNESCAPE); CLEANUP(SIMPLE_UNESCAPE); CLEANUP(FIND_NEWLINES); CLEANUP(TOKEN_DISPATCH);
+    CLEANUP(COLON); CLEANUP(SCOLON); CLEANUP(LPAR); CLEANUP(RPAR); CLEANUP(LBRACE); CLEANUP(RBRACE); CLEANUP(LBOX); CLEANUP(RBOX); CLEANUP(DELIM_TOK); CLEANUP(INTEGER); CLEANUP(STRING_TOK);
+    Py_RETURN_NONE;
+}
+
+static PyObject*
+tokenize_init(PyObject *self, PyObject *args) {
+    PyObject *cti = NULL;
+
+    if (COMPILED_TOKEN_REGEXPS != NULL) {
+        tokenize_cleanup(NULL, NULL);
+    }
+    if (!PyArg_ParseTuple(args, "OOOOOOOOOOOOOOOOOO", &COMPILED_TOKEN_REGEXPS, &UNICODE_UNESCAPE, &NEWLINE_UNESCAPE, &SIMPLE_UNESCAPE, &FIND_NEWLINES, &TOKEN_DISPATCH, &cti, &COLON, &SCOLON, &LPAR, &RPAR, &LBRACE, &RBRACE, &LBOX, &RBOX, &DELIM_TOK, &INTEGER, &STRING_TOK)) return NULL;
+    Py_INCREF(COMPILED_TOKEN_REGEXPS); Py_INCREF(UNICODE_UNESCAPE); Py_INCREF(NEWLINE_UNESCAPE); Py_INCREF(SIMPLE_UNESCAPE); Py_INCREF(FIND_NEWLINES); Py_INCREF(TOKEN_DISPATCH);
+    Py_INCREF(COLON); Py_INCREF(SCOLON); Py_INCREF(LPAR); Py_INCREF(RPAR); Py_INCREF(LBRACE); Py_INCREF(RBRACE); Py_INCREF(LBOX); Py_INCREF(RBOX); Py_INCREF(DELIM_TOK); Py_INCREF(INTEGER); Py_INCREF(STRING_TOK);
+
+    // Read the integer token type codes out of the COMPILED_TOKEN_INDEXES dict
+#define SETCONST(x) x = PyInt_AsSsize_t(PyDict_GetItemString(cti, #x))
+    SETCONST(BAD_COMMENT); SETCONST(BAD_STRING); SETCONST(PERCENTAGE); SETCONST(DIMENSION); SETCONST(ATKEYWORD); SETCONST(FUNCTION); SETCONST(COMMENT); SETCONST(NUMBER); SETCONST(STRING); SETCONST(IDENT); SETCONST(HASH); SETCONST(URI);
+
+    Py_RETURN_NONE;
+}
+
+static int
+contains_char(PyObject *haystack, Py_UNICODE c) {
+#if PY_VERSION_HEX >= 0x03030000
+#error Not implemented for python >= 3.3
+#endif
+    Py_ssize_t i = 0;
+    Py_UNICODE *data = PyUnicode_AS_UNICODE(haystack);
+    for (i = 0; i < PyUnicode_GET_SIZE(haystack); i++) {
+        if (data[i] == c) return 1;
+    }
+    return 0;
+}
+
+static PyObject *unicode_to_number(PyObject *src) {
+#if PY_VERSION_HEX >= 0x03030000
+#error Not implemented for python >= 3.3
+#endif
+    PyObject *raw = NULL, *ans = NULL;
+    raw = PyUnicode_AsASCIIString(src);
+    if (raw == NULL) { return NULL; }
+    if (contains_char(src, '.')) {
+        ans = PyFloat_FromString(raw, NULL);
+    } else {
+        ans = PyInt_FromString(PyString_AS_STRING(raw), NULL, 10);
+    }
+    Py_DECREF(raw);
+    return ans;
+}
+
+static void
+lowercase(PyObject *x) {
+#if PY_VERSION_HEX >= 0x03030000
+#error Not implemented for python >= 3.3
+#endif
+    Py_ssize_t i = 0;
+    Py_UNICODE *data = PyUnicode_AS_UNICODE(x);
+    for (i = 0; i < PyUnicode_GET_SIZE(x); i++)
+        data[i] = Py_UNICODE_TOLOWER(data[i]);
+}
+
+static PyObject* clone_unicode(Py_UNICODE *x, Py_ssize_t sz) {
+#if PY_VERSION_HEX >= 0x03030000
+#error Not implemented for python >= 3.3
+#endif
+    PyObject *ans = PyUnicode_FromUnicode(NULL, sz);
+    if (ans == NULL) return PyErr_NoMemory();
+    memcpy(PyUnicode_AS_UNICODE(ans), x, sz * sizeof(Py_UNICODE));  // copy sz Py_UNICODE units, not sz bytes
+    return ans;
+}
+
+static PyObject*
+tokenize_flat(PyObject *self, PyObject *args) {
+#if PY_VERSION_HEX >= 0x03030000
+#error Not implemented for python >= 3.3
+#endif
+    Py_UNICODE *css_source = NULL, c = 0, codepoint = 0;
+    PyObject *ic = NULL, *token = NULL, *tokens = NULL, *type_name = NULL, *css_value = NULL, *value = NULL, *unit = NULL, *tries = NULL, *match = NULL, *match_func = NULL, *py_source = NULL, *item = NULL, *newlines = NULL;
+    int ignore_comments = 0;
+    Py_ssize_t pos = 0, line = 1, column = 1, i = 0;
+    Py_ssize_t length = 0, next_pos = 0, type_ = -1, source_len = 0;
+
+    if (COMPILED_TOKEN_REGEXPS == NULL) {
+        PyErr_SetString(PyExc_RuntimeError, "tokenizer module not initialized. You must call init() first."); return NULL;
+    }
+
+    if (!PyArg_ParseTuple(args, "U|O", &py_source, &ic)) return NULL;
+    if (ic != NULL && PyObject_IsTrue(ic)) ignore_comments = 1;
+    source_len = PyUnicode_GET_SIZE(py_source);
+    css_source = PyUnicode_AS_UNICODE(py_source);
+
+    tokens = PyList_New(0);
+    if (tokens == NULL) return PyErr_NoMemory();
+
+#define UNESCAPE(x, func) item = PyObject_CallFunctionObjArgs(func, x, NULL); if (item == NULL) { goto error; } Py_DECREF(x); x = item; item = NULL;
+
+#define TONUMBER(x) item = unicode_to_number(x); if (item == NULL) goto error; Py_DECREF(x); x = item; item = NULL;
+
+#define SINGLE(x) { type_ = -1; type_name = x; Py_INCREF(type_name); css_value = x; Py_INCREF(css_value); }
+
+    while (pos < source_len) {
+        c = css_source[pos];
+
+        css_value = NULL; type_name = NULL; value = NULL; unit = NULL; match = NULL;
+
+        // Fast path for single character tokens
+        if (c == ':') SINGLE(COLON)
+        else if (c == ';') SINGLE(SCOLON)
+        else if (c == '(') SINGLE(LPAR)
+        else if (c == ')') SINGLE(RPAR)
+        else if (c == '{') SINGLE(LBRACE)
+        else if (c == '}') SINGLE(RBRACE)
+        else if (c == '[') SINGLE(LBOX)
+        else if (c == ']') SINGLE(RBOX)
+        else {
+            // Try the regexps registered for this codepoint, in order
+            codepoint = (c > 160) ? 160 : c;  // clamp to the range covered by TOKEN_DISPATCH
+            tries = PyList_GET_ITEM(TOKEN_DISPATCH, codepoint);
+            for (i = 0; i < PyList_Size(tries); i++) {
+                item = PyList_GET_ITEM(tries, i);
+                match_func = PyTuple_GET_ITEM(item, 2);
+                match = PyObject_CallFunction(match_func, "On", py_source, pos);
+                if (match == NULL) { goto error; }
+                if (match != Py_None) {
+                    css_value = PyObject_CallMethod(match, "group", NULL);
+                    if (css_value == NULL) { goto error; }
+                    type_ = PyInt_AsSsize_t(PyTuple_GET_ITEM(item, 0));
+                    type_name = PyTuple_GET_ITEM(item, 1);
+                    Py_INCREF(type_name);
+                    break;
+                }
+                Py_DECREF(match); match = NULL;  // no match, release the reference to None
+            }
+            if (css_value == NULL) { // No regexp matched, emit a DELIM token
+                type_ = DELIM; type_name = DELIM_TOK; Py_INCREF(type_name); css_value = clone_unicode(&c, 1);
+                if (css_value == NULL) { goto error; }
+            }
+        }
+
+        length = PyUnicode_GET_SIZE(css_value);
+        next_pos = pos + length;
+
+        // Now calculate the value and unit for this token (if any)
+        if (!(ignore_comments && (type_ == COMMENT || type_ == BAD_COMMENT))) {
+            if (type_ == DIMENSION) {
+                value = PyObject_CallMethod(match, "group", "I", 1);
+                if (value == NULL) { goto error; }
+                TONUMBER(value);
+                unit = PyObject_CallMethod(match, "group", "I", 2);
+                if (unit == NULL) { goto error; }
+                UNESCAPE(unit, SIMPLE_UNESCAPE);
+                UNESCAPE(unit, UNICODE_UNESCAPE);
+                lowercase(unit);
+            } else
+
+            if (type_ == PERCENTAGE) {
+                // Strip the trailing % before converting to a number
+                if (PyUnicode_GET_SIZE(css_value) > 0) {
+                    value = clone_unicode(PyUnicode_AS_UNICODE(css_value), PyUnicode_GET_SIZE(css_value) - 1);
+                } else { value = css_value; Py_INCREF(value); }
+                if (value == NULL) goto error;
+                TONUMBER(value);
+                unit = PyUnicode_FromString("%");
+                if (unit == NULL) goto error;
+            } else
+
+            if (type_ == NUMBER) {
+                value = css_value; Py_INCREF(value);
+                TONUMBER(value);
+                if (!PyFloat_Check(value)) {
+                    Py_XDECREF(type_name);
+                    type_name = INTEGER;
+                    Py_INCREF(type_name);
+                }
+            } else
+
+            if (type_ == IDENT || type_ == ATKEYWORD || type_ == HASH || type_ == FUNCTION) {
+                value = PyObject_CallFunctionObjArgs(SIMPLE_UNESCAPE, css_value, NULL);
+                if (value == NULL) goto error;
+                UNESCAPE(value, UNICODE_UNESCAPE);
+            } else
+
+            if (type_ == URI) {
+                value = PyObject_CallMethod(match, "group", "I", 1);
+                if (value == NULL) { goto error; }
+                if (PyObject_IsTrue(value) && PyUnicode_GET_SIZE(value) > 1 && (PyUnicode_AS_UNICODE(value)[0] == '"' || PyUnicode_AS_UNICODE(value)[0] == '\'')) {
+                    // Remove the quotes around the URI
+                    item = clone_unicode(PyUnicode_AS_UNICODE(value) + 1, PyUnicode_GET_SIZE(value) - 2);
+                    if (item == NULL) goto error;
+                    Py_DECREF(value); value = item; item = NULL;
+                }
+            } else
+
+            if (type_ == STRING) {
+                if (PyObject_IsTrue(css_value) && PyUnicode_GET_SIZE(css_value) > 1) { // remove quotes
+                    value = clone_unicode(PyUnicode_AS_UNICODE(css_value) + 1, PyUnicode_GET_SIZE(css_value) - 2);
+                } else {
+                    value = css_value; Py_INCREF(value);
+                }
+                UNESCAPE(value, NEWLINE_UNESCAPE);
+                UNESCAPE(value, SIMPLE_UNESCAPE);
+                UNESCAPE(value, UNICODE_UNESCAPE);
+            } else
+
+            if (type_ == BAD_STRING && next_pos == source_len) {
+                // An unterminated string at the end of the source is reported as a STRING token
+                Py_XDECREF(type_name); type_name = STRING_TOK; Py_INCREF(type_name);
+                if (PyObject_IsTrue(css_value) && PyUnicode_GET_SIZE(css_value) > 0) { // remove the opening quote
+                    value = clone_unicode(PyUnicode_AS_UNICODE(css_value) + 1, PyUnicode_GET_SIZE(css_value) - 1);
+                } else {
+                    value = css_value; Py_INCREF(value);
+                }
+                UNESCAPE(value, NEWLINE_UNESCAPE);
+                UNESCAPE(value, SIMPLE_UNESCAPE);
+                UNESCAPE(value, UNICODE_UNESCAPE);
+            } else {
+                value = css_value; Py_INCREF(value);
+            } // if(type_ == ...)
+
+            if (unit == NULL) { unit = Py_None; Py_INCREF(unit); }
+            item = Py_BuildValue("OOOOnn", type_name, css_value, value, unit, line, column);
+            if (item == NULL) goto error;
+            token = PyObject_CallObject((PyObject *) &tokenizer_TokenType, item);
+            Py_DECREF(item); item = NULL;
+            if (token == NULL) goto error;
+            if (PyList_Append(tokens, token) != 0) { Py_DECREF(token); token = NULL; goto error; }
+            Py_DECREF(token); token = NULL;  // the list now owns its own reference to the token
+
+        } // if(!(ignore_comments...
+
+        Py_XDECREF(match); match = NULL;
+
+        pos = next_pos;
+        // Update the line and column numbers based on the newlines in this token
+        newlines = PyObject_CallFunctionObjArgs(FIND_NEWLINES, css_value, NULL);
+        if (newlines == NULL) goto error;
+        Py_XDECREF(css_value); css_value = NULL; Py_XDECREF(type_name); type_name = NULL; Py_XDECREF(value); value = NULL; Py_XDECREF(unit); unit = NULL;
+        if (PyObject_IsTrue(newlines)) {
+            line += PyList_Size(newlines);
+            item = PyObject_CallMethod(PyList_GET_ITEM(newlines, PyList_Size(newlines) - 1), "end", NULL);
+            if (item == NULL) { Py_DECREF(newlines); newlines = NULL; goto error; }
+            // Add 1 to have columns start at 1, not 0
+            column = length - PyInt_AsSsize_t(item) + 1;
+            Py_DECREF(item); item = NULL;
+        } else column += length;
+        Py_DECREF(newlines); newlines = NULL;
+
+    } // while (pos < ...)
+
+    return tokens;
+error:
+    Py_XDECREF(tokens); Py_XDECREF(css_value); Py_XDECREF(type_name); Py_XDECREF(value); Py_XDECREF(unit); Py_XDECREF(match);
+    return NULL;
+}
+
+static PyMethodDef tokenizer_methods[] = {
+    {"tokenize_flat", tokenize_flat, METH_VARARGS,
+     "tokenize_flat(css_source, ignore_comments=True)\n\nTokenize CSS source into a flat list of Token objects."
+    },
+
+    {"init", tokenize_init, METH_VARARGS,
+     "init()\n\nInitialize the module."
+    },
+
+    {"cleanup", tokenize_cleanup, METH_VARARGS,
+     "cleanup()\n\nRelease resources allocated by init(). Safe to call multiple times."
+    },
+
+    {NULL, NULL, 0, NULL}
+};
+
+PyMODINIT_FUNC
+inittokenizer(void) {
+    PyObject *m;
+    if (PyType_Ready(&tokenizer_TokenType) < 0)
+        return;
+
+    m = Py_InitModule3("tokenizer", tokenizer_methods,
+                       "Implementation of tokenizer in C for speed."
+    );
+    if (m == NULL) return;
+    Py_INCREF(&tokenizer_TokenType);
+    PyModule_AddObject(m, "Token", (PyObject *)&tokenizer_TokenType);
+}
diff --git a/src/tinycss/tokenizer.py b/src/tinycss/tokenizer.py
index 17a78a9bb0..e5dd02891f 100644
--- a/src/tinycss/tokenizer.py
+++ b/src/tinycss/tokenizer.py
@@ -127,7 +127,7 @@ def tokenize_flat(css_source, ignore_comments=True,
             tokens.append(Token(type_, css_value, value, unit, line, column))
 
         pos = next_pos
-        newlines = list(find_newlines(css_value))
+        newlines = find_newlines(css_value)
        if newlines:
             line += len(newlines)
             # Add 1 to have lines start at column 1, not 0
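
Reviewer note, not part of the patch: the token_data.py change wraps finditer in list() because the C code calls PyList_Size()/PyList_GET_ITEM() on whatever FIND_NEWLINES returns, so it must receive a real list rather than an iterator; the matching tokenizer.py change drops its own list() call so the pure-Python path works against the new signature. Below is a minimal smoke test of how the new module is meant to be driven, assuming the extension has been built so that calibre's plugin loader can find it; the sample CSS and the print loop are illustrative only:

    # Python 2 sketch, matching the extension's PyInt/Py_InitModule3 APIs
    from tinycss.token_data import load_c_tokenizer

    tokenizer = load_c_tokenizer()  # calls init() with the shared regexps and token names
    tokens = tokenizer.tokenize_flat(u'p { color: red }', True)  # True => ignore comments
    for tok in tokens:
        print tok.type, tok.line, tok.column, tok.as_css()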