Merge branch 'ft-py3-tokenizer' of https://github.com/flaviut/calibre

2025-07-09 03:04:10 -04:00 · 2018-12-10 11:24:10 +05:30 · 2018-12-10 11:24:10 +05:30 · 36647d4762
commit 36647d4762
parent dca2d1a51c 5f420d7047
3 changed files with 203 additions and 107 deletions
--- a/.gitignore
+++ b/.gitignore
@ -52,3 +52,4 @@ recipes/debug
 /.metadata/
 .idea
 /*env*/
+cmake-build-*
--- a/src/calibre/gui2/tweak_book/editor/syntax/html.c
+++ b/src/calibre/gui2/tweak_book/editor/syntax/html.c
@ -386,6 +386,18 @@ html_init(PyObject *self, PyObject *args) {
    Py_RETURN_NONE;
 }

+static inline long number_to_long(PyObject *number) { /* Convert a Python integer object to a C long; callers check for -1 together with PyErr_Occurred(). */
+#if PY_VERSION_HEX >= 0x03030000 /* On these versions the value is read with PyLong_AsLong only. */
+    return PyLong_AsLong(number);
+#else
+    if(PyInt_Check(number)) { /* Older interpreters: objects passing PyInt_Check are read with the unchecked PyInt_AS_LONG macro. */
+        return PyInt_AS_LONG(number);
+    } else {
+        return PyLong_AsLong(number); /* Everything else goes through PyLong_AsLong. */
+    }
+#endif
+}
+
 static PyObject*
 html_check_spelling(PyObject *self, PyObject *args) {
    PyObject *ans = NULL, *temp = NULL, *items = NULL, *text = NULL, *fmt = NULL, *locale = NULL, *sfmt = NULL, *_store_locale = NULL, *t = NULL, *utmp = NULL;
@ -410,9 +422,9 @@ html_check_spelling(PyObject *self, PyObject *args) {

    for (i = 0, j = 0; i < PyList_GET_SIZE(items); i++) {
        temp = PyList_GET_ITEM(items, i);
-        start = PyLong_AsLong(PyTuple_GET_ITEM(temp, 0));
+        start = number_to_long(PyTuple_GET_ITEM(temp, 0));
        if(start == -1 && PyErr_Occurred() != NULL) goto error;
-        length = PyLong_AsLong(PyTuple_GET_ITEM(temp, 1));
+        length = number_to_long(PyTuple_GET_ITEM(temp, 1));
        if(length == -1 && PyErr_Occurred() != NULL) goto error;
        temp = NULL;

--- a/src/tinycss/tokenizer.c
+++ b/src/tinycss/tokenizer.c
@ -34,7 +34,7 @@ tokenizer_Token_dealloc(tokenizer_Token* self)
    Py_XDECREF(self->unit); self->unit = NULL;
    Py_XDECREF(self->line); self->line = NULL;
    Py_XDECREF(self->column); self->column = NULL;
-    self->ob_type->tp_free((PyObject*)self);
+    Py_TYPE(self)->tp_free((PyObject*)self);
 }


@ -46,7 +46,8 @@ tokenizer_Token_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
    if (self == NULL) return PyErr_NoMemory();

    if (!PyArg_ParseTuple(args, "OOOOOO", &(self->type), &(self->_as_css), &(self->value), &(self->unit), &(self->line), &(self->column))) {
-        self->ob_type->tp_free((PyObject*)self); return NULL;
+        Py_TYPE(self)->tp_free((PyObject *) self);
+        return NULL;
    }
    Py_INCREF(self->type); Py_INCREF(self->_as_css); Py_INCREF(self->value); Py_INCREF(self->unit); Py_INCREF(self->line); Py_INCREF(self->column);
    self->is_container = Py_False; Py_INCREF(self->is_container);
@ -54,16 +55,25 @@ tokenizer_Token_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
    return (PyObject *)self;
 }

+#if PY_MAJOR_VERSION >= 3 /* Single spelling for "convert object to a unicode string" across major versions. */
+#define PyObject_Unicode_Compat(arg) PyObject_Str(arg) /* Python 3 build: expands to PyObject_Str. */
+#else
+#define PyObject_Unicode_Compat(arg) PyObject_Unicode(arg) /* Python 2 build: expands to PyObject_Unicode. */
+#endif
+
 static PyObject *
 tokenizer_Token_repr(tokenizer_Token *self) {
    PyObject *type = NULL, *line = NULL, *column = NULL, *value = NULL, *ans = NULL, *unit = NULL;
-    if (!self->type || !self->line || !self->column || !self->value) 
+    if (!self->type || !self->line || !self->column || !self->value)
        return PyBytes_FromString("<Token NULL fields>");
-    type = PyObject_Unicode(self->type); line = PyObject_Unicode(self->line); column = PyObject_Unicode(self->column); value = PyObject_Unicode(self->value);
+    type = PyObject_Unicode_Compat(self->type);
+    line = PyObject_Unicode_Compat(self->line);
+    column = PyObject_Unicode_Compat(self->column);
+    value = PyObject_Unicode_Compat(self->value);
    if (type && line && column && value) {
        if (self->unit != NULL && PyObject_IsTrue(self->unit)) {
-            unit = PyObject_Unicode(self->unit);
-            if (unit != NULL) 
+            unit = PyObject_Unicode_Compat(self->unit);
+            if (unit != NULL)
                ans = PyUnicode_FromFormat("<Token %U at %U:%U %U%U>", type, line, column, value, unit);
            else
                PyErr_NoMemory();
@ -103,58 +113,57 @@ static PyMethodDef tokenizer_Token_methods[] = {
 };

 static PyTypeObject tokenizer_TokenType = { // {{{ Type object for "tokenizer.Token"; slots are positional, only dealloc, repr, flags, doc, methods, members and new are set.
-    PyObject_HEAD_INIT(NULL)
-    0,                         /*ob_size*/
-    "tokenizer.Token",            /*tp_name*/
-    sizeof(tokenizer_Token),      /*tp_basicsize*/
-    0,                         /*tp_itemsize*/
-    (destructor)tokenizer_Token_dealloc, /*tp_dealloc*/
-    0,                         /*tp_print*/
-    0,                         /*tp_getattr*/
-    0,                         /*tp_setattr*/
-    0,                         /*tp_compare*/
-    (reprfunc)tokenizer_Token_repr,                         /*tp_repr*/
-    0,                         /*tp_as_number*/
-    0,                         /*tp_as_sequence*/
-    0,                         /*tp_as_mapping*/
-    0,                         /*tp_hash */
-    0,                         /*tp_call*/
-    0,                         /*tp_str*/
-    0,                         /*tp_getattro*/
-    0,                         /*tp_setattro*/
-    0,                         /*tp_as_buffer*/
-    Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE,        /*tp_flags*/
-    "Token",                  /* tp_doc */
-    0,		               /* tp_traverse */
-    0,		               /* tp_clear */
-    0,		               /* tp_richcompare */
-    0,		               /* tp_weaklistoffset */
-    0,		               /* tp_iter */
-    0,		               /* tp_iternext */
-    tokenizer_Token_methods,             /* tp_methods */
-    tokenizer_Token_members,             /* tp_members */
-    0,                         /* tp_getset */
-    0,                         /* tp_base */
-    0,                         /* tp_dict */
-    0,                         /* tp_descr_get */
-    0,                         /* tp_descr_set */
-    0,                         /* tp_dictoffset */
-    0,      /* tp_init */
-    0,                         /* tp_alloc */
-    tokenizer_Token_new,                 /* tp_new */
+        PyVarObject_HEAD_INIT(NULL, 0) /* takes the place of PyObject_HEAD_INIT(NULL) followed by the explicit ob_size 0 */
+        /* tp_name           */ "tokenizer.Token",
+        /* tp_basicsize      */ sizeof(tokenizer_Token),
+        /* tp_itemsize       */ 0,
+        /* tp_dealloc        */ (destructor) tokenizer_Token_dealloc,
+        /* tp_print          */ 0,
+        /* tp_getattr        */ 0,
+        /* tp_setattr        */ 0,
+        /* tp_compare        */ 0,
+        /* tp_repr           */ (reprfunc) tokenizer_Token_repr,
+        /* tp_as_number      */ 0,
+        /* tp_as_sequence    */ 0,
+        /* tp_as_mapping     */ 0,
+        /* tp_hash           */ 0,
+        /* tp_call           */ 0,
+        /* tp_str            */ 0,
+        /* tp_getattro       */ 0,
+        /* tp_setattro       */ 0,
+        /* tp_as_buffer      */ 0,
+        /* tp_flags          */ Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE,
+        /* tp_doc            */ "Token",
+        /* tp_traverse       */ 0,
+        /* tp_clear          */ 0,
+        /* tp_richcompare    */ 0,
+        /* tp_weaklistoffset */ 0,
+        /* tp_iter           */ 0,
+        /* tp_iternext       */ 0,
+        /* tp_methods        */ tokenizer_Token_methods,
+        /* tp_members        */ tokenizer_Token_members,
+        /* tp_getset         */ 0,
+        /* tp_base           */ 0,
+        /* tp_dict           */ 0,
+        /* tp_descr_get      */ 0,
+        /* tp_descr_set      */ 0,
+        /* tp_dictoffset     */ 0,
+        /* tp_init           */ 0,
+        /* tp_alloc          */ 0,
+        /* tp_new            */ tokenizer_Token_new,
 }; // }}}
 // }}}

-static PyObject *COMPILED_TOKEN_REGEXPS = NULL, *UNICODE_UNESCAPE = NULL, *NEWLINE_UNESCAPE = NULL, *SIMPLE_UNESCAPE = NULL, *FIND_NEWLINES = NULL, *TOKEN_DISPATCH = NULL; 
+static PyObject *COMPILED_TOKEN_REGEXPS = NULL, *UNICODE_UNESCAPE = NULL, *NEWLINE_UNESCAPE = NULL, *SIMPLE_UNESCAPE = NULL, *FIND_NEWLINES = NULL, *TOKEN_DISPATCH = NULL;
 static PyObject *COLON = NULL, *SCOLON = NULL, *LPAR = NULL, *RPAR = NULL, *LBRACE = NULL, *RBRACE = NULL, *LBOX = NULL, *RBOX = NULL, *DELIM_TOK = NULL, *INTEGER = NULL, *STRING_TOK = NULL;

 static Py_ssize_t BAD_COMMENT, BAD_STRING, PERCENTAGE, DIMENSION, ATKEYWORD, FUNCTION, COMMENT, NUMBER, STRING, IDENT, HASH, URI, DELIM = -1;
- 
-#define CLEANUP(x) Py_XDECREF((x)); x = NULL; 
+
+#define CLEANUP(x) Py_XDECREF((x)); x = NULL;

 static PyObject*
 tokenize_cleanup(PyObject *self, PyObject *args) { // Release every cached module-level object and reset it to NULL; self and args are not used.
-    CLEANUP(COMPILED_TOKEN_REGEXPS); CLEANUP(UNICODE_UNESCAPE); CLEANUP(NEWLINE_UNESCAPE); CLEANUP(SIMPLE_UNESCAPE); CLEANUP(FIND_NEWLINES); CLEANUP(TOKEN_DISPATCH); 
+    CLEANUP(COMPILED_TOKEN_REGEXPS); CLEANUP(UNICODE_UNESCAPE); CLEANUP(NEWLINE_UNESCAPE); CLEANUP(SIMPLE_UNESCAPE); CLEANUP(FIND_NEWLINES); CLEANUP(TOKEN_DISPATCH);
    CLEANUP(COLON); CLEANUP(SCOLON); CLEANUP(LPAR); CLEANUP(RPAR); CLEANUP(LBRACE); CLEANUP(RBRACE); CLEANUP(LBOX); CLEANUP(RBOX); CLEANUP(DELIM_TOK); CLEANUP(INTEGER); CLEANUP(STRING_TOK); // CLEANUP is Py_XDECREF followed by assignment of NULL, so NULL entries are skipped safely
    Py_RETURN_NONE; // always returns None
 }
@ -170,67 +179,103 @@ tokenize_init(PyObject *self, PyObject *args) {
    Py_INCREF(COMPILED_TOKEN_REGEXPS); Py_INCREF(UNICODE_UNESCAPE); Py_INCREF(NEWLINE_UNESCAPE); Py_INCREF(SIMPLE_UNESCAPE); Py_INCREF(FIND_NEWLINES); Py_INCREF(TOKEN_DISPATCH);
    Py_INCREF(COLON); Py_INCREF(SCOLON); Py_INCREF(LPAR); Py_INCREF(RPAR); Py_INCREF(LBRACE); Py_INCREF(RBRACE); Py_INCREF(LBOX); Py_INCREF(RBOX); Py_INCREF(DELIM_TOK); Py_INCREF(INTEGER); Py_INCREF(STRING_TOK);

-#define SETCONST(x) x = PyInt_AsSsize_t(PyDict_GetItemString(cti, #x))
+#define SETCONST(x) do { (x) = PyNumber_AsSsize_t(PyDict_GetItemString(cti, #x), PyExc_OverflowError); \
+                         if((x) == -1 && PyErr_Occurred() != NULL) { return NULL; } \
+                       } while(0)
    SETCONST(BAD_COMMENT); SETCONST(BAD_STRING); SETCONST(PERCENTAGE); SETCONST(DIMENSION); SETCONST(ATKEYWORD); SETCONST(FUNCTION); SETCONST(COMMENT); SETCONST(NUMBER); SETCONST(STRING); SETCONST(IDENT); SETCONST(HASH); SETCONST(URI);

    Py_RETURN_NONE;
 }

-static int
-contains_char(PyObject *haystack, Py_UNICODE c) {
-#if PY_VERSION_HEX >= 0x03030000 
-#error Not implemented for python >= 3.3
+/*
+ * ITER_CODE_PTS(unicode_object) opens a block that loops over every code
+ * point of unicode_object; it must be closed with END_ITER_CODE_PTS.
+ * Inside the loop body the following names are in scope:
+ *   ch      - the current code point
+ *   iteridx - the index of ch
+ *   _data   - the underlying character buffer
+ *   _kind   - the storage kind (Python >= 3.3 variant only)
+ * unicode_object is expanded more than once, so pass a plain variable.
+ */
+#if PY_VERSION_HEX >= 0x03030000
+#define ITER_CODE_PTS(unicode_object) { \
+    int _kind = PyUnicode_KIND(unicode_object); \
+    void *_data = PyUnicode_DATA(unicode_object); \
+    for (Py_ssize_t iteridx = 0; iteridx < PyUnicode_GET_LENGTH(unicode_object); iteridx++) { \
+        Py_UCS4 ch = PyUnicode_READ(_kind, _data, iteridx);
+#else
+#define PyUnicode_GET_LENGTH PyUnicode_GET_SIZE
+#define ITER_CODE_PTS(unicode_object) { \
+    Py_UNICODE *_data = PyUnicode_AS_UNICODE(unicode_object); \
+    for (Py_ssize_t iteridx = 0; iteridx < PyUnicode_GET_LENGTH(unicode_object); iteridx++) { \
+        Py_UNICODE ch = _data[iteridx];
 #endif
-    Py_ssize_t i = 0;
-    Py_UNICODE *data = PyUnicode_AS_UNICODE(haystack);
-    for (i = 0; i < PyUnicode_GET_SIZE(haystack); i++) {
-        if (data[i] == c) return 1;
-    }
+
+#define END_ITER_CODE_PTS }}
+
+static int /* Returns 1 if any code point of haystack equals c, otherwise 0. */
+contains_char(PyObject *haystack, const char c) {
+    ITER_CODE_PTS(haystack) /* opens a loop that binds ch to each code point of haystack */
+        if (ch == c) return 1; /* stop at the first match */
+    END_ITER_CODE_PTS
    return 0;
 }

 static PyObject *unicode_to_number(PyObject *src) {
-#if PY_VERSION_HEX >= 0x03030000 
-#error Not implemented for python >= 3.3
-#endif
+    /*
+     * Parse the unicode string src as a number and return a new reference,
+     * or NULL with an exception set. A float is produced when src contains
+     * a '.', an integer (base 10) otherwise. src must be pure ASCII: the
+     * ASCII encoding step fails first for anything else.
+     */
    PyObject *raw = NULL, *ans = NULL;
    raw = PyUnicode_AsASCIIString(src);
    if (raw == NULL) { return NULL; }
    if (contains_char(src, '.')) {
+#if PY_MAJOR_VERSION >= 3
+        ans = PyFloat_FromString(raw);
+#else
        ans = PyFloat_FromString(raw, NULL);
+#endif
    } else {
+#if PY_MAJOR_VERSION >= 3
+        /* PyLong_FromUnicodeObject needs the str object itself; raw is a
+         * bytes object and would be rejected. */
+        ans = PyLong_FromUnicodeObject(src, 10);
+#else
        ans = PyInt_FromString(PyString_AS_STRING(raw), NULL, 10);
+#endif
    }
    Py_DECREF(raw);
    return ans;
 }

+
 static void lowercase(PyObject *x) {
-#if PY_VERSION_HEX >= 0x03030000 
-#error Not implemented for python >= 3.3
+    ITER_CODE_PTS(x) /* Lowercases x in place; the macro binds ch, iteridx and _data (plus _kind on >= 3.3). */
+        if ('A' <= ch && ch <= 'Z') { /* only ASCII 'A'..'Z' is changed, unlike the removed Py_UNICODE_TOLOWER loop which converted every element */
+#if PY_VERSION_HEX >= 0x03030000
+            PyUnicode_WRITE(_kind, _data, iteridx, ch + 32); /* 'a' - 'A' == 32; overwrites the character inside x's own buffer */
+#else
+            _data[iteridx] += 32; /* same +32 shift applied directly to the Py_UNICODE buffer */
 #endif
-    Py_ssize_t i = 0;
-    Py_UNICODE *data = PyUnicode_AS_UNICODE(x);
-    for (i = 0; i < PyUnicode_GET_SIZE(x); i++) 
-        data[i] = Py_UNICODE_TOLOWER(data[i]);
+        }
+    END_ITER_CODE_PTS
 }

-static PyObject* clone_unicode(Py_UNICODE *x, Py_ssize_t sz) {
-#if PY_VERSION_HEX >= 0x03030000 
-#error Not implemented for python >= 3.3
+/*
+ * Return a new unicode object holding a copy of src with start_offset
+ * characters dropped from the front and end_offset characters dropped from
+ * the back. Returns NULL with an exception set on failure. The caller must
+ * ensure start_offset + end_offset does not exceed the length of src.
+ */
+static PyObject*
+clone_unicode(const PyObject* src, Py_ssize_t start_offset, Py_ssize_t end_offset) {
+#if PY_VERSION_HEX >= 0x03030000
+    int kind = PyUnicode_KIND(src);
+    void *data = NULL;
+    /* The typed *_DATA accessors make the pointer arithmetic advance by
+     * whole characters rather than by bytes. */
+    switch(kind) {
+        case PyUnicode_1BYTE_KIND:
+            data = PyUnicode_1BYTE_DATA(src) + start_offset; break;
+        case PyUnicode_2BYTE_KIND:
+            data = PyUnicode_2BYTE_DATA(src) + start_offset; break;
+        case PyUnicode_4BYTE_KIND:
+            data = PyUnicode_4BYTE_DATA(src) + start_offset; break;
+        default:
+            /* Never copy from an uninitialised pointer for an unexpected kind. */
+            PyErr_SetString(PyExc_SystemError, "clone_unicode: unexpected unicode kind");
+            return NULL;
+    }
+    return PyUnicode_FromKindAndData(kind, data, PyUnicode_GET_LENGTH(src) - start_offset - end_offset);
+#else
+    return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(src) + start_offset, PyUnicode_GET_LENGTH(src) - start_offset - end_offset);
 #endif
-    PyObject *ans = PyUnicode_FromUnicode(NULL, sz);
-    if (ans == NULL) return PyErr_NoMemory();
-    memcpy(PyUnicode_AS_UNICODE(ans), x, sz * sizeof(Py_UNICODE));
-    return ans;
 }

 static PyObject*
 tokenize_flat(PyObject *self, PyObject *args) {
-#if PY_VERSION_HEX >= 0x03030000 
-#error Not implemented for python >= 3.3
-#endif
+#if PY_VERSION_HEX >= 0x03030000
+    void *css_source = NULL; int css_kind; Py_UCS4 c = 0, codepoint = 0;
+#define first_char(string) PyUnicode_READ_CHAR(string, 0)
+#define unicode_from_data(data, sz) PyUnicode_FromKindAndData(css_kind, data, sz)
+#else
    Py_UNICODE *css_source = NULL, c = 0, codepoint = 0;
+#define first_char(string) PyUnicode_AS_UNICODE(string)[0]
+#define unicode_from_data(data, sz) PyUnicode_FromUnicode(data, sz)
+#endif
    PyObject *ic = NULL, *token = NULL, *tokens = NULL, *type_name = NULL, *css_value = NULL, *value = NULL, *unit = NULL, *tries = NULL, *match = NULL, *match_func = NULL, *py_source = NULL, *item = NULL, *newlines = NULL;
    int ignore_comments = 0;
    Py_ssize_t pos = 0, line = 1, column = 1, i = 0;
@ -243,8 +288,13 @@ tokenize_flat(PyObject *self, PyObject *args) {

    if (!PyArg_ParseTuple(args, "UO", &py_source, &ic)) return NULL;
    if (PyObject_IsTrue(ic)) ignore_comments = 1;
-    source_len = PyUnicode_GET_SIZE(py_source);
+    source_len = PyUnicode_GET_LENGTH(py_source);
+#if PY_VERSION_HEX >= 0x03030000
+    if (PyUnicode_READY(py_source) != 0) return NULL;
+    css_source = PyUnicode_DATA(py_source); css_kind = PyUnicode_KIND(py_source);
+#else
    css_source = PyUnicode_AS_UNICODE(py_source);
+#endif

    tokens = PyList_New(0);
    if (tokens == NULL) return PyErr_NoMemory();
@ -256,39 +306,44 @@ tokenize_flat(PyObject *self, PyObject *args) {
 #define SINGLE(x) { type_ = -1; type_name = x; Py_INCREF(type_name); css_value = x; Py_INCREF(css_value); }

    while (pos < source_len) {
+#if PY_VERSION_HEX >= 0x03030000
+        /* css_source holds the PyUnicode_DATA buffer and css_kind its storage kind. */
+        c = PyUnicode_READ(css_kind, css_source, pos);
+#else
        c = css_source[pos];
+#endif

        css_value = NULL; type_name = NULL; value = NULL; unit = NULL; match = NULL;

-        if (c == ':') SINGLE(COLON) else if (c == ';') SINGLE(SCOLON) else if (c == '(') SINGLE(LPAR) else if (c == ')') SINGLE(RPAR) else if (c == '{') SINGLE(LBRACE) else if (c == '}') SINGLE(RBRACE) else if (c == '[') SINGLE(LBOX) else if (c == ']') SINGLE(RBOX) else 
+        if (c == ':') SINGLE(COLON) else if (c == ';') SINGLE(SCOLON) else if (c == '(') SINGLE(LPAR) else if (c == ')') SINGLE(RPAR) else if (c == '{') SINGLE(LBRACE) else if (c == '}') SINGLE(RBRACE) else if (c == '[') SINGLE(LBOX) else if (c == ']') SINGLE(RBOX) else
        {
            codepoint = (c > 160) ? 160: c;
            tries = PyList_GET_ITEM(TOKEN_DISPATCH, codepoint);
            for (i = 0; i < PyList_Size(tries); i++) {
                item = PyList_GET_ITEM(tries, i);
                match_func = PyTuple_GET_ITEM(item, 2);
-                match = PyObject_CallFunction(match_func, "On", py_source, pos); 
+                match = PyObject_CallFunction(match_func, "On", py_source, pos);
                if (match == NULL) { goto error; }
                if (match != Py_None) {
                    css_value = PyObject_CallMethod(match, "group", NULL);
                    if (css_value == NULL) { goto error; }
-                    type_ = PyInt_AsSsize_t(PyTuple_GET_ITEM(item, 0));
+                    type_ = PyNumber_AsSsize_t(PyTuple_GET_ITEM(item, 0), PyExc_OverflowError);
+                    if(type_ == -1 && PyErr_Occurred() != NULL) { goto error; }
                    type_name = PyTuple_GET_ITEM(item, 1);
                    Py_INCREF(type_name);
                    break;
                }
            }
            if (css_value == NULL) {  // No match
-                type_ = DELIM; type_name = DELIM_TOK; Py_INCREF(type_name); css_value = clone_unicode(&c, 1);
+                type_ = DELIM; type_name = DELIM_TOK; Py_INCREF(type_name); css_value = unicode_from_data(&c, 1);
                if (css_value == NULL) { goto error; }
            }
        }

-        length = PyUnicode_GET_SIZE(css_value);
+        length = PyUnicode_GET_LENGTH(css_value);
        next_pos = pos + length;

        // Now calculate the value and unit for this token (if any)
-        if (! (ignore_comments && (type_ == COMMENT || type_ == BAD_COMMENT))) { 
+        if (! (ignore_comments && (type_ == COMMENT || type_ == BAD_COMMENT))) {
            if (type_ == DIMENSION) {
                value = PyObject_CallMethod(match, "group", "I", 1);
                if (value == NULL) { goto error; }
@ -298,11 +353,11 @@ tokenize_flat(PyObject *self, PyObject *args) {
                UNESCAPE(unit, SIMPLE_UNESCAPE);
                UNESCAPE(unit, UNICODE_UNESCAPE);
                lowercase(unit);
-            } else 
+            } else

            if (type_ == PERCENTAGE) {
-                if (PyUnicode_GET_SIZE(css_value) > 0) {
-                    value = clone_unicode(PyUnicode_AS_UNICODE(css_value), PyUnicode_GET_SIZE(css_value) - 1);
+                if (PyUnicode_GET_LENGTH(css_value) > 0) {
+                    value = clone_unicode(css_value, 0, 1);
                    if (value == NULL) goto error;
                } else { value = css_value; Py_INCREF(value); }
                if (value == NULL) goto error;
@ -330,8 +385,8 @@ tokenize_flat(PyObject *self, PyObject *args) {
            if (type_ == URI) {
                value = PyObject_CallMethod(match, "group", "I", 1);
                if (value == NULL) { goto error; }
-                if (PyObject_IsTrue(value) && PyUnicode_GET_SIZE(value) > 1 && (PyUnicode_AS_UNICODE(value)[0] == '"' || PyUnicode_AS_UNICODE(value)[0] == '\'')) {
-                    item = clone_unicode(PyUnicode_AS_UNICODE(value) + 1, PyUnicode_GET_SIZE(value) - 2);
+                if (PyObject_IsTrue(value) && PyUnicode_GET_LENGTH(value) > 1 && (first_char(value) == '"' || first_char(value) == '\'')) {
+                    item = clone_unicode(value, 1, 1);
                    if (item == NULL) goto error;
                    Py_DECREF(value); value = item; item = NULL;
                    UNESCAPE(value, NEWLINE_UNESCAPE);
@ -341,8 +396,8 @@ tokenize_flat(PyObject *self, PyObject *args) {
            } else

            if (type_ == STRING) {
-                if (PyObject_IsTrue(css_value) && PyUnicode_GET_SIZE(css_value) > 1) {  // remove quotes
-                    value = clone_unicode(PyUnicode_AS_UNICODE(css_value) + 1, PyUnicode_GET_SIZE(css_value) - 2);
+                if (PyObject_IsTrue(css_value) && PyUnicode_GET_LENGTH(css_value) > 1) {  // remove quotes
+                    value = clone_unicode(css_value, 1, 1);
                } else {
                    value = css_value; Py_INCREF(value);
                }
@ -353,8 +408,8 @@ tokenize_flat(PyObject *self, PyObject *args) {

            if (type_ == BAD_STRING && next_pos == source_len) {
                Py_XDECREF(type_name); type_name = STRING_TOK; Py_INCREF(type_name);
-                if (PyObject_IsTrue(css_value) && PyUnicode_GET_SIZE(css_value) > 0) {  // remove quote
-                    value = clone_unicode(PyUnicode_AS_UNICODE(css_value) + 1, PyUnicode_GET_SIZE(css_value) - 1);
+                if (PyObject_IsTrue(css_value) && PyUnicode_GET_LENGTH(css_value) > 0) {  // remove quote
+                    value = clone_unicode(css_value, 1, 0);
                } else {
                    value = css_value; Py_INCREF(value);
                }
@ -386,8 +441,10 @@ tokenize_flat(PyObject *self, PyObject *args) {
            line += PyList_Size(newlines);
            item = PyObject_CallMethod(PyList_GET_ITEM(newlines, PyList_Size(newlines) - 1), "end", NULL);
            if (item == NULL) { Py_DECREF(newlines); newlines = NULL; goto error; }
-            column = length - PyInt_AsSsize_t(item) + 1;
-            Py_DECREF(item); item = NULL; 
+            column = PyNumber_AsSsize_t(item, PyExc_OverflowError);
+            if(column == -1 && PyErr_Occurred()) { Py_DECREF(newlines); newlines = NULL; goto error; }
+            column = length - column + 1;
+            Py_DECREF(item); item = NULL;
        } else column += length;
        Py_DECREF(newlines); newlines = NULL;

@ -397,6 +454,8 @@ tokenize_flat(PyObject *self, PyObject *args) {
 error:
    Py_XDECREF(tokens); Py_XDECREF(css_value); Py_XDECREF(type_name); Py_XDECREF(value); Py_XDECREF(unit); Py_XDECREF(match);
    return NULL;
+#undef unicode_from_data
+#undef first_char
 }

 static PyMethodDef tokenizer_methods[] = {
@ -415,17 +474,41 @@ static PyMethodDef tokenizer_methods[] = {
    {NULL, NULL, 0, NULL}
 };

+#if PY_MAJOR_VERSION >= 3 /* Python 3 build: module definition struct plus a PyInit_ entry point that returns the module. */
+#define INITERROR return NULL /* failure exit for the Python 3 init function */
+static struct PyModuleDef tokenizer_module = {
+        /* m_base     */ PyModuleDef_HEAD_INIT,
+        /* m_name     */ "tokenizer",
+        /* m_doc      */ "Implementation of tokenizer in C for speed.",
+        /* m_size     */ -1,
+        /* m_methods  */ tokenizer_methods,
+        /* m_slots    */ 0,
+        /* m_traverse */ 0,
+        /* m_clear    */ 0,
+        /* m_free     */ 0,
+};

-CALIBRE_MODINIT_FUNC
-inittokenizer(void) {
-    PyObject *m;
+CALIBRE_MODINIT_FUNC PyInit_tokenizer(void) {
    if (PyType_Ready(&tokenizer_TokenType) < 0)
-        return;
+        INITERROR;

-    m = Py_InitModule3("tokenizer", tokenizer_methods,
-    "Implementation of tokenizer in C for speed."
-    );
-    if (m == NULL) return;
+    PyObject *mod = PyModule_Create(&tokenizer_module);
+#else
+#define INITERROR return /* failure exit for the Python 2 init function, which returns nothing */
+CALIBRE_MODINIT_FUNC inittokenizer(void) {
+    if (PyType_Ready(&tokenizer_TokenType) < 0)
+        INITERROR;
+
+    PyObject *mod = Py_InitModule3("tokenizer", tokenizer_methods,
+        "Implementation of tokenizer in C for speed.");
+#endif
+
+    if (mod == NULL) INITERROR; /* from here on the function body is shared by both builds */
    Py_INCREF(&tokenizer_TokenType);
-    PyModule_AddObject(m, "Token", (PyObject *)&tokenizer_TokenType);
+    PyModule_AddObject(mod, "Token", (PyObject *) &tokenizer_TokenType); /* exposes the type as tokenizer.Token; the result is not checked */
+
+
+#if PY_MAJOR_VERSION >= 3
+    return mod; /* only the Python 3 entry point returns the module object */
+#endif
 }