From fc7aee08df06af0420670c9c0b826831168a2408 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 28 Oct 2019 13:30:18 +0530 Subject: [PATCH] Start work on fast html to json --- setup/extensions.json | 5 + src/calibre/constants.py | 1 + src/calibre/srv/html_as_json.cpp | 196 +++++++++++++++++++++++++++++++ src/calibre/srv/tests/content.py | 11 ++ 4 files changed, 213 insertions(+) create mode 100644 src/calibre/srv/html_as_json.cpp diff --git a/setup/extensions.json b/setup/extensions.json index 6df1445dda..dc23e0f421 100644 --- a/setup/extensions.json +++ b/setup/extensions.json @@ -125,6 +125,11 @@ "error": "!podofo_error", "needs_c++11": true }, + { + "name": "html_as_json", + "sources": "calibre/srv/html_as_json.cpp", + "needs_c++11": true + }, { "name": "pictureflow", "sources": "calibre/gui2/pictureflow/pictureflow.cpp", diff --git a/src/calibre/constants.py b/src/calibre/constants.py index c93ef4f6f8..d5d049b732 100644 --- a/src/calibre/constants.py +++ b/src/calibre/constants.py @@ -172,6 +172,7 @@ class Plugins(collections.Mapping): 'chmlib', 'icu', 'speedup', + 'html_as_json', 'unicode_names', 'html', 'freetype', diff --git a/src/calibre/srv/html_as_json.cpp b/src/calibre/srv/html_as_json.cpp new file mode 100644 index 0000000000..7187cffe4e --- /dev/null +++ b/src/calibre/srv/html_as_json.cpp @@ -0,0 +1,196 @@ +/* + * html_as_json.cpp + * Copyright (C) 2019 Kovid Goyal + * + * Distributed under terms of the GPL3 license. + */ + +#include +#include + +typedef struct { + PyObject_HEAD + /* Type-specific fields go here. */ + PyObject *buf; + size_t used; +} Serializer; + + +static PyObject * +alloc(PyTypeObject *type, PyObject *args, PyObject *kwds) +{ + Serializer *self; + + self = (Serializer *)type->tp_alloc(type, 0); + if (self != NULL) { + self->used = 0; + self->buf = NULL; + } + return (PyObject *)self; +} + + +static void +dealloc(Serializer* self) +{ + Py_CLEAR(self->buf); + Py_TYPE(self)->tp_free((PyObject*)self); +} + + +static bool +write_data(Serializer *self, const char *data, size_t sz) { + if (!self->buf) { + self->buf = PyBytes_FromStringAndSize(NULL, std::max(sz, static_cast(128u * 1024u))); + if (!self->buf) return false; + } + size_t new_used = self->used + sz; + if (new_used > static_cast(PyBytes_GET_SIZE(self->buf))) { + if (_PyBytes_Resize(&(self->buf), std::max(new_used, static_cast(2 * PyBytes_GET_SIZE(self->buf)))) != 0) return false; + } + memcpy(PyBytes_AS_STRING(self->buf) + self->used, data, sz); + self->used = new_used; + return true; +} + + +static PyObject* +pywrite(Serializer *self, PyObject *arg) { + const char *data; + size_t sz; + PyObject *temp = NULL; + if (PyBytes_Check(arg)) { + data = PyBytes_AS_STRING(arg); + sz = PyBytes_GET_SIZE(arg); + } else if (PyUnicode_Check(arg)) { +#if PY_MAJOR_VERSION > 2 + Py_ssize_t ssz; + data = PyUnicode_AsUTF8AndSize(arg, &ssz); + sz = ssz; + if (data == NULL) return NULL; +#else + temp = PyUnicode_AsUTF8String(arg); + if (temp == NULL) return NULL; + data = PyBytes_AS_STRING(temp); + sz = PyBytes_GET_SIZE(temp); +#endif + } else { + PyErr_SetString(PyExc_TypeError, "A unicode or bytes object expected"); + return NULL; + } + bool ok = write_data(self, data, sz); + Py_CLEAR(temp); + if (!ok) return NULL; + Py_RETURN_NONE; +} + +static PyObject* +done(Serializer *self, PyObject *arg) { + (void)arg; + if (!self->buf) return PyBytes_FromString(""); + if (_PyBytes_Resize(&self->buf, self->used) != 0) return NULL; + PyObject *ans = self->buf; + self->buf = NULL; + self->used = 0; + return ans; +} + +// Type definition {{{ + +static PyMethodDef Serializer_methods[] = { + {"write", (PyCFunction)pywrite, METH_O, + "Write the specified unicode or bytes object" + }, + {"done", (PyCFunction)done, METH_NOARGS, + "Get the serialized output" + }, + {NULL} /* Sentinel */ +}; + +PyTypeObject SerializerType = { + PyVarObject_HEAD_INIT(NULL, 0) + /* tp_name */ "html_as_json.Serializer", + /* tp_basicsize */ sizeof(Serializer), + /* tp_itemsize */ 0, + /* tp_dealloc */ (destructor)dealloc, + /* tp_print */ 0, + /* tp_getattr */ 0, + /* tp_setattr */ 0, + /* tp_compare */ 0, + /* tp_repr */ 0, + /* tp_as_number */ 0, + /* tp_as_sequence */ 0, + /* tp_as_mapping */ 0, + /* tp_hash */ 0, + /* tp_call */ 0, + /* tp_str */ 0, + /* tp_getattro */ 0, + /* tp_setattro */ 0, + /* tp_as_buffer */ 0, + /* tp_flags */ Py_TPFLAGS_DEFAULT, + /* tp_doc */ "Serializer", + /* tp_traverse */ 0, + /* tp_clear */ 0, + /* tp_richcompare */ 0, + /* tp_weaklistoffset */ 0, + /* tp_iter */ 0, + /* tp_iternext */ 0, + /* tp_methods */ Serializer_methods, + /* tp_members */ 0, + /* tp_getset */ 0, + /* tp_base */ 0, + /* tp_dict */ 0, + /* tp_descr_get */ 0, + /* tp_descr_set */ 0, + /* tp_dictoffset */ 0, + /* tp_init */ 0, + /* tp_alloc */ 0, + /* tp_new */ alloc, +}; +// }}} + +static char doc[] = "Serialize HTML as JSON efficiently"; +static PyMethodDef methods[] = { + {NULL} /* Sentinel */ +}; + +#if PY_MAJOR_VERSION >= 3 +#define INITERROR return NULL +#define INITMODULE PyModule_Create(&module) +static struct PyModuleDef module = { + /* m_base */ PyModuleDef_HEAD_INIT, + /* m_name */ "html_as_json", + /* m_doc */ doc, + /* m_size */ -1, + /* m_methods */ methods, + /* m_slots */ 0, + /* m_traverse */ 0, + /* m_clear */ 0, + /* m_free */ 0, +}; +CALIBRE_MODINIT_FUNC PyInit_html_as_json(void) { +#else +#define INITERROR return +#define INITMODULE Py_InitModule3("html_as_json", methods, doc) +CALIBRE_MODINIT_FUNC inithtml_as_json(void) { +#endif + + PyObject* m; + + if (PyType_Ready(&SerializerType) < 0) { + INITERROR; + } + + + m = INITMODULE; + if (m == NULL) { + INITERROR; + } + + PyModule_AddObject(m, "Serializer", (PyObject *)&SerializerType); + + +#if PY_MAJOR_VERSION >= 3 + return m; +#endif +} diff --git a/src/calibre/srv/tests/content.py b/src/calibre/srv/tests/content.py index ccc51a4045..b3543f4903 100644 --- a/src/calibre/srv/tests/content.py +++ b/src/calibre/srv/tests/content.py @@ -237,3 +237,14 @@ class ContentTest(LibraryBaseTest): root = html5_parse('

m') self.ae(get_length(root), 1) # }}} + + def test_html_as_json(self): # {{{ + from calibre.constants import plugins + Serializer = plugins['html_as_json'][0].Serializer + s = Serializer() + d = 'a' * (127 * 1024) + s.write(d) + d = d.encode('ascii') + s.write(d) + self.ae(s.done(), (d + d)) + # }}}