From e33c18459a9c990cab71e7ed53adcc9fc3c64e3d Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 29 Oct 2019 12:50:26 +0530 Subject: [PATCH] Finish fast serialization of html to json --- src/calibre/srv/html_as_json.cpp | 354 +++++++++++++++++++++++++++++-- src/calibre/srv/render_book.py | 48 ++++- src/calibre/srv/tests/content.py | 21 ++ 3 files changed, 400 insertions(+), 23 deletions(-) diff --git a/src/calibre/srv/html_as_json.cpp b/src/calibre/srv/html_as_json.cpp index 7187cffe4e..ba555b323e 100644 --- a/src/calibre/srv/html_as_json.cpp +++ b/src/calibre/srv/html_as_json.cpp @@ -7,15 +7,28 @@ #include #include +#include +#include +#include +#include typedef struct { PyObject_HEAD /* Type-specific fields go here. */ PyObject *buf; size_t used; + std::vector *nsmap; } Serializer; +static void +dealloc(Serializer* self) +{ + Py_CLEAR(self->buf); + if (self->nsmap) delete self->nsmap; + Py_TYPE(self)->tp_free((PyObject*)self); +} + static PyObject * alloc(PyTypeObject *type, PyObject *args, PyObject *kwds) { @@ -25,31 +38,188 @@ alloc(PyTypeObject *type, PyObject *args, PyObject *kwds) if (self != NULL) { self->used = 0; self->buf = NULL; + self->nsmap = new (std::nothrow) std::vector(); + if (!self->nsmap) { PyErr_NoMemory(); dealloc(self); self = NULL; } } return (PyObject *)self; } -static void -dealloc(Serializer* self) -{ - Py_CLEAR(self->buf); - Py_TYPE(self)->tp_free((PyObject*)self); +static inline bool +ensure_space(Serializer *self, size_t amt) { + size_t required = amt + self->used; + if (!self->buf) { + self->buf = PyBytes_FromStringAndSize(NULL, std::max(required, static_cast(128u * 1024u))); + if (!self->buf) return false; + return true; + } + + if (required > static_cast(PyBytes_GET_SIZE(self->buf))) { + if (_PyBytes_Resize(&(self->buf), std::max(required, static_cast(2 * PyBytes_GET_SIZE(self->buf)))) != 0) return false; + } + return true; +} + +static bool +write_data(Serializer *self, const char *data, size_t sz) { + if (!ensure_space(self, sz)) return false; + memcpy(PyBytes_AS_STRING(self->buf) + self->used, data, sz); + self->used += sz; + return true; +} + +#define write_str_literal(self, x) write_data(self, x, sizeof(x)-1) + +#define UTF8_ACCEPT 0 +#define UTF8_REJECT 1 + +static const uint8_t utf8d[] = { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf + 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df + 0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef + 0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff + 0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2 + 1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4 + 1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6 + 1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8 +}; + +static inline void +utf8_decode_(uint32_t* state, uint32_t* codep, uint8_t byte) { + /* Comes from http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ + * Copyright (c) 2008-2009 Bjoern Hoehrmann + * Used under license: https://opensource.org/licenses/MIT + */ + uint32_t type = utf8d[byte]; + + *codep = (*state != UTF8_ACCEPT) ? + (byte & 0x3fu) | (*codep << 6) : + (0xff >> type) & (byte); + + *state = utf8d[256 + *state*16 + type]; +} + +static inline unsigned +utf8_read_char(const char *s, uint32_t *codep) { + unsigned len = 0; + uint32_t state = UTF8_ACCEPT; + while(true) { + utf8_decode_(&state, codep, s[len++]); + if (state == UTF8_ACCEPT) break; + else if (state == UTF8_REJECT) { return 0; } + } + return len; +} + +static inline void +to_surrogate_pair(uint32_t unicode, uint16_t *uc, uint16_t *lc) { + uint32_t n = unicode - 0x10000; + *uc = ((n >> 10) & 0x3FF) | 0xD800; + *lc = (n & 0x3FF) | 0xDC00; +} + +static inline unsigned +write_hex16(char *out, uint16_t val) { + static const char *hex = "0123456789ABCDEF"; + + *out++ = hex[(val >> 12) & 0xF]; + *out++ = hex[(val >> 8) & 0xF]; + *out++ = hex[(val >> 4) & 0xF]; + *out++ = hex[ val & 0xF]; + + return 4; } static bool -write_data(Serializer *self, const char *data, size_t sz) { - if (!self->buf) { - self->buf = PyBytes_FromStringAndSize(NULL, std::max(sz, static_cast(128u * 1024u))); - if (!self->buf) return false; +write_string_as_json(Serializer *self, const char *str) +{ + const char *s = str; + if (!ensure_space(self, 32)) return false; + char *b = PyBytes_AS_STRING(self->buf) + self->used; + + *b++ = '"'; + while (*s != 0) { + unsigned char c = *s++; + + /* Encode the next character, and write it to b. */ + switch (c) { + case '"': + *b++ = '\\'; + *b++ = '"'; + break; + case '\\': + *b++ = '\\'; + *b++ = '\\'; + break; + case '\b': + *b++ = '\\'; + *b++ = 'b'; + break; + case '\f': + *b++ = '\\'; + *b++ = 'f'; + break; + case '\n': + *b++ = '\\'; + *b++ = 'n'; + break; + case '\r': + *b++ = '\\'; + *b++ = 'r'; + break; + case '\t': + *b++ = '\\'; + *b++ = 't'; + break; + default: { + s--; + uint32_t unicode; + unsigned len = utf8_read_char(s, &unicode); + if (len == 0) s++; + else if (c < 0x1F) { + /* Encode using \u.... */ + s += len; + if (unicode <= 0xFFFF) { + *b++ = '\\'; + *b++ = 'u'; + b += write_hex16(b, unicode); + } else { + /* Produce a surrogate pair. */ + uint16_t uc, lc; + to_surrogate_pair(unicode, &uc, &lc); + *b++ = '\\'; + *b++ = 'u'; + b += write_hex16(b, uc); + *b++ = '\\'; + *b++ = 'u'; + b += write_hex16(b, lc); + } + } else { + /* Write the character directly. */ + while (len-- > 0) *b++ = *s++; + } + + break; + } + } + + /* + * Update self to know about the new bytes, + * and set up b to write another encoded character. + */ + self->used = b - PyBytes_AS_STRING(self->buf); + if (!ensure_space(self, 32)) return false; + b = PyBytes_AS_STRING(self->buf) + self->used; } - size_t new_used = self->used + sz; - if (new_used > static_cast(PyBytes_GET_SIZE(self->buf))) { - if (_PyBytes_Resize(&(self->buf), std::max(new_used, static_cast(2 * PyBytes_GET_SIZE(self->buf)))) != 0) return false; - } - memcpy(PyBytes_AS_STRING(self->buf) + self->used, data, sz); - self->used = new_used; + *b++ = '"'; + self->used = b - PyBytes_AS_STRING(self->buf); return true; } @@ -84,6 +254,145 @@ pywrite(Serializer *self, PyObject *arg) { Py_RETURN_NONE; } +static inline bool +namespaces_are_equal(const char *a, const char *b, size_t len) { + for (size_t i = 0; i < len; i++) { + if (a[i] != b[i]) return false; + if (!b[i]) return true; + } + return true; +} + +static inline int +namespace_index(Serializer *self, const char *ns, size_t nslen) { + for (size_t i = 0; i < self->nsmap->size(); i++) { + if (namespaces_are_equal((*self->nsmap)[i].c_str(), ns, nslen)) return i; + } + self->nsmap->push_back(std::string(ns, nslen)); + return self->nsmap->size() - 1; +} + +static bool +write_attr(Serializer *self, PyObject *args) { + const char *attr, *val; +#if PY_MAJOR_VERSION > 2 + if (!PyArg_ParseTuple(args, "ss", &attr, &val)) return false; +#else + if (!PyArg_ParseTuple(args, "eses", "UTF-8", &attr, "UTF-8", &val)) return false; +#endif + const char *b = strrchr(attr, '}'); + const char *attr_name = attr; + int nsindex = -1; + if (b) { + nsindex = namespace_index(self, attr + 1, b - attr - 1); + attr_name = b + 1; + } + if (!write_str_literal(self, "[")) goto end; + if (!write_string_as_json(self, attr_name)) goto end; + if (!write_str_literal(self, ",")) goto end; + if (!write_string_as_json(self, val)) goto end; + if (nsindex > -1) { + char buf[32]; + write_data(self, buf, snprintf(buf, sizeof(buf), ",%d", nsindex)); + } + if (!write_str_literal(self, "]")) goto end; + +end: +#if PY_MAJOR_VERSION < 3 + PyMem_Free(attr); PyMem_Free(val); +#endif + return PyErr_Occurred() ? false : true; +} + +static PyObject* +start_tag(Serializer *self, PyObject *args) { + const char *tag, *text, *tail; + PyObject *items; +#if PY_MAJOR_VERSION > 2 + if (!PyArg_ParseTuple(args, "zzzO!", &tag, &text, &tail, &PyList_Type, &items)) return NULL; +#else + if (!PyArg_ParseTuple(args, "etetetO!", "UTF-8", &tag, "UTF-8", &text, "UTF-8", &tail, &PyList_Type, &items)) return NULL; +#endif + Py_ssize_t num_attrs = PyList_Size(items); + const char *b = strrchr(tag, '}'); + const char *tag_name = tag; + int nsindex = -1; + if (b) { + nsindex = namespace_index(self, tag + 1, b - tag - 1); + tag_name = b + 1; + } + if (!write_str_literal(self, "{\"n\":")) goto end; + if (!write_string_as_json(self, tag_name)) goto end; + if (nsindex > -1) { + char buf[32]; + write_data(self, buf, snprintf(buf, sizeof(buf), ",\"s\":%d", nsindex)); + } + if (text) { + if (!write_str_literal(self, ",\"x\":")) goto end; + if (!write_string_as_json(self, text)) goto end; + } + if (tail) { + if (!write_str_literal(self, ",\"l\":")) goto end; + if (!write_string_as_json(self, tail)) goto end; + } + if (num_attrs > 0) { + if (!write_str_literal(self, ",\"a\":[")) goto end; + for (Py_ssize_t i = 0; i < num_attrs; i++) { + if (i) { if (!write_str_literal(self, ",")) goto end; } + if (!write_attr(self, PyList_GET_ITEM(items, i))) goto end; + } + if (!write_str_literal(self, "]")) goto end; + } + +end: +#if PY_MAJOR_VERSION < 3 + PyMem_Free(tag); PyMem_Free(text); PyMem_Free(tail); +#endif + if (PyErr_Occurred()) return NULL; + Py_RETURN_NONE; +} + +static PyObject* +add_comment(Serializer *self, PyObject *args) { + const char *text, *tail, *type; +#if PY_MAJOR_VERSION > 2 + if (!PyArg_ParseTuple(args, "zzs", &text, &tail, &type)) return NULL; +#else + if (!PyArg_ParseTuple(args, "etets", "UTF-8", &text, "UTF-8", &tail, &type)) return NULL; +#endif + if (!write_str_literal(self, "{\"s\":")) goto end; + if (!write_string_as_json(self, type)) goto end; + if (text) { + if (!write_str_literal(self, ",\"x\":")) goto end; + if (!write_string_as_json(self, text)) goto end; + } + if (tail) { + if (!write_str_literal(self, ",\"l\":")) goto end; + if (!write_string_as_json(self, tail)) goto end; + } + if (!write_str_literal(self, "}")) goto end; +end: +#if PY_MAJOR_VERSION < 3 + PyMem_Free(text); PyMem_Free(tail); +#endif + if (PyErr_Occurred()) return NULL; + Py_RETURN_NONE; +} + +static PyObject* +add_nsmap(Serializer *self, PyObject *args) { + (void)args; + if (!write_str_literal(self, "[")) return NULL; + bool is_first = true; + for (auto x : *self->nsmap) { + if (is_first) is_first = false; + else if (!write_str_literal(self, ",")) return NULL; + if (!write_string_as_json(self, x.c_str())) return NULL; + } + if (!write_str_literal(self, "]")) return NULL; + Py_RETURN_NONE; +} + static PyObject* done(Serializer *self, PyObject *arg) { (void)arg; @@ -92,15 +401,24 @@ done(Serializer *self, PyObject *arg) { PyObject *ans = self->buf; self->buf = NULL; self->used = 0; + self->nsmap->clear(); return ans; } -// Type definition {{{ - +// Boilerplate {{{ static PyMethodDef Serializer_methods[] = { + {"start_tag", (PyCFunction)start_tag, METH_VARARGS, + "Start serializing a tag" + }, + {"add_comment", (PyCFunction)add_comment, METH_VARARGS, + "Add a comment" + }, {"write", (PyCFunction)pywrite, METH_O, "Write the specified unicode or bytes object" }, + {"add_nsmap", (PyCFunction)add_nsmap, METH_NOARGS, + "Add the namespace map" + }, {"done", (PyCFunction)done, METH_NOARGS, "Get the serialized output" }, @@ -147,7 +465,6 @@ PyTypeObject SerializerType = { /* tp_alloc */ 0, /* tp_new */ alloc, }; -// }}} static char doc[] = "Serialize HTML as JSON efficiently"; static PyMethodDef methods[] = { @@ -194,3 +511,4 @@ CALIBRE_MODINIT_FUNC inithtml_as_json(void) { return m; #endif } +// }}} diff --git a/src/calibre/srv/render_book.py b/src/calibre/srv/render_book.py index 40f5285d06..f17ffe087b 100644 --- a/src/calibre/srv/render_book.py +++ b/src/calibre/srv/render_book.py @@ -15,6 +15,7 @@ from datetime import datetime from functools import partial from itertools import count from math import ceil +from lxml.etree import Comment from css_parser import replaceUrls from css_parser.css import CSSRule @@ -243,10 +244,6 @@ def toc_anchor_map(toc): return dict(ans) -def serialize_parsed_html(root): - return as_bytes(json.dumps(html_as_dict(root), ensure_ascii=False, separators=(',', ':'))) - - class SimpleContainer(ContainerBase): tweak_mode = True @@ -416,7 +413,7 @@ def transform_html(container, name, virtualize_resources, link_uid, link_to_map, link_to_map.setdefault(lname, {}).setdefault(lfrag or '', set()).add(name) a.set('data-' + link_uid, json.dumps({'name':lname, 'frag':lfrag}, ensure_ascii=False)) - shtml = serialize_parsed_html(root) + shtml = html_as_json(root) with container.open(name, 'wb') as f: f.write(shtml) @@ -757,6 +754,47 @@ def ensure_body(root): body.append(div) +def html_as_json(root): + try: + Serializer = plugins['html_as_json'][0].Serializer + except KeyError: + return as_bytes(json.dumps(html_as_dict(root), ensure_ascii=False, separators=(',', ':'))) + s = Serializer() + s.write(b'{"version":1,"tree":') + stack = [root] + + while stack: + elem = stack.pop() + if isinstance(elem, bytes): + s.write(elem) + continue + tag = getattr(elem, 'tag', html_as_json) + if callable(tag): + if tag is Comment: + s.add_comment(elem.text, elem.tail, 'c') + else: + tail = getattr(elem, 'tail', None) + if tail: + s.add_comment(None, tail, 'o') + continue + s.start_tag(elem.tag, elem.text, elem.tail, elem.items()) + children = tuple(elem.iterchildren()) + if children: + s.write(b',"c":[') + stack.append(b']}') + first_child = children[0] + for c in reversed(children): + stack.append(c) + if c is not first_child: + stack.append(b',') + else: + s.write(b'}') + s.write(b',"nsmap":') + s.add_nsmap() + s.write(b'}') + return s.done() + + def html_as_dict(root): ensure_body(root) for child in tuple(root.iterchildren('*')): diff --git a/src/calibre/srv/tests/content.py b/src/calibre/srv/tests/content.py index b3543f4903..9b0b24f20e 100644 --- a/src/calibre/srv/tests/content.py +++ b/src/calibre/srv/tests/content.py @@ -240,6 +240,8 @@ class ContentTest(LibraryBaseTest): def test_html_as_json(self): # {{{ from calibre.constants import plugins + from calibre.srv.render_book import html_as_json + from calibre.ebooks.oeb.parse_utils import html5_parse Serializer = plugins['html_as_json'][0].Serializer s = Serializer() d = 'a' * (127 * 1024) @@ -247,4 +249,23 @@ class ContentTest(LibraryBaseTest): d = d.encode('ascii') s.write(d) self.ae(s.done(), (d + d)) + + def t(html, body_children, nsmap=('http://www.w3.org/1999/xhtml',)): + root = html5_parse(html) + raw = html_as_json(root) + # print(raw.decode('utf-8')) + data = json.loads(raw) + self.ae(data['version'], 1) + self.ae(tuple(data['nsmap']), nsmap) + bc = data['tree']['c'][1]['c'] + self.ae(bc, body_children) + + t('

at

l', [{"n":"p","s":0,"x":"a","l":"l","c":[{"s":"c","x":"c","l":"t"}]}]) + t('

a', [{"n":"p","s":0,"x":"a","a":[['class','foo'],['id','bar']]}]) + t( + '', [{'n': 'svg', 's': 1, 'a': [['href', 'h', 2]]}], + ('http://www.w3.org/1999/xhtml', 'http://www.w3.org/2000/svg', 'http://www.w3.org/1999/xlink') + ) + text = '🐈\n\t\\mūs"' + t("

PeƱa".format(text), [{"n":"p","s":0,"x":"PeƱa","a":[['id',text]]}]) # }}}