Finish fast serialization of html to json

This commit is contained in:
Kovid Goyal 2019-10-29 12:50:26 +05:30
parent 02838a712c
commit e33c18459a
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
3 changed files with 400 additions and 23 deletions

View File

@ -7,15 +7,28 @@
#include <Python.h> #include <Python.h>
#include <algorithm> #include <algorithm>
#include <cstdint>
#include <cstring>
#include <vector>
#include <string>
typedef struct { typedef struct {
PyObject_HEAD PyObject_HEAD
/* Type-specific fields go here. */ /* Type-specific fields go here. */
PyObject *buf; PyObject *buf;
size_t used; size_t used;
std::vector<std::string> *nsmap;
} Serializer; } Serializer;
/*
 * Destroy a Serializer: drop the reference to the output buffer, delete the
 * namespace table and hand the object's memory back to its type's tp_free.
 */
static void
dealloc(Serializer* self)
{
    Py_CLEAR(self->buf);
    /* deleting a null pointer is a no-op, so no explicit check is needed */
    delete self->nsmap;
    PyObject *as_object = (PyObject*)self;
    Py_TYPE(self)->tp_free(as_object);
}
static PyObject * static PyObject *
alloc(PyTypeObject *type, PyObject *args, PyObject *kwds) alloc(PyTypeObject *type, PyObject *args, PyObject *kwds)
{ {
@ -25,31 +38,188 @@ alloc(PyTypeObject *type, PyObject *args, PyObject *kwds)
if (self != NULL) { if (self != NULL) {
self->used = 0; self->used = 0;
self->buf = NULL; self->buf = NULL;
self->nsmap = new (std::nothrow) std::vector<std::string>();
if (!self->nsmap) { PyErr_NoMemory(); dealloc(self); self = NULL; }
} }
return (PyObject *)self; return (PyObject *)self;
} }
static void static inline bool
dealloc(Serializer* self) ensure_space(Serializer *self, size_t amt) {
{ size_t required = amt + self->used;
Py_CLEAR(self->buf); if (!self->buf) {
Py_TYPE(self)->tp_free((PyObject*)self); self->buf = PyBytes_FromStringAndSize(NULL, std::max(required, static_cast<size_t>(128u * 1024u)));
if (!self->buf) return false;
return true;
}
if (required > static_cast<size_t>(PyBytes_GET_SIZE(self->buf))) {
if (_PyBytes_Resize(&(self->buf), std::max(required, static_cast<size_t>(2 * PyBytes_GET_SIZE(self->buf)))) != 0) return false;
}
return true;
}
/*
 * Append `sz` raw bytes from `data` to the output buffer.
 * Returns false (leaving self->used untouched) if space could not be reserved.
 */
static bool
write_data(Serializer *self, const char *data, size_t sz) {
    if (ensure_space(self, sz)) {
        char *dest = PyBytes_AS_STRING(self->buf) + self->used;
        memcpy(dest, data, sz);
        self->used += sz;
        return true;
    }
    return false;
}
#define write_str_literal(self, x) write_data(self, x, sizeof(x)-1)
#define UTF8_ACCEPT 0
#define UTF8_REJECT 1
static const uint8_t utf8d[] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
};
/*
 * Feed one byte to the table-driven UTF-8 decoder.
 *
 * state: in/out decoder state; start a character with UTF8_ACCEPT.
 * codep: in/out accumulator for the code point being decoded.
 * byte:  the next input byte.
 *
 * The first 256 entries of utf8d give the byte's class; the entries from
 * index 256 on are the state transition table, indexed by state*16 + class.
 */
static inline void
utf8_decode_(uint32_t* state, uint32_t* codep, uint8_t byte) {
/* Comes from http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
* Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
* Used under license: https://opensource.org/licenses/MIT
*/
uint32_t type = utf8d[byte];
/* Mid-character: shift in the low six bits of the byte. At the start of a
 * character: keep only the bits selected by the byte's class. */
*codep = (*state != UTF8_ACCEPT) ?
(byte & 0x3fu) | (*codep << 6) :
(0xff >> type) & (byte);
*state = utf8d[256 + *state*16 + type];
}
/*
 * Decode a single UTF-8 encoded character starting at `s`.
 *
 * On success the code point is stored in *codep and the number of bytes
 * consumed is returned. Returns 0 as soon as the decoder rejects the input.
 */
static inline unsigned
utf8_read_char(const char *s, uint32_t *codep) {
    uint32_t state = UTF8_ACCEPT;
    unsigned consumed = 0;
    do {
        utf8_decode_(&state, codep, s[consumed++]);
        if (state == UTF8_REJECT) return 0;
    } while (state != UTF8_ACCEPT);
    return consumed;
}
/*
 * Split a code point above U+FFFF into its UTF-16 surrogate pair.
 * *uc receives the high (0xD800-based) unit, *lc the low (0xDC00-based) unit.
 */
static inline void
to_surrogate_pair(uint32_t unicode, uint16_t *uc, uint16_t *lc) {
    const uint32_t offset = unicode - 0x10000;
    const uint32_t high_bits = (offset >> 10) & 0x3FF;
    const uint32_t low_bits = offset & 0x3FF;
    *uc = static_cast<uint16_t>(high_bits | 0xD800);
    *lc = static_cast<uint16_t>(low_bits | 0xDC00);
}
/*
 * Write `val` as exactly four upper-case hexadecimal digits at `out`
 * (no terminator is written). Always returns 4, the number of bytes written.
 */
static inline unsigned
write_hex16(char *out, uint16_t val) {
    static const char digits[] = "0123456789ABCDEF";
    /* most significant nibble first */
    for (int shift = 12; shift >= 0; shift -= 4) {
        *out++ = digits[(val >> shift) & 0xF];
    }
    return 4;
}
static bool static bool
write_data(Serializer *self, const char *data, size_t sz) { write_string_as_json(Serializer *self, const char *str)
if (!self->buf) { {
self->buf = PyBytes_FromStringAndSize(NULL, std::max(sz, static_cast<size_t>(128u * 1024u))); const char *s = str;
if (!self->buf) return false; if (!ensure_space(self, 32)) return false;
char *b = PyBytes_AS_STRING(self->buf) + self->used;
*b++ = '"';
while (*s != 0) {
unsigned char c = *s++;
/* Encode the next character, and write it to b. */
switch (c) {
case '"':
*b++ = '\\';
*b++ = '"';
break;
case '\\':
*b++ = '\\';
*b++ = '\\';
break;
case '\b':
*b++ = '\\';
*b++ = 'b';
break;
case '\f':
*b++ = '\\';
*b++ = 'f';
break;
case '\n':
*b++ = '\\';
*b++ = 'n';
break;
case '\r':
*b++ = '\\';
*b++ = 'r';
break;
case '\t':
*b++ = '\\';
*b++ = 't';
break;
default: {
s--;
uint32_t unicode;
unsigned len = utf8_read_char(s, &unicode);
if (len == 0) s++;
else if (c < 0x1F) {
/* Encode using \u.... */
s += len;
if (unicode <= 0xFFFF) {
*b++ = '\\';
*b++ = 'u';
b += write_hex16(b, unicode);
} else {
/* Produce a surrogate pair. */
uint16_t uc, lc;
to_surrogate_pair(unicode, &uc, &lc);
*b++ = '\\';
*b++ = 'u';
b += write_hex16(b, uc);
*b++ = '\\';
*b++ = 'u';
b += write_hex16(b, lc);
} }
size_t new_used = self->used + sz; } else {
if (new_used > static_cast<size_t>(PyBytes_GET_SIZE(self->buf))) { /* Write the character directly. */
if (_PyBytes_Resize(&(self->buf), std::max(new_used, static_cast<size_t>(2 * PyBytes_GET_SIZE(self->buf)))) != 0) return false; while (len-- > 0) *b++ = *s++;
} }
memcpy(PyBytes_AS_STRING(self->buf) + self->used, data, sz);
self->used = new_used; break;
}
}
/*
* Update self to know about the new bytes,
* and set up b to write another encoded character.
*/
self->used = b - PyBytes_AS_STRING(self->buf);
if (!ensure_space(self, 32)) return false;
b = PyBytes_AS_STRING(self->buf) + self->used;
}
*b++ = '"';
self->used = b - PyBytes_AS_STRING(self->buf);
return true; return true;
} }
@ -84,6 +254,145 @@ pywrite(Serializer *self, PyObject *arg) {
Py_RETURN_NONE; Py_RETURN_NONE;
} }
/*
 * Compare a stored namespace with a candidate slice.
 *
 * a:   NUL-terminated stored namespace.
 * b:   candidate namespace; only the first `len` characters are significant
 *      (it need not be NUL-terminated at that point).
 * len: length of the candidate.
 *
 * Returns true only when `a` is exactly the first `len` characters of `b`;
 * a stored namespace that merely starts with the candidate does not match.
 */
static inline bool
namespaces_are_equal(const char *a, const char *b, size_t len) {
    for (size_t i = 0; i < len; i++) {
        if (a[i] != b[i]) return false;
        if (!b[i]) return true;
    }
    /* All len characters matched and none was NUL, so a[len] is readable;
     * a must end here or it is a longer, different namespace. */
    return a[len] == '\0';
}
/*
 * Return the position of the namespace ns[0..nslen) in the serializer's
 * namespace table, appending it first if it is not there yet.
 */
static inline int
namespace_index(Serializer *self, const char *ns, size_t nslen) {
    std::vector<std::string> &known = *self->nsmap;
    for (size_t pos = 0; pos < known.size(); pos++) {
        const char *candidate = known[pos].c_str();
        if (namespaces_are_equal(candidate, ns, nslen)) return pos;
    }
    known.push_back(std::string(ns, nslen));
    return known.size() - 1;
}
/*
 * Serialize one attribute as a JSON array: ["name","value"] or, for a
 * namespaced attribute, ["name","value",nsindex].
 *
 * args must be a (name, value) tuple of strings. A name in Clark notation
 * ("{namespace}local") is split into its local name and the index of the
 * namespace in the serializer's namespace table.
 *
 * Returns false if argument parsing failed or a Python error is set.
 */
static bool
write_attr(Serializer *self, PyObject *args) {
    const char *attr, *val;
#if PY_MAJOR_VERSION > 2
    if (!PyArg_ParseTuple(args, "ss", &attr, &val)) return false;
#else
    if (!PyArg_ParseTuple(args, "eses", "UTF-8", &attr, "UTF-8", &val)) return false;
#endif
    /* Only treat the name as namespaced when it really starts with '{';
     * otherwise a stray '}' would yield a bogus (even negative length)
     * namespace slice. */
    const char *b = (attr[0] == '{') ? strrchr(attr, '}') : NULL;
    const char *attr_name = attr;
    int nsindex = -1;
    if (b) {
        nsindex = namespace_index(self, attr + 1, b - attr - 1);
        attr_name = b + 1;
    }
    if (!write_str_literal(self, "[")) goto end;
    if (!write_string_as_json(self, attr_name)) goto end;
    if (!write_str_literal(self, ",")) goto end;
    if (!write_string_as_json(self, val)) goto end;
    if (nsindex > -1) {
        char buf[32];
        /* stop at the first failed write instead of appending after it */
        if (!write_data(self, buf, snprintf(buf, sizeof(buf), ",%d", nsindex))) goto end;
    }
    if (!write_str_literal(self, "]")) goto end;
end:
#if PY_MAJOR_VERSION < 3
    PyMem_Free(attr); PyMem_Free(val);
#endif
    return PyErr_Occurred() ? false : true;
}
/*
 * start_tag(tag, text, tail, items): begin the JSON object for an element.
 *
 * Writes {"n":name, then optionally "s" (namespace index, when tag is in
 * Clark notation "{namespace}local"), "x" (text), "l" (tail) and "a" (the
 * list of attributes, each written by write_attr). The closing brace is not
 * written here; it is left to the caller.
 *
 * tag must be a string; text and tail may be None; items must be a list of
 * (name, value) tuples. Returns None, or NULL with a Python error set.
 */
static PyObject*
start_tag(Serializer *self, PyObject *args) {
    const char *tag, *text, *tail;
    PyObject *items;
#if PY_MAJOR_VERSION > 2
    /* "s" rather than "z" for the tag: a None tag would otherwise arrive as
     * NULL and be dereferenced below. */
    if (!PyArg_ParseTuple(args, "szzO!", &tag, &text, &tail, &PyList_Type, &items)) return NULL;
#else
    if (!PyArg_ParseTuple(args, "etetetO!", "UTF-8", &tag, "UTF-8", &text, "UTF-8", &tail, &PyList_Type, &items)) return NULL;
#endif
    Py_ssize_t num_attrs = PyList_Size(items);
    /* Only a tag that starts with '{' carries a namespace. */
    const char *b = (tag[0] == '{') ? strrchr(tag, '}') : NULL;
    const char *tag_name = tag;
    int nsindex = -1;
    if (b) {
        nsindex = namespace_index(self, tag + 1, b - tag - 1);
        tag_name = b + 1;
    }
    if (!write_str_literal(self, "{\"n\":")) goto end;
    if (!write_string_as_json(self, tag_name)) goto end;
    if (nsindex > -1) {
        char buf[32];
        /* stop at the first failed write instead of appending after it */
        if (!write_data(self, buf, snprintf(buf, sizeof(buf), ",\"s\":%d", nsindex))) goto end;
    }
    if (text) {
        if (!write_str_literal(self, ",\"x\":")) goto end;
        if (!write_string_as_json(self, text)) goto end;
    }
    if (tail) {
        if (!write_str_literal(self, ",\"l\":")) goto end;
        if (!write_string_as_json(self, tail)) goto end;
    }
    if (num_attrs > 0) {
        if (!write_str_literal(self, ",\"a\":[")) goto end;
        for (Py_ssize_t i = 0; i < num_attrs; i++) {
            if (i) { if (!write_str_literal(self, ",")) goto end; }
            if (!write_attr(self, PyList_GET_ITEM(items, i))) goto end;
        }
        if (!write_str_literal(self, "]")) goto end;
    }
end:
#if PY_MAJOR_VERSION < 3
    PyMem_Free(tag); PyMem_Free(text); PyMem_Free(tail);
#endif
    if (PyErr_Occurred()) return NULL;
    Py_RETURN_NONE;
}
/*
 * add_comment(text, tail, type): write a complete JSON object for a
 * non-element node: {"s":type} with optional "x" (text) and "l" (tail)
 * members. text and tail may be None.
 * Returns None, or NULL when a Python error is set.
 */
static PyObject*
add_comment(Serializer *self, PyObject *args) {
    const char *text, *tail, *type;
#if PY_MAJOR_VERSION > 2
    if (!PyArg_ParseTuple(args, "zzs", &text, &tail, &type)) return NULL;
#else
    if (!PyArg_ParseTuple(args, "etets", "UTF-8", &text, "UTF-8", &tail, &type)) return NULL;
#endif
    /* Each step runs only while every earlier write succeeded. */
    bool ok = write_str_literal(self, "{\"s\":") && write_string_as_json(self, type);
    if (ok && text) {
        ok = write_str_literal(self, ",\"x\":") && write_string_as_json(self, text);
    }
    if (ok && tail) {
        ok = write_str_literal(self, ",\"l\":") && write_string_as_json(self, tail);
    }
    if (ok) write_str_literal(self, "}");
#if PY_MAJOR_VERSION < 3
    PyMem_Free(text); PyMem_Free(tail);
#endif
    if (PyErr_Occurred()) return NULL;
    Py_RETURN_NONE;
}
/*
 * add_nsmap(): write the namespace table as a JSON array of strings, in
 * index order. Returns None, or NULL if a write failed.
 */
static PyObject*
add_nsmap(Serializer *self, PyObject *args) {
    (void)args;
    if (!write_str_literal(self, "[")) return NULL;
    bool is_first = true;
    /* iterate by const reference: no need to copy every namespace string */
    for (const auto &x : *self->nsmap) {
        if (is_first) is_first = false;
        else if (!write_str_literal(self, ",")) return NULL;
        if (!write_string_as_json(self, x.c_str())) return NULL;
    }
    if (!write_str_literal(self, "]")) return NULL;
    Py_RETURN_NONE;
}
static PyObject* static PyObject*
done(Serializer *self, PyObject *arg) { done(Serializer *self, PyObject *arg) {
(void)arg; (void)arg;
@ -92,15 +401,24 @@ done(Serializer *self, PyObject *arg) {
PyObject *ans = self->buf; PyObject *ans = self->buf;
self->buf = NULL; self->buf = NULL;
self->used = 0; self->used = 0;
self->nsmap->clear();
return ans; return ans;
} }
// Type definition {{{ // Boilerplate {{{
static PyMethodDef Serializer_methods[] = { static PyMethodDef Serializer_methods[] = {
{"start_tag", (PyCFunction)start_tag, METH_VARARGS,
"Start serializing a tag"
},
{"add_comment", (PyCFunction)add_comment, METH_VARARGS,
"Add a comment"
},
{"write", (PyCFunction)pywrite, METH_O, {"write", (PyCFunction)pywrite, METH_O,
"Write the specified unicode or bytes object" "Write the specified unicode or bytes object"
}, },
{"add_nsmap", (PyCFunction)add_nsmap, METH_NOARGS,
"Add the namespace map"
},
{"done", (PyCFunction)done, METH_NOARGS, {"done", (PyCFunction)done, METH_NOARGS,
"Get the serialized output" "Get the serialized output"
}, },
@ -147,7 +465,6 @@ PyTypeObject SerializerType = {
/* tp_alloc */ 0, /* tp_alloc */ 0,
/* tp_new */ alloc, /* tp_new */ alloc,
}; };
// }}}
static char doc[] = "Serialize HTML as JSON efficiently"; static char doc[] = "Serialize HTML as JSON efficiently";
static PyMethodDef methods[] = { static PyMethodDef methods[] = {
@ -194,3 +511,4 @@ CALIBRE_MODINIT_FUNC inithtml_as_json(void) {
return m; return m;
#endif #endif
} }
// }}}

View File

@ -15,6 +15,7 @@ from datetime import datetime
from functools import partial from functools import partial
from itertools import count from itertools import count
from math import ceil from math import ceil
from lxml.etree import Comment
from css_parser import replaceUrls from css_parser import replaceUrls
from css_parser.css import CSSRule from css_parser.css import CSSRule
@ -243,10 +244,6 @@ def toc_anchor_map(toc):
return dict(ans) return dict(ans)
def serialize_parsed_html(root):
return as_bytes(json.dumps(html_as_dict(root), ensure_ascii=False, separators=(',', ':')))
class SimpleContainer(ContainerBase): class SimpleContainer(ContainerBase):
tweak_mode = True tweak_mode = True
@ -416,7 +413,7 @@ def transform_html(container, name, virtualize_resources, link_uid, link_to_map,
link_to_map.setdefault(lname, {}).setdefault(lfrag or '', set()).add(name) link_to_map.setdefault(lname, {}).setdefault(lfrag or '', set()).add(name)
a.set('data-' + link_uid, json.dumps({'name':lname, 'frag':lfrag}, ensure_ascii=False)) a.set('data-' + link_uid, json.dumps({'name':lname, 'frag':lfrag}, ensure_ascii=False))
shtml = serialize_parsed_html(root) shtml = html_as_json(root)
with container.open(name, 'wb') as f: with container.open(name, 'wb') as f:
f.write(shtml) f.write(shtml)
@ -757,6 +754,47 @@ def ensure_body(root):
body.append(div) body.append(div)
def html_as_json(root):
    '''
    Serialize the parsed HTML tree rooted at ``root`` to JSON bytes of the
    form ``{"version":1,"tree":...,"nsmap":[...]}``.

    Uses the ``html_as_json`` plugin's Serializer when the plugin is
    registered, otherwise falls back to ``json.dumps(html_as_dict(root))``.
    The tree is walked iteratively with an explicit stack; bytes objects on
    the stack are literal JSON fragments (separators and closing brackets)
    to be written verbatim.
    '''
    try:
        Serializer = plugins['html_as_json'][0].Serializer
    except KeyError:
        return as_bytes(json.dumps(html_as_dict(root), ensure_ascii=False, separators=(',', ':')))

    def produces_output(node):
        # Elements and comments are always written. Other special nodes
        # (callable tag) are written only when they carry tail text.
        tag = getattr(node, 'tag', html_as_json)
        if not callable(tag):
            return True
        return tag is Comment or bool(getattr(node, 'tail', None))

    s = Serializer()
    s.write(b'{"version":1,"tree":')
    stack = [root]
    while stack:
        elem = stack.pop()
        if isinstance(elem, bytes):
            s.write(elem)
            continue
        # html_as_json is used only as a callable sentinel for objects
        # that have no tag attribute
        tag = getattr(elem, 'tag', html_as_json)
        if callable(tag):
            if tag is Comment:
                s.add_comment(elem.text, elem.tail, 'c')
            else:
                tail = getattr(elem, 'tail', None)
                if tail:
                    s.add_comment(None, tail, 'o')
            continue
        s.start_tag(elem.tag, elem.text, elem.tail, elem.items())
        # Drop children that would write nothing, so that no dangling
        # comma separators end up in the "c" array.
        children = tuple(c for c in elem.iterchildren() if produces_output(c))
        if children:
            s.write(b',"c":[')
            stack.append(b']}')
            first_child = children[0]
            # Push in reverse so children pop in document order, with a
            # comma popped before every child except the first.
            for c in reversed(children):
                stack.append(c)
                if c is not first_child:
                    stack.append(b',')
        else:
            s.write(b'}')
    s.write(b',"nsmap":')
    s.add_nsmap()
    s.write(b'}')
    return s.done()
def html_as_dict(root): def html_as_dict(root):
ensure_body(root) ensure_body(root)
for child in tuple(root.iterchildren('*')): for child in tuple(root.iterchildren('*')):

View File

@ -240,6 +240,8 @@ class ContentTest(LibraryBaseTest):
def test_html_as_json(self): # {{{ def test_html_as_json(self): # {{{
from calibre.constants import plugins from calibre.constants import plugins
from calibre.srv.render_book import html_as_json
from calibre.ebooks.oeb.parse_utils import html5_parse
Serializer = plugins['html_as_json'][0].Serializer Serializer = plugins['html_as_json'][0].Serializer
s = Serializer() s = Serializer()
d = 'a' * (127 * 1024) d = 'a' * (127 * 1024)
@ -247,4 +249,23 @@ class ContentTest(LibraryBaseTest):
d = d.encode('ascii') d = d.encode('ascii')
s.write(d) s.write(d)
self.ae(s.done(), (d + d)) self.ae(s.done(), (d + d))
def t(html, body_children, nsmap=('http://www.w3.org/1999/xhtml',)):
root = html5_parse(html)
raw = html_as_json(root)
# print(raw.decode('utf-8'))
data = json.loads(raw)
self.ae(data['version'], 1)
self.ae(tuple(data['nsmap']), nsmap)
bc = data['tree']['c'][1]['c']
self.ae(bc, body_children)
t('<p>a<!--c-->t</p>l', [{"n":"p","s":0,"x":"a","l":"l","c":[{"s":"c","x":"c","l":"t"}]}])
t('<p class="foo" id="bar">a', [{"n":"p","s":0,"x":"a","a":[['class','foo'],['id','bar']]}])
t(
'<svg xlink:href="h"></svg>', [{'n': 'svg', 's': 1, 'a': [['href', 'h', 2]]}],
('http://www.w3.org/1999/xhtml', 'http://www.w3.org/2000/svg', 'http://www.w3.org/1999/xlink')
)
text = '🐈\n\t\\mūs"'
t("<p id='{}'>Peña".format(text), [{"n":"p","s":0,"x":"Peña","a":[['id',text]]}])
# }}} # }}}