(2 * PyBytes_GET_SIZE(self->buf)))) != 0) return false;
- }
- memcpy(PyBytes_AS_STRING(self->buf) + self->used, data, sz);
- self->used = new_used;
+ *b++ = '"';
+ self->used = b - PyBytes_AS_STRING(self->buf);
return true;
}
@@ -84,6 +254,145 @@ pywrite(Serializer *self, PyObject *arg) {
Py_RETURN_NONE;
}
+// Report whether the NUL-terminated string `a` is exactly the `len` bytes
+// starting at `b`. `b` does not have to be NUL-terminated at `len` (callers
+// pass a slice of a longer string).
+static inline bool
+namespaces_are_equal(const char *a, const char *b, size_t len) {
+    for (size_t i = 0; i < len; i++) {
+        if (a[i] != b[i]) return false;
+        if (!b[i]) return true;
+    }
+    // All `len` bytes matched and none of them was NUL, so `a` holds at least
+    // `len` characters and a[len] is readable. The strings are equal only if
+    // `a` ends here; otherwise `b` is merely a prefix of `a`.
+    return a[len] == '\0';
+}
+
+// Return the position of the namespace given by the `nslen` bytes at `ns`
+// inside self->nsmap, appending it as a new entry when it is not present yet.
+static inline int
+namespace_index(Serializer *self, const char *ns, size_t nslen) {
+    const size_t known = self->nsmap->size();
+    size_t pos = 0;
+    while (pos < known) {
+        const char *candidate = (*self->nsmap)[pos].c_str();
+        if (namespaces_are_equal(candidate, ns, nslen)) return pos;
+        pos++;
+    }
+    // Not seen before: register it, its index is the new last position.
+    self->nsmap->push_back(std::string(ns, nslen));
+    return self->nsmap->size() - 1;
+}
+
+// Serialize one attribute as a JSON array: ["name","value"], or
+// ["name","value",N] when the attribute name has a {namespace}name form,
+// where N is the index returned by namespace_index() for that namespace.
+// `args` is parsed as a 2-tuple of strings (name, value).
+// Returns false when a Python error is pending on exit, true otherwise.
+static bool
+write_attr(Serializer *self, PyObject *args) {
+    const char *attr, *val;
+#if PY_MAJOR_VERSION > 2
+    if (!PyArg_ParseTuple(args, "ss", &attr, &val)) return false;
+#else
+    if (!PyArg_ParseTuple(args, "eses", "UTF-8", &attr, "UTF-8", &val)) return false;
+#endif
+    const char *b = strrchr(attr, '}');
+    const char *attr_name = attr;
+    int nsindex = -1;
+    // Require at least one byte before the '}' (the opening brace). For a
+    // name starting with '}' the length below would be -1, which wraps to a
+    // huge size_t when passed on.
+    if (b && b > attr) {
+        nsindex = namespace_index(self, attr + 1, b - attr - 1);
+        attr_name = b + 1;
+    }
+    if (!write_str_literal(self, "[")) goto end;
+    if (!write_string_as_json(self, attr_name)) goto end;
+    if (!write_str_literal(self, ",")) goto end;
+    if (!write_string_as_json(self, val)) goto end;
+    if (nsindex > -1) {
+        char buf[32];
+        // Stop on a failed write like every other write in this function,
+        // instead of continuing to emit output after the failure.
+        if (!write_data(self, buf, snprintf(buf, sizeof(buf), ",%d", nsindex))) goto end;
+    }
+    if (!write_str_literal(self, "]")) goto end;
+
+end:
+#if PY_MAJOR_VERSION < 3
+    // The "es" format allocates these buffers; PyMem_Free takes a non-const
+    // void*, so the const must be cast away explicitly in C++.
+    PyMem_Free((void*)attr); PyMem_Free((void*)val);
+#endif
+    return PyErr_Occurred() ? false : true;
+}
+
+// Begin serializing an element as a JSON object. Writes
+//   {"n":name[,"s":nsindex][,"x":text][,"l":tail][,"a":[attr,...]]
+// and deliberately does NOT write the closing '}', so the caller can append
+// further keys before closing the object itself.
+// `args` is (tag, text, tail, attrs): tag is a string, optionally in
+// {namespace}name form; attrs is a list whose items are handed to
+// write_attr(). On Python 3 text and tail may be None, in which case their
+// keys are omitted.
+// Returns None, or NULL when a Python error is pending on exit.
+static PyObject*
+start_tag(Serializer *self, PyObject *args) {
+    const char *tag, *text, *tail;
+    PyObject *items;
+#if PY_MAJOR_VERSION > 2
+    // "s" rather than "z" for the tag: it is dereferenced unconditionally
+    // below, so None must be rejected with a TypeError instead of producing
+    // a NULL pointer.
+    if (!PyArg_ParseTuple(args, "szzO!", &tag, &text, &tail, &PyList_Type, &items)) return NULL;
+#else
+    if (!PyArg_ParseTuple(args, "etetetO!", "UTF-8", &tag, "UTF-8", &text, "UTF-8", &tail, &PyList_Type, &items)) return NULL;
+#endif
+    Py_ssize_t num_attrs = PyList_Size(items);
+    const char *b = strrchr(tag, '}');
+    const char *tag_name = tag;
+    int nsindex = -1;
+    // Require at least one byte before the '}' (the opening brace). For a
+    // tag starting with '}' the length below would be -1, which wraps to a
+    // huge size_t when passed on.
+    if (b && b > tag) {
+        nsindex = namespace_index(self, tag + 1, b - tag - 1);
+        tag_name = b + 1;
+    }
+    if (!write_str_literal(self, "{\"n\":")) goto end;
+    if (!write_string_as_json(self, tag_name)) goto end;
+    if (nsindex > -1) {
+        char buf[32];
+        // Stop on a failed write like every other write in this function.
+        if (!write_data(self, buf, snprintf(buf, sizeof(buf), ",\"s\":%d", nsindex))) goto end;
+    }
+    if (text) {
+        if (!write_str_literal(self, ",\"x\":")) goto end;
+        if (!write_string_as_json(self, text)) goto end;
+    }
+    if (tail) {
+        if (!write_str_literal(self, ",\"l\":")) goto end;
+        if (!write_string_as_json(self, tail)) goto end;
+    }
+    if (num_attrs > 0) {
+        if (!write_str_literal(self, ",\"a\":[")) goto end;
+        for (Py_ssize_t i = 0; i < num_attrs; i++) {
+            if (i) { if (!write_str_literal(self, ",")) goto end; }
+            if (!write_attr(self, PyList_GET_ITEM(items, i))) goto end;
+        }
+        if (!write_str_literal(self, "]")) goto end;
+    }
+
+end:
+#if PY_MAJOR_VERSION < 3
+    // The "et" format allocates these buffers; PyMem_Free takes a non-const
+    // void*, so the const must be cast away explicitly in C++.
+    PyMem_Free((void*)tag); PyMem_Free((void*)text); PyMem_Free((void*)tail);
+#endif
+    if (PyErr_Occurred()) return NULL;
+    Py_RETURN_NONE;
+}
+
+// Serialize a comment-like node as a complete JSON object:
+//   {"s":type[,"x":text][,"l":tail]}
+// `args` is (text, tail, type). On Python 3 the "z" format lets text and
+// tail be None, in which case the corresponding key is omitted.
+// Returns None, or NULL when a Python error is pending on exit.
+static PyObject*
+add_comment(Serializer *self, PyObject *args) {
+    const char *text, *tail, *type;
+#if PY_MAJOR_VERSION > 2
+    if (!PyArg_ParseTuple(args, "zzs", &text, &tail, &type)) return NULL;
+#else
+    if (!PyArg_ParseTuple(args, "etets", "UTF-8", &text, "UTF-8", &tail, &type)) return NULL;
+#endif
+    if (!write_str_literal(self, "{\"s\":")) goto end;
+    if (!write_string_as_json(self, type)) goto end;
+    if (text) {
+        if (!write_str_literal(self, ",\"x\":")) goto end;
+        if (!write_string_as_json(self, text)) goto end;
+    }
+    if (tail) {
+        if (!write_str_literal(self, ",\"l\":")) goto end;
+        if (!write_string_as_json(self, tail)) goto end;
+    }
+    if (!write_str_literal(self, "}")) goto end;
+end:
+#if PY_MAJOR_VERSION < 3
+    // Only the "et" buffers are allocated for us (`type` uses "s" and is
+    // borrowed). PyMem_Free takes a non-const void*, so the const must be
+    // cast away explicitly in C++.
+    PyMem_Free((void*)text); PyMem_Free((void*)tail);
+#endif
+    if (PyErr_Occurred()) return NULL;
+    Py_RETURN_NONE;
+}
+
+// Write the namespace map as a JSON array of strings, in the order the
+// namespaces are stored in self->nsmap, so that array position equals the
+// index handed out by namespace_index(). Takes no arguments.
+// Returns None, or NULL as soon as any write reports failure.
+static PyObject*
+add_nsmap(Serializer *self, PyObject *args) {
+    (void)args;
+    if (!write_str_literal(self, "[")) return NULL;
+    bool is_first = true;
+    // Iterate by const reference: the entries are only read, so there is no
+    // reason to copy each string on every iteration.
+    for (const auto &x : *self->nsmap) {
+        if (is_first) is_first = false;
+        else if (!write_str_literal(self, ",")) return NULL;
+        if (!write_string_as_json(self, x.c_str())) return NULL;
+    }
+    if (!write_str_literal(self, "]")) return NULL;
+    Py_RETURN_NONE;
+}
+
static PyObject*
done(Serializer *self, PyObject *arg) {
(void)arg;
@@ -92,15 +401,24 @@ done(Serializer *self, PyObject *arg) {
PyObject *ans = self->buf;
self->buf = NULL;
self->used = 0;
+ self->nsmap->clear();
return ans;
}
-// Type definition {{{
-
+// Boilerplate {{{
static PyMethodDef Serializer_methods[] = {
+ {"start_tag", (PyCFunction)start_tag, METH_VARARGS,
+ "Start serializing a tag"
+ },
+ {"add_comment", (PyCFunction)add_comment, METH_VARARGS,
+ "Add a comment"
+ },
{"write", (PyCFunction)pywrite, METH_O,
"Write the specified unicode or bytes object"
},
+ {"add_nsmap", (PyCFunction)add_nsmap, METH_NOARGS,
+ "Add the namespace map"
+ },
{"done", (PyCFunction)done, METH_NOARGS,
"Get the serialized output"
},
@@ -147,7 +465,6 @@ PyTypeObject SerializerType = {
/* tp_alloc */ 0,
/* tp_new */ alloc,
};
-// }}}
static char doc[] = "Serialize HTML as JSON efficiently";
static PyMethodDef methods[] = {
@@ -194,3 +511,4 @@ CALIBRE_MODINIT_FUNC inithtml_as_json(void) {
return m;
#endif
}
+// }}}
diff --git a/src/calibre/srv/render_book.py b/src/calibre/srv/render_book.py
index 40f5285d06..f17ffe087b 100644
--- a/src/calibre/srv/render_book.py
+++ b/src/calibre/srv/render_book.py
@@ -15,6 +15,7 @@ from datetime import datetime
from functools import partial
from itertools import count
from math import ceil
+from lxml.etree import Comment
from css_parser import replaceUrls
from css_parser.css import CSSRule
@@ -243,10 +244,6 @@ def toc_anchor_map(toc):
return dict(ans)
-def serialize_parsed_html(root):
- return as_bytes(json.dumps(html_as_dict(root), ensure_ascii=False, separators=(',', ':')))
-
-
class SimpleContainer(ContainerBase):
tweak_mode = True
@@ -416,7 +413,7 @@ def transform_html(container, name, virtualize_resources, link_uid, link_to_map,
link_to_map.setdefault(lname, {}).setdefault(lfrag or '', set()).add(name)
a.set('data-' + link_uid, json.dumps({'name':lname, 'frag':lfrag}, ensure_ascii=False))
- shtml = serialize_parsed_html(root)
+ shtml = html_as_json(root)
with container.open(name, 'wb') as f:
f.write(shtml)
@@ -757,6 +754,47 @@ def ensure_body(root):
body.append(div)
+def html_as_json(root):
+    '''
+    Serialize the parsed HTML tree rooted at ``root`` to JSON, returning the
+    result of the native Serializer's ``done()``.
+
+    The output has the shape ``{"version":1,"tree":<element>,"nsmap":[...]}``.
+    If the ``html_as_json`` plugin entry is missing, fall back to dumping
+    ``html_as_dict(root)`` with the json module.
+
+    The tree is walked iteratively with an explicit stack. Besides nodes, the
+    stack holds ready-made byte strings (separators and closing brackets) that
+    are written verbatim when popped.
+    '''
+    try:
+        Serializer = plugins['html_as_json'][0].Serializer
+    except KeyError:
+        return as_bytes(json.dumps(html_as_dict(root), ensure_ascii=False, separators=(',', ':')))
+
+    def is_serialized(node):
+        # Elements and comments always produce output. Other special nodes
+        # (callable tag that is not Comment) only produce output when they
+        # have a tail. This must mirror the branches in the loop below.
+        node_tag = getattr(node, 'tag', html_as_json)
+        if not callable(node_tag) or node_tag is Comment:
+            return True
+        return bool(getattr(node, 'tail', None))
+
+    s = Serializer()
+    s.write(b'{"version":1,"tree":')
+    stack = [root]
+
+    while stack:
+        elem = stack.pop()
+        if isinstance(elem, bytes):
+            s.write(elem)
+            continue
+        # html_as_json is used purely as a callable sentinel for "no tag".
+        tag = getattr(elem, 'tag', html_as_json)
+        if callable(tag):
+            if tag is Comment:
+                s.add_comment(elem.text, elem.tail, 'c')
+            else:
+                tail = getattr(elem, 'tail', None)
+                if tail:
+                    s.add_comment(None, tail, 'o')
+            continue
+        s.start_tag(elem.tag, elem.text, elem.tail, elem.items())
+        all_children = tuple(elem.iterchildren())
+        if all_children:
+            s.write(b',"c":[')
+            stack.append(b']}')
+            # Drop children that will write nothing BEFORE queueing the
+            # separators. Otherwise a silent child leaves a dangling comma
+            # in the array and the output is not valid JSON.
+            children = tuple(filter(is_serialized, all_children))
+            # Push in reverse so that children pop in document order, each
+            # one after the first preceded by a comma.
+            for i in range(len(children) - 1, -1, -1):
+                stack.append(children[i])
+                if i:
+                    stack.append(b',')
+        else:
+            s.write(b'}')
+    s.write(b',"nsmap":')
+    s.add_nsmap()
+    s.write(b'}')
+    return s.done()
+
+
def html_as_dict(root):
ensure_body(root)
for child in tuple(root.iterchildren('*')):
diff --git a/src/calibre/srv/tests/content.py b/src/calibre/srv/tests/content.py
index b3543f4903..9b0b24f20e 100644
--- a/src/calibre/srv/tests/content.py
+++ b/src/calibre/srv/tests/content.py
@@ -240,6 +240,8 @@ class ContentTest(LibraryBaseTest):
def test_html_as_json(self): # {{{
from calibre.constants import plugins
+ from calibre.srv.render_book import html_as_json
+ from calibre.ebooks.oeb.parse_utils import html5_parse
Serializer = plugins['html_as_json'][0].Serializer
s = Serializer()
d = 'a' * (127 * 1024)
@@ -247,4 +249,23 @@ class ContentTest(LibraryBaseTest):
d = d.encode('ascii')
s.write(d)
self.ae(s.done(), (d + d))
+
+ def t(html, body_children, nsmap=('http://www.w3.org/1999/xhtml',)):
+ root = html5_parse(html)
+ raw = html_as_json(root)
+ # print(raw.decode('utf-8'))
+ data = json.loads(raw)
+ self.ae(data['version'], 1)
+ self.ae(tuple(data['nsmap']), nsmap)
+ bc = data['tree']['c'][1]['c']
+ self.ae(bc, body_children)
+
+ t('at
l', [{"n":"p","s":0,"x":"a","l":"l","c":[{"s":"c","x":"c","l":"t"}]}])
+ t('a', [{"n":"p","s":0,"x":"a","a":[['class','foo'],['id','bar']]}])
+ t(
+ '', [{'n': 'svg', 's': 1, 'a': [['href', 'h', 2]]}],
+ ('http://www.w3.org/1999/xhtml', 'http://www.w3.org/2000/svg', 'http://www.w3.org/1999/xlink')
+ )
+ text = 'š\n\t\\mÅ«s"'
+ t("
PeƱa".format(text), [{"n":"p","s":0,"x":"PeƱa","a":[['id',text]]}])
# }}}