From e33c18459a9c990cab71e7ed53adcc9fc3c64e3d Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 29 Oct 2019 12:50:26 +0530
Subject: [PATCH] Finish fast serialization of html to json

---
 src/calibre/srv/html_as_json.cpp | 354 +++++++++++++++++++++++++++++--
 src/calibre/srv/render_book.py   |  48 ++++-
 src/calibre/srv/tests/content.py |  21 ++
 3 files changed, 400 insertions(+), 23 deletions(-)
diff --git a/src/calibre/srv/html_as_json.cpp b/src/calibre/srv/html_as_json.cpp
index 7187cffe4e..ba555b323e 100644
--- a/src/calibre/srv/html_as_json.cpp
+++ b/src/calibre/srv/html_as_json.cpp
@@ -7,15 +7,28 @@
 
 #include <Python.h>
 #include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <vector>
+#include <string>
 
 typedef struct {
     PyObject_HEAD
     /* Type-specific fields go here. */
 	PyObject *buf;
 	size_t used;
+	std::vector<std::string> *nsmap;  /* namespaces seen so far; an entry's position is the index written into the JSON. Allocated in alloc(), deleted in dealloc() */
 } Serializer;
 
 
+static void
+dealloc(Serializer* self)
+{	/* Release the output buffer and the namespace table, then free the object through its type's tp_free. */
+	Py_CLEAR(self->buf);
+	delete self->nsmap;  /* deleting a null pointer is a no-op, so no guard is needed */
+	Py_TYPE(self)->tp_free(reinterpret_cast<PyObject*>(self));
+}
+
 static PyObject *
 alloc(PyTypeObject *type, PyObject *args, PyObject *kwds)
 {
@@ -25,31 +38,188 @@ alloc(PyTypeObject *type, PyObject *args, PyObject *kwds)
 	if (self != NULL) {
 		self->used = 0;
 		self->buf = NULL;
+		self->nsmap = new (std::nothrow) std::vector<std::string>();
+		if (!self->nsmap) { PyErr_NoMemory(); dealloc(self); self = NULL; }
 	}
     return (PyObject *)self;
 }
 
 
-static void
-dealloc(Serializer* self)
-{
-	Py_CLEAR(self->buf);
-    Py_TYPE(self)->tp_free((PyObject*)self);
+static inline bool
+ensure_space(Serializer *self, size_t amt) {
+	/* Make room for amt more bytes past self->used. Returns false with a Python error set on failure. */
+	const size_t needed = self->used + amt;
+	if (self->buf == NULL) {  /* first write: allocate at least 128KB up front */
+		self->buf = PyBytes_FromStringAndSize(NULL, std::max(needed, static_cast<size_t>(128u * 1024u)));
+		return self->buf != NULL;
+	}
+	const size_t capacity = static_cast<size_t>(PyBytes_GET_SIZE(self->buf));
+	if (needed <= capacity) return true;
+	if (_PyBytes_Resize(&(self->buf), std::max(needed, 2 * capacity)) == 0) return true;  /* grow to at least double */
+	self->used = 0;  /* a failed resize frees the old buffer and NULLs self->buf, so the old length is stale */
+	return false;
+}
+
+static bool  /* Append sz raw bytes from data to the output buffer; false means ensure_space() failed. */
+write_data(Serializer *self, const char *data, size_t sz) {
+	const bool have_room = ensure_space(self, sz);
+	if (have_room) memcpy(PyBytes_AS_STRING(self->buf) + self->used, data, sz);
+	if (have_room) self->used += sz;
+	return have_room;
+}
+
+#define write_str_literal(self, x) write_data(self, x, sizeof(x)-1)  /* x must be a string literal or char array: sizeof(x)-1 is its length without the trailing NUL */
+
+#define UTF8_ACCEPT 0  /* decoder state: at a character boundary (initial state, and the state after a complete sequence) */
+#define UTF8_REJECT 1  /* decoder state: the bytes seen so far are not valid UTF-8 */
+
+static const uint8_t utf8d[] = {  /* entries 0..255: byte -> character class; remainder: state transitions, read as utf8d[256 + state*16 + class] */
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
+  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
+  8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
+  0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
+  0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
+  0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
+  1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
+  1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
+  1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
+};
+
+static inline void
+utf8_decode_(uint32_t* state, uint32_t* codep, uint8_t byte) {
+  /* Comes from http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
+   * Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
+   * Used under license: https://opensource.org/licenses/MIT
+   */
+  const uint32_t char_class = utf8d[byte];
+
+  /* Start of a sequence: keep the lead byte's payload bits. Otherwise append six more bits. */
+  if (*state == UTF8_ACCEPT) *codep = (0xff >> char_class) & byte;
+  else *codep = (*codep << 6) | (byte & 0x3fu);
+
+  *state = utf8d[256 + *state*16 + char_class];
+}
+
+static inline unsigned
+utf8_read_char(const char *s, uint32_t *codep) {
+	/* Decode one UTF-8 sequence starting at s into *codep.
+	 * Returns the number of bytes consumed, or 0 if the bytes are malformed. */
+	uint32_t state = UTF8_ACCEPT;
+	for (unsigned consumed = 0; ; ) {
+		utf8_decode_(&state, codep, s[consumed++]);
+		if (state == UTF8_REJECT) return 0;
+		if (state == UTF8_ACCEPT) return consumed;
+	}
+}
+
+static inline void  /* Split a code point above U+FFFF into a UTF-16 surrogate pair. */
+to_surrogate_pair(uint32_t unicode, uint16_t *uc, uint16_t *lc) {
+	const uint32_t offset = unicode - 0x10000;
+	*uc = static_cast<uint16_t>(0xD800 | ((offset >> 10) & 0x3FF));  /* high (leading) surrogate */
+	*lc = static_cast<uint16_t>(0xDC00 | (offset & 0x3FF));          /* low (trailing) surrogate */
+}
+
+static inline unsigned
+write_hex16(char *out, uint16_t val) {
+	/* Write val as exactly four uppercase hex digits (no NUL terminator);
+	 * returns the number of bytes written, which is always 4. */
+	static const char digits[] = "0123456789ABCDEF";
+	const unsigned ndigits = 4;
+	for (unsigned i = 0; i < ndigits; i++) {
+		out[i] = digits[(val >> (4 * (ndigits - 1 - i))) & 0xF];
+	}
+	return ndigits;
 }
 
 
 static bool
-write_data(Serializer *self, const char *data, size_t sz) {
-	if (!self->buf) {
-		self->buf = PyBytes_FromStringAndSize(NULL, std::max(sz, static_cast<size_t>(128u * 1024u)));
-		if (!self->buf) return false;
+write_string_as_json(Serializer *self, const char *str)
+{	/* Append str (NUL-terminated, expected to be UTF-8) to the output as a quoted JSON string. */
+	const char *s = str;
+	if (!ensure_space(self, 32)) return false;
+	char *b = PyBytes_AS_STRING(self->buf) + self->used;
+	/* 32 spare bytes exceed the most one loop step can emit (12) plus the closing quote. */
+	*b++ = '"';
+	while (*s != 0) {
+		unsigned char c = *s++;
+
+		/* Encode the next character, and write it to b. */
+		switch (c) {
+			case '"':
+				*b++ = '\\';
+				*b++ = '"';
+				break;
+			case '\\':
+				*b++ = '\\';
+				*b++ = '\\';
+				break;
+			case '\b':
+				*b++ = '\\';
+				*b++ = 'b';
+				break;
+			case '\f':
+				*b++ = '\\';
+				*b++ = 'f';
+				break;
+			case '\n':
+				*b++ = '\\';
+				*b++ = 'n';
+				break;
+			case '\r':
+				*b++ = '\\';
+				*b++ = 'r';
+				break;
+			case '\t':
+				*b++ = '\\';
+				*b++ = 't';
+				break;
+			default: {
+				s--;
+				uint32_t unicode;
+				unsigned len = utf8_read_char(s, &unicode);
+				if (len == 0) s++;  /* not valid UTF-8: drop this byte and carry on with the next one */
+				else if (c < 0x20) {  /* JSON requires every control character U+0000..U+001F to be escaped */
+					/* Encode using \u.... */
+					s += len;
+					if (unicode <= 0xFFFF) {
+						*b++ = '\\';
+						*b++ = 'u';
+						b += write_hex16(b, unicode);
+					} else {  /* not reached while only ASCII control characters are escaped */
+						/* Produce a surrogate pair. */
+						uint16_t uc, lc;
+						to_surrogate_pair(unicode, &uc, &lc);
+						*b++ = '\\';
+						*b++ = 'u';
+						b += write_hex16(b, uc);
+						*b++ = '\\';
+						*b++ = 'u';
+						b += write_hex16(b, lc);
+					}
+				} else {
+					/* Write the character directly. */
+					while (len-- > 0) *b++ = *s++;
+				}
+
+				break;
+			}
+		}
+
+		/*
+		 * Update self to know about the new bytes,
+		 * and set up b to write another encoded character.
+		 */
+		self->used = b - PyBytes_AS_STRING(self->buf);
+		if (!ensure_space(self, 32)) return false;
+		b = PyBytes_AS_STRING(self->buf) + self->used;
 	}
-	size_t new_used = self->used + sz;
-	if (new_used > static_cast<size_t>(PyBytes_GET_SIZE(self->buf))) {
-		if (_PyBytes_Resize(&(self->buf), std::max(new_used, static_cast<size_t>(2 * PyBytes_GET_SIZE(self->buf)))) != 0) return false;
-	}
-	memcpy(PyBytes_AS_STRING(self->buf) + self->used, data, sz);
-	self->used = new_used;
+	*b++ = '"';
+	self->used = b - PyBytes_AS_STRING(self->buf);
 	return true;
 }
 
@@ -84,6 +254,145 @@ pywrite(Serializer *self, PyObject *arg) {
 	Py_RETURN_NONE;
 }
 
+static inline bool  /* a: NUL-terminated stored namespace; b: candidate of len bytes, need not be terminated */
+namespaces_are_equal(const char *a, const char *b, size_t len) {
+	for (size_t i = 0; i < len; i++) {
+		if (a[i] != b[i]) return false;
+		if (!b[i]) return true;  /* both strings ended before len: equal */
+	}
+	return a[len] == '\0';  /* a must end here too, otherwise b is merely a prefix of a */
+}
+
+static inline int
+namespace_index(Serializer *self, const char *ns, size_t nslen) {  /* index of ns[0..nslen) in the table, appended if new */
+	std::vector<std::string> &known = *self->nsmap;
+	size_t pos = 0;
+	while (pos < known.size() && !namespaces_are_equal(known[pos].c_str(), ns, nslen)) pos++;
+	if (pos == known.size()) known.push_back(std::string(ns, nslen));
+	return pos;
+}
+
+static bool
+write_attr(Serializer *self, PyObject *args) {  /* args: (name, value) tuple; writes [name,value] or [name,value,nsindex] */
+	const char *attr, *val;
+#if PY_MAJOR_VERSION > 2
+	if (!PyArg_ParseTuple(args, "ss", &attr, &val)) return false;
+#else
+	if (!PyArg_ParseTuple(args, "eses", "UTF-8", &attr, "UTF-8", &val)) return false;
+#endif
+	const char *b = strrchr(attr, '}');  /* a namespaced name has the form {namespace}local */
+	const char *attr_name = attr;
+	int nsindex = -1;
+	if (b) {
+		nsindex = namespace_index(self, attr + 1, b - attr - 1);
+		attr_name = b + 1;
+	}
+	if (!write_str_literal(self, "[")) goto end;
+	if (!write_string_as_json(self, attr_name)) goto end;
+	if (!write_str_literal(self, ",")) goto end;
+	if (!write_string_as_json(self, val)) goto end;
+	if (nsindex > -1) {
+		char buf[32];
+		if (!write_data(self, buf, snprintf(buf, sizeof(buf), ",%d", nsindex))) goto end;
+	}
+	if (!write_str_literal(self, "]")) goto end;
+	/* Success and failure both reach the cleanup below; the result is taken from the Python error indicator. */
+end:
+#if PY_MAJOR_VERSION < 3
+	PyMem_Free(attr); PyMem_Free(val);
+#endif
+	return PyErr_Occurred() ? false : true;
+}
+
+static PyObject*
+start_tag(Serializer *self, PyObject *args) {  /* Python signature: start_tag(tag, text, tail, attribute_items_list) */
+	const char *tag, *text, *tail;
+	PyObject *items;
+#if PY_MAJOR_VERSION > 2
+	if (!PyArg_ParseTuple(args, "szzO!", &tag, &text, &tail, &PyList_Type, &items)) return NULL;  /* tag must not be None: it is dereferenced below */
+#else
+	if (!PyArg_ParseTuple(args, "etetetO!", "UTF-8", &tag, "UTF-8", &text, "UTF-8", &tail, &PyList_Type, &items)) return NULL;
+#endif
+	Py_ssize_t num_attrs = PyList_Size(items);
+	const char *b = strrchr(tag, '}');  /* a namespaced tag has the form {namespace}local */
+	const char *tag_name = tag;
+	int nsindex = -1;
+	if (b) {
+		nsindex = namespace_index(self, tag + 1, b - tag - 1);
+		tag_name = b + 1;
+	}
+	if (!write_str_literal(self, "{\"n\":")) goto end;
+	if (!write_string_as_json(self, tag_name)) goto end;
+	if (nsindex > -1) {
+		char buf[32];
+		if (!write_data(self, buf, snprintf(buf, sizeof(buf), ",\"s\":%d", nsindex))) goto end;
+	}
+	if (text) {
+		if (!write_str_literal(self, ",\"x\":")) goto end;
+		if (!write_string_as_json(self, text)) goto end;
+	}
+	if (tail) {
+		if (!write_str_literal(self, ",\"l\":")) goto end;
+		if (!write_string_as_json(self, tail)) goto end;
+	}
+	if (num_attrs > 0) {
+		if (!write_str_literal(self, ",\"a\":[")) goto end;
+		for (Py_ssize_t i = 0; i < num_attrs; i++) {
+			if (i) { if (!write_str_literal(self, ",")) goto end; }
+			if (!write_attr(self, PyList_GET_ITEM(items, i))) goto end;
+		}
+		if (!write_str_literal(self, "]")) goto end;
+	}
+	/* The JSON object is left open: no closing '}' is written by this function. */
+end:
+#if PY_MAJOR_VERSION < 3
+	PyMem_Free(tag); PyMem_Free(text); PyMem_Free(tail);
+#endif
+	if (PyErr_Occurred()) return NULL;
+	Py_RETURN_NONE;
+}
+
+static PyObject*
+add_comment(Serializer *self, PyObject *args) {
+	/* Python signature: add_comment(text, tail, type). The Python 3 branch accepts None for text and tail.
+	 * Emits {"s":type} with "x" (text) and "l" (tail) members added when they are given. */
+	const char *text, *tail, *type;
+#if PY_MAJOR_VERSION > 2
+	if (!PyArg_ParseTuple(args, "zzs", &text, &tail, &type)) return NULL;
+#else
+	if (!PyArg_ParseTuple(args, "etets", "UTF-8", &text, "UTF-8", &tail, &type)) return NULL;
+#endif
+	/* Each step runs only if everything before it succeeded. */
+	bool ok = write_str_literal(self, "{\"s\":") && write_string_as_json(self, type);
+	if (ok && text) {
+		ok = write_str_literal(self, ",\"x\":") && write_string_as_json(self, text);
+	}
+	if (ok && tail) {
+		ok = write_str_literal(self, ",\"l\":") && write_string_as_json(self, tail);
+	}
+	if (ok) ok = write_str_literal(self, "}");
+	(void)ok;  /* success or failure is reported through the Python error indicator below */
+#if PY_MAJOR_VERSION < 3
+	PyMem_Free(text); PyMem_Free(tail);
+#endif
+	if (PyErr_Occurred()) return NULL;
+	Py_RETURN_NONE;
+}
+
+static PyObject*
+add_nsmap(Serializer *self, PyObject *args) {  /* write the namespace table as a JSON array of strings, in index order */
+	(void)args;
+	if (!write_str_literal(self, "[")) return NULL;
+	bool is_first = true;
+	for (const auto &ns : *self->nsmap) {  /* bind by reference: no need to copy each string */
+		if (is_first) is_first = false;
+		else if (!write_str_literal(self, ",")) return NULL;
+		if (!write_string_as_json(self, ns.c_str())) return NULL;
+	}
+	if (!write_str_literal(self, "]")) return NULL;
+	Py_RETURN_NONE;
+}
+
 static PyObject*
 done(Serializer *self, PyObject *arg) {
 	(void)arg;
@@ -92,15 +401,24 @@ done(Serializer *self, PyObject *arg) {
 	PyObject *ans = self->buf;
 	self->buf = NULL;
 	self->used = 0;
+	self->nsmap->clear();
 	return ans;
 }
 
-// Type definition {{{
-
+// Boilerplate {{{
 static PyMethodDef Serializer_methods[] = {
+    {"start_tag", (PyCFunction)start_tag, METH_VARARGS,
+     "Start serializing a tag"
+    },
+    {"add_comment", (PyCFunction)add_comment, METH_VARARGS,
+     "Add a comment"
+    },
     {"write", (PyCFunction)pywrite, METH_O,
      "Write the specified unicode or bytes object"
     },
+    {"add_nsmap", (PyCFunction)add_nsmap, METH_NOARGS,
+     "Add the namespace map"
+    },
     {"done", (PyCFunction)done, METH_NOARGS,
      "Get the serialized output"
     },
@@ -147,7 +465,6 @@ PyTypeObject SerializerType = {
     /* tp_alloc          */ 0,
     /* tp_new            */ alloc,
 };
-// }}}
 
 static char doc[] = "Serialize HTML as JSON efficiently";
 static PyMethodDef methods[] = {
@@ -194,3 +511,4 @@ CALIBRE_MODINIT_FUNC inithtml_as_json(void) {
     return m;
 #endif
 }
+// }}}
diff --git a/src/calibre/srv/render_book.py b/src/calibre/srv/render_book.py
index 40f5285d06..f17ffe087b 100644
--- a/src/calibre/srv/render_book.py
+++ b/src/calibre/srv/render_book.py
@@ -15,6 +15,7 @@ from datetime import datetime
 from functools import partial
 from itertools import count
 from math import ceil
+from lxml.etree import Comment
 
 from css_parser import replaceUrls
 from css_parser.css import CSSRule
@@ -243,10 +244,6 @@ def toc_anchor_map(toc):
     return dict(ans)
 
 
-def serialize_parsed_html(root):
-    return as_bytes(json.dumps(html_as_dict(root), ensure_ascii=False, separators=(',', ':')))
-
-
 class SimpleContainer(ContainerBase):
 
     tweak_mode = True
@@ -416,7 +413,7 @@ def transform_html(container, name, virtualize_resources, link_uid, link_to_map,
                 link_to_map.setdefault(lname, {}).setdefault(lfrag or '', set()).add(name)
                 a.set('data-' + link_uid, json.dumps({'name':lname, 'frag':lfrag}, ensure_ascii=False))
 
-    shtml = serialize_parsed_html(root)
+    shtml = html_as_json(root)
     with container.open(name, 'wb') as f:
         f.write(shtml)
 
@@ -757,6 +754,47 @@ def ensure_body(root):
             body.append(div)
 
 
+def html_as_json(root):
+    ''' Serialize the parsed HTML tree at root to JSON bytes, via the native Serializer when the plugin is present '''
+    try:
+        Serializer = plugins['html_as_json'][0].Serializer
+    except KeyError:
+        return as_bytes(json.dumps(html_as_dict(root), ensure_ascii=False, separators=(',', ':')))
+    s = Serializer()
+    s.write(b'{"version":1,"tree":')
+    stack = [root]  # holds elements still to serialize, and raw bytes to write verbatim once popped
+    while stack:
+        elem = stack.pop()
+        if isinstance(elem, bytes):
+            s.write(elem)
+            continue
+        tag = getattr(elem, 'tag', html_as_json)
+        if callable(tag):
+            if tag is Comment:
+                s.add_comment(elem.text, elem.tail, 'c')
+            else:
+                # Always emit a node, even without a tail: the commas between siblings are
+                # already queued on the stack, so emitting nothing would yield invalid JSON
+                s.add_comment(None, getattr(elem, 'tail', None), 'o')
+            continue
+        s.start_tag(elem.tag, elem.text, elem.tail, elem.items())
+        children = tuple(elem.iterchildren())
+        if children:
+            s.write(b',"c":[')
+            stack.append(b']}')
+            first_child = children[0]
+            for c in reversed(children):
+                stack.append(c)
+                if c is not first_child:
+                    stack.append(b',')
+        else:
+            s.write(b'}')
+    s.write(b',"nsmap":')
+    s.add_nsmap()
+    s.write(b'}')
+    return s.done()
+
+
 def html_as_dict(root):
     ensure_body(root)
     for child in tuple(root.iterchildren('*')):
diff --git a/src/calibre/srv/tests/content.py b/src/calibre/srv/tests/content.py
index b3543f4903..9b0b24f20e 100644
--- a/src/calibre/srv/tests/content.py
+++ b/src/calibre/srv/tests/content.py
@@ -240,6 +240,8 @@ class ContentTest(LibraryBaseTest):
 
     def test_html_as_json(self):  # {{{
         from calibre.constants import plugins
+        from calibre.srv.render_book import html_as_json
+        from calibre.ebooks.oeb.parse_utils import html5_parse
         Serializer = plugins['html_as_json'][0].Serializer
         s = Serializer()
         d = 'a' * (127 * 1024)
@@ -247,4 +249,23 @@ class ContentTest(LibraryBaseTest):
         d = d.encode('ascii')
         s.write(d)
         self.ae(s.done(), (d + d))
+
+        def t(html, body_children, nsmap=('http://www.w3.org/1999/xhtml',)):
+            root = html5_parse(html)
+            raw = html_as_json(root)
+            # print(raw.decode('utf-8'))
+            data = json.loads(raw)
+            self.ae(data['version'], 1)
+            self.ae(tuple(data['nsmap']), nsmap)
+            bc = data['tree']['c'][1]['c']
+            self.ae(bc, body_children)
+
+        t('<p>a<!--c-->t</p>l', [{"n":"p","s":0,"x":"a","l":"l","c":[{"s":"c","x":"c","l":"t"}]}])
+        t('<p class="foo" id="bar">a', [{"n":"p","s":0,"x":"a","a":[['class','foo'],['id','bar']]}])
+        t(
+            '<svg xlink:href="h"></svg>', [{'n': 'svg', 's': 1, 'a': [['href', 'h', 2]]}],
+            ('http://www.w3.org/1999/xhtml', 'http://www.w3.org/2000/svg', 'http://www.w3.org/1999/xlink')
+        )
+        text = '🐈\n\t\\mūs"'
+        t("<p id='{}'>Peña".format(text), [{"n":"p","s":0,"x":"Peña","a":[['id',text]]}])
     # }}}