children;
+ Py_INCREF(root);
+ stack.push(pyunique_ptr(root));
+ write_str_literal("{\"version\":1,\"tree\":");
+
+ while(!stack.empty()) {
+ pyunique_ptr e(std::move(stack.top()));
+ stack.pop();
+ PyObject *elem = e.get();
+ if (PyBytes_CheckExact(elem)) {
+ if (!this->write_data(PyBytes_AS_STRING(elem), PyBytes_GET_SIZE(elem))) return NULL;
+ continue;
+ }
+ StringOrNone tag(PyObject_GetAttrString(elem, "tag"));
+ StringOrNone text(PyObject_GetAttrString(elem, "text")), tail(PyObject_GetAttrString(elem, "tail"));
+ if (!tag || PyCallable_Check(tag.get())) {
+ const char *type = (tag && tag.get() == Comment) ? "c" : "o";
+ if (!this->add_comment(text.c_str(), tail.c_str(), type)) return NULL;
+ } else {
+ pyunique_ptr attrs(PyObject_CallMethod(elem, "items", NULL));
+ if (!attrs) return NULL;
+ if (!this->start_tag(tag.c_str(), text.c_str(), tail.c_str(), attrs.get())) return NULL;
+ pyunique_ptr iterator(PyObject_GetIter(elem));
+ if (!iterator) return NULL;
+ children.clear();
+ while(true) {
+ PyObject *child = PyIter_Next(iterator.get());
+ if (!child) { if (PyErr_Occurred()) return NULL; break; }
+ children.push_back(pyunique_ptr(child));
+ }
+ if (children.size() > 0) {
+#define push_literal(x) { \
+ PyObject *lt = PyBytes_FromStringAndSize(x, sizeof(x) - 1); \
+ if (!lt) return NULL; \
+ stack.push(pyunique_ptr(lt));}
+ if (!write_str_literal(",\"c\":[")) return NULL;
+ push_literal("]}");
+ for (size_t i = children.size(); i-- > 0;) {
+ stack.push(std::move(children[i]));
+ if (i != 0) push_literal(",");
+ }
+#undef push_literal
+ } else if (!write_str_literal("}")) return NULL;
+ }
+ }
+ if (!write_str_literal(",\"nsmap\":")) return NULL;
+ if (!this->add_nsmap()) return NULL;
+ if (!write_str_literal("}")) return NULL;
+
+ if (_PyBytes_Resize(&this->buf, this->used) != 0) return NULL;
+ PyObject *ans = this->buf;
+ this->buf = NULL;
+ this->used = 0;
+ this->nsmap.clear();
+ return ans;
+ }
+};
+
static PyObject*
-done(Serializer *self, PyObject *arg) {
- (void)arg;
- if (!self->buf) return PyBytes_FromString("");
- if (_PyBytes_Resize(&self->buf, self->used) != 0) return NULL;
- PyObject *ans = self->buf;
- self->buf = NULL;
- self->used = 0;
- self->nsmap->clear();
- return ans;
+serialize(PyObject *self, PyObject *args) {
+ (void)self;
+ try {
+ Serializer s;
+ return s.serialize(args);
+ } catch(const std::exception & err) {
+ PyErr_Format(PyExc_ValueError, "An error occurred while trying to serialize to JSON: %s", err.what());
+ return NULL;
+ } catch (...) {
+ PyErr_SetString(PyExc_ValueError, "An unknown error occurred while trying to serialize to JSON");
+ return NULL;
+ }
}
// Boilerplate {{{
-static PyMethodDef Serializer_methods[] = {
- {"start_tag", (PyCFunction)start_tag, METH_VARARGS,
- "Start serializing a tag"
- },
- {"add_comment", (PyCFunction)add_comment, METH_VARARGS,
- "Add a comment"
- },
- {"write", (PyCFunction)pywrite, METH_O,
- "Write the specified unicode or bytes object"
- },
- {"add_nsmap", (PyCFunction)add_nsmap, METH_NOARGS,
- "Add the namespace map"
- },
- {"done", (PyCFunction)done, METH_NOARGS,
- "Get the serialized output"
- },
- {NULL} /* Sentinel */
-};
-
-PyTypeObject SerializerType = {
- PyVarObject_HEAD_INIT(NULL, 0)
- /* tp_name */ "html_as_json.Serializer",
- /* tp_basicsize */ sizeof(Serializer),
- /* tp_itemsize */ 0,
- /* tp_dealloc */ (destructor)dealloc,
- /* tp_print */ 0,
- /* tp_getattr */ 0,
- /* tp_setattr */ 0,
- /* tp_compare */ 0,
- /* tp_repr */ 0,
- /* tp_as_number */ 0,
- /* tp_as_sequence */ 0,
- /* tp_as_mapping */ 0,
- /* tp_hash */ 0,
- /* tp_call */ 0,
- /* tp_str */ 0,
- /* tp_getattro */ 0,
- /* tp_setattro */ 0,
- /* tp_as_buffer */ 0,
- /* tp_flags */ Py_TPFLAGS_DEFAULT,
- /* tp_doc */ "Serializer",
- /* tp_traverse */ 0,
- /* tp_clear */ 0,
- /* tp_richcompare */ 0,
- /* tp_weaklistoffset */ 0,
- /* tp_iter */ 0,
- /* tp_iternext */ 0,
- /* tp_methods */ Serializer_methods,
- /* tp_members */ 0,
- /* tp_getset */ 0,
- /* tp_base */ 0,
- /* tp_dict */ 0,
- /* tp_descr_get */ 0,
- /* tp_descr_set */ 0,
- /* tp_dictoffset */ 0,
- /* tp_init */ 0,
- /* tp_alloc */ 0,
- /* tp_new */ alloc,
-};
-
static char doc[] = "Serialize HTML as JSON efficiently";
static PyMethodDef methods[] = {
+ {"serialize", (PyCFunction)serialize, METH_VARARGS,
+ "Serialize the provided lxml tree to JSON"
+ },
{NULL} /* Sentinel */
};
@@ -494,19 +465,11 @@ CALIBRE_MODINIT_FUNC inithtml_as_json(void) {
PyObject* m;
- if (PyType_Ready(&SerializerType) < 0) {
- INITERROR;
- }
-
-
m = INITMODULE;
if (m == NULL) {
INITERROR;
}
- PyModule_AddObject(m, "Serializer", (PyObject *)&SerializerType);
-
-
#if PY_MAJOR_VERSION >= 3
return m;
#endif
diff --git a/src/calibre/srv/render_book.py b/src/calibre/srv/render_book.py
index f17ffe087b..257077ad24 100644
--- a/src/calibre/srv/render_book.py
+++ b/src/calibre/srv/render_book.py
@@ -756,43 +756,15 @@ def ensure_body(root):
def html_as_json(root):
try:
- Serializer = plugins['html_as_json'][0].Serializer
+ serialize = plugins['html_as_json'][0].serialize
except KeyError:
return as_bytes(json.dumps(html_as_dict(root), ensure_ascii=False, separators=(',', ':')))
- s = Serializer()
- s.write(b'{"version":1,"tree":')
- stack = [root]
-
- while stack:
- elem = stack.pop()
- if isinstance(elem, bytes):
- s.write(elem)
- continue
- tag = getattr(elem, 'tag', html_as_json)
- if callable(tag):
- if tag is Comment:
- s.add_comment(elem.text, elem.tail, 'c')
- else:
- tail = getattr(elem, 'tail', None)
- if tail:
- s.add_comment(None, tail, 'o')
- continue
- s.start_tag(elem.tag, elem.text, elem.tail, elem.items())
- children = tuple(elem.iterchildren())
- if children:
- s.write(b',"c":[')
- stack.append(b']}')
- first_child = children[0]
- for c in reversed(children):
- stack.append(c)
- if c is not first_child:
- stack.append(b',')
- else:
- s.write(b'}')
- s.write(b',"nsmap":')
- s.add_nsmap()
- s.write(b'}')
- return s.done()
+ ensure_body(root)
+ for child in tuple(root.iterchildren('*')):
+ if child.tag.partition('}')[-1] not in ('head', 'body'):
+ root.remove(child)
+ root.text = root.tail = None
+ return serialize(root, Comment)
def html_as_dict(root):
diff --git a/src/calibre/srv/tests/content.py b/src/calibre/srv/tests/content.py
index 9b0b24f20e..8b5dcd4c11 100644
--- a/src/calibre/srv/tests/content.py
+++ b/src/calibre/srv/tests/content.py
@@ -239,16 +239,8 @@ class ContentTest(LibraryBaseTest):
# }}}
def test_html_as_json(self): # {{{
- from calibre.constants import plugins
from calibre.srv.render_book import html_as_json
from calibre.ebooks.oeb.parse_utils import html5_parse
- Serializer = plugins['html_as_json'][0].Serializer
- s = Serializer()
- d = 'a' * (127 * 1024)
- s.write(d)
- d = d.encode('ascii')
- s.write(d)
- self.ae(s.done(), (d + d))
def t(html, body_children, nsmap=('http://www.w3.org/1999/xhtml',)):
root = html5_parse(html)
@@ -268,4 +260,6 @@ class ContentTest(LibraryBaseTest):
)
text = 'š\n\t\\mūs"'
t("<p id='{}'>Peña".format(text), [{"n":"p","s":0,"x":"Peña","a":[['id',text]]}])
+ text = 'a' * (127 * 1024)
+ t('<p>{0}<p>{0}'.format(text), [{"n":"p","s":0,"x":text}, {'n':'p','s':0,'x':text}])
# }}}