mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Finish fast serialization of html to json
This commit is contained in:
parent
02838a712c
commit
e33c18459a
@ -7,15 +7,28 @@
|
|||||||
|
|
||||||
#include <Python.h>
|
#include <Python.h>
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
#include <cstdint>
|
||||||
|
#include <cstring>
|
||||||
|
#include <vector>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
PyObject_HEAD
|
PyObject_HEAD
|
||||||
/* Type-specific fields go here. */
|
/* Type-specific fields go here. */
|
||||||
PyObject *buf;
|
PyObject *buf;
|
||||||
size_t used;
|
size_t used;
|
||||||
|
std::vector<std::string> *nsmap;
|
||||||
} Serializer;
|
} Serializer;
|
||||||
|
|
||||||
|
|
||||||
|
static void
|
||||||
|
dealloc(Serializer* self)
|
||||||
|
{
|
||||||
|
Py_CLEAR(self->buf);
|
||||||
|
if (self->nsmap) delete self->nsmap;
|
||||||
|
Py_TYPE(self)->tp_free((PyObject*)self);
|
||||||
|
}
|
||||||
|
|
||||||
static PyObject *
|
static PyObject *
|
||||||
alloc(PyTypeObject *type, PyObject *args, PyObject *kwds)
|
alloc(PyTypeObject *type, PyObject *args, PyObject *kwds)
|
||||||
{
|
{
|
||||||
@ -25,31 +38,188 @@ alloc(PyTypeObject *type, PyObject *args, PyObject *kwds)
|
|||||||
if (self != NULL) {
|
if (self != NULL) {
|
||||||
self->used = 0;
|
self->used = 0;
|
||||||
self->buf = NULL;
|
self->buf = NULL;
|
||||||
|
self->nsmap = new (std::nothrow) std::vector<std::string>();
|
||||||
|
if (!self->nsmap) { PyErr_NoMemory(); dealloc(self); self = NULL; }
|
||||||
}
|
}
|
||||||
return (PyObject *)self;
|
return (PyObject *)self;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static void
|
static inline bool
|
||||||
dealloc(Serializer* self)
|
ensure_space(Serializer *self, size_t amt) {
|
||||||
{
|
size_t required = amt + self->used;
|
||||||
Py_CLEAR(self->buf);
|
if (!self->buf) {
|
||||||
Py_TYPE(self)->tp_free((PyObject*)self);
|
self->buf = PyBytes_FromStringAndSize(NULL, std::max(required, static_cast<size_t>(128u * 1024u)));
|
||||||
|
if (!self->buf) return false;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (required > static_cast<size_t>(PyBytes_GET_SIZE(self->buf))) {
|
||||||
|
if (_PyBytes_Resize(&(self->buf), std::max(required, static_cast<size_t>(2 * PyBytes_GET_SIZE(self->buf)))) != 0) return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool
|
||||||
|
write_data(Serializer *self, const char *data, size_t sz) {
|
||||||
|
if (!ensure_space(self, sz)) return false;
|
||||||
|
memcpy(PyBytes_AS_STRING(self->buf) + self->used, data, sz);
|
||||||
|
self->used += sz;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
#define write_str_literal(self, x) write_data(self, x, sizeof(x)-1)
|
||||||
|
|
||||||
|
#define UTF8_ACCEPT 0
|
||||||
|
#define UTF8_REJECT 1
|
||||||
|
|
||||||
|
static const uint8_t utf8d[] = {
|
||||||
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
|
||||||
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
|
||||||
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
|
||||||
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
|
||||||
|
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
|
||||||
|
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
|
||||||
|
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
|
||||||
|
0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
|
||||||
|
0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
|
||||||
|
0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
|
||||||
|
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
|
||||||
|
1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
|
||||||
|
1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
|
||||||
|
1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
|
||||||
|
};
|
||||||
|
|
||||||
|
static inline void
|
||||||
|
utf8_decode_(uint32_t* state, uint32_t* codep, uint8_t byte) {
|
||||||
|
/* Comes from http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
|
||||||
|
* Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
|
||||||
|
* Used under license: https://opensource.org/licenses/MIT
|
||||||
|
*/
|
||||||
|
uint32_t type = utf8d[byte];
|
||||||
|
|
||||||
|
*codep = (*state != UTF8_ACCEPT) ?
|
||||||
|
(byte & 0x3fu) | (*codep << 6) :
|
||||||
|
(0xff >> type) & (byte);
|
||||||
|
|
||||||
|
*state = utf8d[256 + *state*16 + type];
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline unsigned
|
||||||
|
utf8_read_char(const char *s, uint32_t *codep) {
|
||||||
|
unsigned len = 0;
|
||||||
|
uint32_t state = UTF8_ACCEPT;
|
||||||
|
while(true) {
|
||||||
|
utf8_decode_(&state, codep, s[len++]);
|
||||||
|
if (state == UTF8_ACCEPT) break;
|
||||||
|
else if (state == UTF8_REJECT) { return 0; }
|
||||||
|
}
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Split a code point above U+FFFF into its UTF-16 surrogate pair:
// *uc receives the high (lead) surrogate, *lc the low (trail) surrogate.
static inline void
to_surrogate_pair(uint32_t unicode, uint16_t *uc, uint16_t *lc) {
    const uint32_t offset = unicode - 0x10000;
    const uint32_t top_ten = (offset >> 10) & 0x3FF;
    const uint32_t bottom_ten = offset & 0x3FF;
    *uc = static_cast<uint16_t>(0xD800 | top_ten);
    *lc = static_cast<uint16_t>(0xDC00 | bottom_ten);
}
|
||||||
|
|
||||||
|
// Write `val` at `out` as exactly four upper-case hexadecimal digits, most
// significant first, with no terminating NUL. Returns the number of bytes
// written, which is always 4.
static inline unsigned
write_hex16(char *out, uint16_t val) {
    static const char digits[] = "0123456789ABCDEF";
    unsigned written = 0;
    for (int shift = 12; shift >= 0; shift -= 4) {
        out[written++] = digits[(val >> shift) & 0xF];
    }
    return written;
}
|
||||||
|
|
||||||
|
|
||||||
static bool
|
static bool
|
||||||
write_data(Serializer *self, const char *data, size_t sz) {
|
write_string_as_json(Serializer *self, const char *str)
|
||||||
if (!self->buf) {
|
{
|
||||||
self->buf = PyBytes_FromStringAndSize(NULL, std::max(sz, static_cast<size_t>(128u * 1024u)));
|
const char *s = str;
|
||||||
if (!self->buf) return false;
|
if (!ensure_space(self, 32)) return false;
|
||||||
|
char *b = PyBytes_AS_STRING(self->buf) + self->used;
|
||||||
|
|
||||||
|
*b++ = '"';
|
||||||
|
while (*s != 0) {
|
||||||
|
unsigned char c = *s++;
|
||||||
|
|
||||||
|
/* Encode the next character, and write it to b. */
|
||||||
|
switch (c) {
|
||||||
|
case '"':
|
||||||
|
*b++ = '\\';
|
||||||
|
*b++ = '"';
|
||||||
|
break;
|
||||||
|
case '\\':
|
||||||
|
*b++ = '\\';
|
||||||
|
*b++ = '\\';
|
||||||
|
break;
|
||||||
|
case '\b':
|
||||||
|
*b++ = '\\';
|
||||||
|
*b++ = 'b';
|
||||||
|
break;
|
||||||
|
case '\f':
|
||||||
|
*b++ = '\\';
|
||||||
|
*b++ = 'f';
|
||||||
|
break;
|
||||||
|
case '\n':
|
||||||
|
*b++ = '\\';
|
||||||
|
*b++ = 'n';
|
||||||
|
break;
|
||||||
|
case '\r':
|
||||||
|
*b++ = '\\';
|
||||||
|
*b++ = 'r';
|
||||||
|
break;
|
||||||
|
case '\t':
|
||||||
|
*b++ = '\\';
|
||||||
|
*b++ = 't';
|
||||||
|
break;
|
||||||
|
default: {
|
||||||
|
s--;
|
||||||
|
uint32_t unicode;
|
||||||
|
unsigned len = utf8_read_char(s, &unicode);
|
||||||
|
if (len == 0) s++;
|
||||||
|
else if (c < 0x1F) {
|
||||||
|
/* Encode using \u.... */
|
||||||
|
s += len;
|
||||||
|
if (unicode <= 0xFFFF) {
|
||||||
|
*b++ = '\\';
|
||||||
|
*b++ = 'u';
|
||||||
|
b += write_hex16(b, unicode);
|
||||||
|
} else {
|
||||||
|
/* Produce a surrogate pair. */
|
||||||
|
uint16_t uc, lc;
|
||||||
|
to_surrogate_pair(unicode, &uc, &lc);
|
||||||
|
*b++ = '\\';
|
||||||
|
*b++ = 'u';
|
||||||
|
b += write_hex16(b, uc);
|
||||||
|
*b++ = '\\';
|
||||||
|
*b++ = 'u';
|
||||||
|
b += write_hex16(b, lc);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
/* Write the character directly. */
|
||||||
|
while (len-- > 0) *b++ = *s++;
|
||||||
|
}
|
||||||
|
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Update self to know about the new bytes,
|
||||||
|
* and set up b to write another encoded character.
|
||||||
|
*/
|
||||||
|
self->used = b - PyBytes_AS_STRING(self->buf);
|
||||||
|
if (!ensure_space(self, 32)) return false;
|
||||||
|
b = PyBytes_AS_STRING(self->buf) + self->used;
|
||||||
}
|
}
|
||||||
size_t new_used = self->used + sz;
|
*b++ = '"';
|
||||||
if (new_used > static_cast<size_t>(PyBytes_GET_SIZE(self->buf))) {
|
self->used = b - PyBytes_AS_STRING(self->buf);
|
||||||
if (_PyBytes_Resize(&(self->buf), std::max(new_used, static_cast<size_t>(2 * PyBytes_GET_SIZE(self->buf)))) != 0) return false;
|
|
||||||
}
|
|
||||||
memcpy(PyBytes_AS_STRING(self->buf) + self->used, data, sz);
|
|
||||||
self->used = new_used;
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -84,6 +254,145 @@ pywrite(Serializer *self, PyObject *arg) {
|
|||||||
Py_RETURN_NONE;
|
Py_RETURN_NONE;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Compare a stored, NUL-terminated namespace `a` with the first `len` bytes
// of `b` (which need not be NUL-terminated at `len`).
//
// Returns true only when the two are the same string. A stored namespace that
// merely starts with the candidate is NOT a match: after the first `len`
// bytes agree, `a` must end there too.
static inline bool
namespaces_are_equal(const char *a, const char *b, size_t len) {
    for (size_t i = 0; i < len; i++) {
        if (a[i] != b[i]) return false;
        if (!b[i]) return true;  // both strings ended together before len
    }
    // All len bytes matched and none was NUL, so a[len] is within a's storage
    // (at worst its terminator) and is safe to read.
    return a[len] == '\0';
}
|
||||||
|
|
||||||
|
static inline int
|
||||||
|
namespace_index(Serializer *self, const char *ns, size_t nslen) {
|
||||||
|
for (size_t i = 0; i < self->nsmap->size(); i++) {
|
||||||
|
if (namespaces_are_equal((*self->nsmap)[i].c_str(), ns, nslen)) return i;
|
||||||
|
}
|
||||||
|
self->nsmap->push_back(std::string(ns, nslen));
|
||||||
|
return self->nsmap->size() - 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool
|
||||||
|
write_attr(Serializer *self, PyObject *args) {
|
||||||
|
const char *attr, *val;
|
||||||
|
#if PY_MAJOR_VERSION > 2
|
||||||
|
if (!PyArg_ParseTuple(args, "ss", &attr, &val)) return false;
|
||||||
|
#else
|
||||||
|
if (!PyArg_ParseTuple(args, "eses", "UTF-8", &attr, "UTF-8", &val)) return false;
|
||||||
|
#endif
|
||||||
|
const char *b = strrchr(attr, '}');
|
||||||
|
const char *attr_name = attr;
|
||||||
|
int nsindex = -1;
|
||||||
|
if (b) {
|
||||||
|
nsindex = namespace_index(self, attr + 1, b - attr - 1);
|
||||||
|
attr_name = b + 1;
|
||||||
|
}
|
||||||
|
if (!write_str_literal(self, "[")) goto end;
|
||||||
|
if (!write_string_as_json(self, attr_name)) goto end;
|
||||||
|
if (!write_str_literal(self, ",")) goto end;
|
||||||
|
if (!write_string_as_json(self, val)) goto end;
|
||||||
|
if (nsindex > -1) {
|
||||||
|
char buf[32];
|
||||||
|
write_data(self, buf, snprintf(buf, sizeof(buf), ",%d", nsindex));
|
||||||
|
}
|
||||||
|
if (!write_str_literal(self, "]")) goto end;
|
||||||
|
|
||||||
|
end:
|
||||||
|
#if PY_MAJOR_VERSION < 3
|
||||||
|
PyMem_Free(attr); PyMem_Free(val);
|
||||||
|
#endif
|
||||||
|
return PyErr_Occurred() ? false : true;
|
||||||
|
}
|
||||||
|
|
||||||
|
static PyObject*
|
||||||
|
start_tag(Serializer *self, PyObject *args) {
|
||||||
|
const char *tag, *text, *tail;
|
||||||
|
PyObject *items;
|
||||||
|
#if PY_MAJOR_VERSION > 2
|
||||||
|
if (!PyArg_ParseTuple(args, "zzzO!", &tag, &text, &tail, &PyList_Type, &items)) return NULL;
|
||||||
|
#else
|
||||||
|
if (!PyArg_ParseTuple(args, "etetetO!", "UTF-8", &tag, "UTF-8", &text, "UTF-8", &tail, &PyList_Type, &items)) return NULL;
|
||||||
|
#endif
|
||||||
|
Py_ssize_t num_attrs = PyList_Size(items);
|
||||||
|
const char *b = strrchr(tag, '}');
|
||||||
|
const char *tag_name = tag;
|
||||||
|
int nsindex = -1;
|
||||||
|
if (b) {
|
||||||
|
nsindex = namespace_index(self, tag + 1, b - tag - 1);
|
||||||
|
tag_name = b + 1;
|
||||||
|
}
|
||||||
|
if (!write_str_literal(self, "{\"n\":")) goto end;
|
||||||
|
if (!write_string_as_json(self, tag_name)) goto end;
|
||||||
|
if (nsindex > -1) {
|
||||||
|
char buf[32];
|
||||||
|
write_data(self, buf, snprintf(buf, sizeof(buf), ",\"s\":%d", nsindex));
|
||||||
|
}
|
||||||
|
if (text) {
|
||||||
|
if (!write_str_literal(self, ",\"x\":")) goto end;
|
||||||
|
if (!write_string_as_json(self, text)) goto end;
|
||||||
|
}
|
||||||
|
if (tail) {
|
||||||
|
if (!write_str_literal(self, ",\"l\":")) goto end;
|
||||||
|
if (!write_string_as_json(self, tail)) goto end;
|
||||||
|
}
|
||||||
|
if (num_attrs > 0) {
|
||||||
|
if (!write_str_literal(self, ",\"a\":[")) goto end;
|
||||||
|
for (Py_ssize_t i = 0; i < num_attrs; i++) {
|
||||||
|
if (i) { if (!write_str_literal(self, ",")) goto end; }
|
||||||
|
if (!write_attr(self, PyList_GET_ITEM(items, i))) goto end;
|
||||||
|
}
|
||||||
|
if (!write_str_literal(self, "]")) goto end;
|
||||||
|
}
|
||||||
|
|
||||||
|
end:
|
||||||
|
#if PY_MAJOR_VERSION < 3
|
||||||
|
PyMem_Free(tag); PyMem_Free(text); PyMem_Free(tail);
|
||||||
|
#endif
|
||||||
|
if (PyErr_Occurred()) return NULL;
|
||||||
|
Py_RETURN_NONE;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Python method add_comment(text, tail, type).
//
// Writes one complete JSON object: {"s":type} with optional "x" (text) and
// "l" (tail) members, each included only when the corresponding argument was
// supplied. Returns None, or NULL if a Python error is pending afterwards.
static PyObject*
add_comment(Serializer *self, PyObject *args) {
    const char *text, *tail, *type;
#if PY_MAJOR_VERSION > 2
    if (!PyArg_ParseTuple(args, "zzs", &text, &tail, &type)) return NULL;
#else
    if (!PyArg_ParseTuple(args, "etets", "UTF-8", &text, "UTF-8", &tail, &type)) return NULL;
#endif
    // Each step runs only if every earlier one succeeded.
    bool ok = write_str_literal(self, "{\"s\":") && write_string_as_json(self, type);
    if (ok && text) ok = write_str_literal(self, ",\"x\":") && write_string_as_json(self, text);
    if (ok && tail) ok = write_str_literal(self, ",\"l\":") && write_string_as_json(self, tail);
    if (ok) write_str_literal(self, "}");

#if PY_MAJOR_VERSION < 3
    PyMem_Free(text); PyMem_Free(tail);
#endif
    if (PyErr_Occurred()) return NULL;
    Py_RETURN_NONE;
}
|
||||||
|
|
||||||
|
// Python method add_nsmap().
//
// Writes the namespaces collected so far as a JSON array of strings, in index
// order. Returns None, or NULL as soon as a write fails.
static PyObject*
add_nsmap(Serializer *self, PyObject *args) {
    (void)args;
    if (!write_str_literal(self, "[")) return NULL;
    bool is_first = true;
    // Iterate by const reference: copying each std::string is wasted work and
    // the copy could throw inside a function called from C.
    for (const auto &ns : *self->nsmap) {
        if (is_first) is_first = false;
        else if (!write_str_literal(self, ",")) return NULL;
        if (!write_string_as_json(self, ns.c_str())) return NULL;
    }
    if (!write_str_literal(self, "]")) return NULL;
    Py_RETURN_NONE;
}
|
||||||
|
|
||||||
static PyObject*
|
static PyObject*
|
||||||
done(Serializer *self, PyObject *arg) {
|
done(Serializer *self, PyObject *arg) {
|
||||||
(void)arg;
|
(void)arg;
|
||||||
@ -92,15 +401,24 @@ done(Serializer *self, PyObject *arg) {
|
|||||||
PyObject *ans = self->buf;
|
PyObject *ans = self->buf;
|
||||||
self->buf = NULL;
|
self->buf = NULL;
|
||||||
self->used = 0;
|
self->used = 0;
|
||||||
|
self->nsmap->clear();
|
||||||
return ans;
|
return ans;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Type definition {{{
|
// Boilerplate {{{
|
||||||
|
|
||||||
static PyMethodDef Serializer_methods[] = {
|
static PyMethodDef Serializer_methods[] = {
|
||||||
|
{"start_tag", (PyCFunction)start_tag, METH_VARARGS,
|
||||||
|
"Start serializing a tag"
|
||||||
|
},
|
||||||
|
{"add_comment", (PyCFunction)add_comment, METH_VARARGS,
|
||||||
|
"Add a comment"
|
||||||
|
},
|
||||||
{"write", (PyCFunction)pywrite, METH_O,
|
{"write", (PyCFunction)pywrite, METH_O,
|
||||||
"Write the specified unicode or bytes object"
|
"Write the specified unicode or bytes object"
|
||||||
},
|
},
|
||||||
|
{"add_nsmap", (PyCFunction)add_nsmap, METH_NOARGS,
|
||||||
|
"Add the namespace map"
|
||||||
|
},
|
||||||
{"done", (PyCFunction)done, METH_NOARGS,
|
{"done", (PyCFunction)done, METH_NOARGS,
|
||||||
"Get the serialized output"
|
"Get the serialized output"
|
||||||
},
|
},
|
||||||
@ -147,7 +465,6 @@ PyTypeObject SerializerType = {
|
|||||||
/* tp_alloc */ 0,
|
/* tp_alloc */ 0,
|
||||||
/* tp_new */ alloc,
|
/* tp_new */ alloc,
|
||||||
};
|
};
|
||||||
// }}}
|
|
||||||
|
|
||||||
static char doc[] = "Serialize HTML as JSON efficiently";
|
static char doc[] = "Serialize HTML as JSON efficiently";
|
||||||
static PyMethodDef methods[] = {
|
static PyMethodDef methods[] = {
|
||||||
@ -194,3 +511,4 @@ CALIBRE_MODINIT_FUNC inithtml_as_json(void) {
|
|||||||
return m;
|
return m;
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
// }}}
|
||||||
|
@ -15,6 +15,7 @@ from datetime import datetime
|
|||||||
from functools import partial
|
from functools import partial
|
||||||
from itertools import count
|
from itertools import count
|
||||||
from math import ceil
|
from math import ceil
|
||||||
|
from lxml.etree import Comment
|
||||||
|
|
||||||
from css_parser import replaceUrls
|
from css_parser import replaceUrls
|
||||||
from css_parser.css import CSSRule
|
from css_parser.css import CSSRule
|
||||||
@ -243,10 +244,6 @@ def toc_anchor_map(toc):
|
|||||||
return dict(ans)
|
return dict(ans)
|
||||||
|
|
||||||
|
|
||||||
def serialize_parsed_html(root):
|
|
||||||
return as_bytes(json.dumps(html_as_dict(root), ensure_ascii=False, separators=(',', ':')))
|
|
||||||
|
|
||||||
|
|
||||||
class SimpleContainer(ContainerBase):
|
class SimpleContainer(ContainerBase):
|
||||||
|
|
||||||
tweak_mode = True
|
tweak_mode = True
|
||||||
@ -416,7 +413,7 @@ def transform_html(container, name, virtualize_resources, link_uid, link_to_map,
|
|||||||
link_to_map.setdefault(lname, {}).setdefault(lfrag or '', set()).add(name)
|
link_to_map.setdefault(lname, {}).setdefault(lfrag or '', set()).add(name)
|
||||||
a.set('data-' + link_uid, json.dumps({'name':lname, 'frag':lfrag}, ensure_ascii=False))
|
a.set('data-' + link_uid, json.dumps({'name':lname, 'frag':lfrag}, ensure_ascii=False))
|
||||||
|
|
||||||
shtml = serialize_parsed_html(root)
|
shtml = html_as_json(root)
|
||||||
with container.open(name, 'wb') as f:
|
with container.open(name, 'wb') as f:
|
||||||
f.write(shtml)
|
f.write(shtml)
|
||||||
|
|
||||||
@ -757,6 +754,47 @@ def ensure_body(root):
|
|||||||
body.append(div)
|
body.append(div)
|
||||||
|
|
||||||
|
|
||||||
|
def html_as_json(root):
    '''
    Serialize the parsed HTML tree rooted at ``root`` to bytes holding a JSON
    object of the form ``{"version":1,"tree":...,"nsmap":[...]}``.

    Uses the Serializer from the ``html_as_json`` plugin. When that plugin is
    not registered, falls back to ``json.dumps(html_as_dict(root))``.
    '''
    try:
        Serializer = plugins['html_as_json'][0].Serializer
    except KeyError:
        return as_bytes(json.dumps(html_as_dict(root), ensure_ascii=False, separators=(',', ':')))
    s = Serializer()
    s.write(b'{"version":1,"tree":')
    # Explicit stack instead of recursion. Besides nodes it holds raw bytes
    # (separators and closing brackets) that are written out when popped.
    stack = [root]

    while stack:
        elem = stack.pop()
        if isinstance(elem, bytes):
            s.write(elem)
            continue
        # The default is a callable, so an object without a tag attribute
        # takes the same branch as comments and other special nodes.
        tag = getattr(elem, 'tag', html_as_json)
        if callable(tag):
            if tag is Comment:
                s.add_comment(elem.text, elem.tail, 'c')
            else:
                # Always emit a node, even without tail text: the ','
                # separators around this child are already on the stack, so
                # writing nothing would yield invalid JSON ("[," or ",,").
                tail = getattr(elem, 'tail', None)
                s.add_comment(None, tail or None, 'o')
            continue
        s.start_tag(elem.tag, elem.text, elem.tail, elem.items())
        children = tuple(elem.iterchildren())
        if children:
            s.write(b',"c":[')
            stack.append(b']}')
            first_child = children[0]
            # Pushed in reverse so that children pop in document order, with
            # a ',' popped before every child except the first.
            for c in reversed(children):
                stack.append(c)
                if c is not first_child:
                    stack.append(b',')
        else:
            s.write(b'}')
    s.write(b',"nsmap":')
    s.add_nsmap()
    s.write(b'}')
    return s.done()
|
||||||
|
|
||||||
|
|
||||||
def html_as_dict(root):
|
def html_as_dict(root):
|
||||||
ensure_body(root)
|
ensure_body(root)
|
||||||
for child in tuple(root.iterchildren('*')):
|
for child in tuple(root.iterchildren('*')):
|
||||||
|
@ -240,6 +240,8 @@ class ContentTest(LibraryBaseTest):
|
|||||||
|
|
||||||
def test_html_as_json(self): # {{{
|
def test_html_as_json(self): # {{{
|
||||||
from calibre.constants import plugins
|
from calibre.constants import plugins
|
||||||
|
from calibre.srv.render_book import html_as_json
|
||||||
|
from calibre.ebooks.oeb.parse_utils import html5_parse
|
||||||
Serializer = plugins['html_as_json'][0].Serializer
|
Serializer = plugins['html_as_json'][0].Serializer
|
||||||
s = Serializer()
|
s = Serializer()
|
||||||
d = 'a' * (127 * 1024)
|
d = 'a' * (127 * 1024)
|
||||||
@ -247,4 +249,23 @@ class ContentTest(LibraryBaseTest):
|
|||||||
d = d.encode('ascii')
|
d = d.encode('ascii')
|
||||||
s.write(d)
|
s.write(d)
|
||||||
self.ae(s.done(), (d + d))
|
self.ae(s.done(), (d + d))
|
||||||
|
|
||||||
|
def t(html, body_children, nsmap=('http://www.w3.org/1999/xhtml',)):
|
||||||
|
root = html5_parse(html)
|
||||||
|
raw = html_as_json(root)
|
||||||
|
# print(raw.decode('utf-8'))
|
||||||
|
data = json.loads(raw)
|
||||||
|
self.ae(data['version'], 1)
|
||||||
|
self.ae(tuple(data['nsmap']), nsmap)
|
||||||
|
bc = data['tree']['c'][1]['c']
|
||||||
|
self.ae(bc, body_children)
|
||||||
|
|
||||||
|
t('<p>a<!--c-->t</p>l', [{"n":"p","s":0,"x":"a","l":"l","c":[{"s":"c","x":"c","l":"t"}]}])
|
||||||
|
t('<p class="foo" id="bar">a', [{"n":"p","s":0,"x":"a","a":[['class','foo'],['id','bar']]}])
|
||||||
|
t(
|
||||||
|
'<svg xlink:href="h"></svg>', [{'n': 'svg', 's': 1, 'a': [['href', 'h', 2]]}],
|
||||||
|
('http://www.w3.org/1999/xhtml', 'http://www.w3.org/2000/svg', 'http://www.w3.org/1999/xlink')
|
||||||
|
)
|
||||||
|
text = '🐈\n\t\\mūs"'
|
||||||
|
t("<p id='{}'>Peña".format(text), [{"n":"p","s":0,"x":"Peña","a":[['id',text]]}])
|
||||||
# }}}
|
# }}}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user