mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Speedup counting of chars in tags
This commit is contained in:
parent
df6e5863ec
commit
07037ddeb3
@ -34,13 +34,17 @@ from calibre.utils.logging import default_log
|
|||||||
from calibre.utils.serialize import json_dumps, json_loads, msgpack_dumps, msgpack_loads
|
from calibre.utils.serialize import json_dumps, json_loads, msgpack_dumps, msgpack_loads
|
||||||
from calibre.utils.short_uuid import uuid4
|
from calibre.utils.short_uuid import uuid4
|
||||||
from calibre_extensions.fast_css_transform import transform_properties
|
from calibre_extensions.fast_css_transform import transform_properties
|
||||||
from calibre_extensions.speedup import get_element_char_length
|
|
||||||
from polyglot.binary import as_base64_unicode as encode_component
|
from polyglot.binary import as_base64_unicode as encode_component
|
||||||
from polyglot.binary import from_base64_bytes
|
from polyglot.binary import from_base64_bytes
|
||||||
from polyglot.binary import from_base64_unicode as decode_component
|
from polyglot.binary import from_base64_unicode as decode_component
|
||||||
from polyglot.builtins import as_bytes, iteritems
|
from polyglot.builtins import as_bytes, iteritems
|
||||||
from polyglot.urllib import quote, urlparse
|
from polyglot.urllib import quote, urlparse
|
||||||
|
|
||||||
|
try:
    from calibre_extensions.speedup import get_num_of_significant_chars
except ImportError:  # running from source without updated binary
    def get_num_of_significant_chars(elem):
        '''
        Pure Python stand-in used when the compiled speedup extension does not
        provide get_num_of_significant_chars().

        Returns the combined length of the text and tail of elem. Unlike the
        compiled implementation it counts every character (whitespace
        included) and does not special-case any tags, so its result is only
        an approximation.
        '''
        # text/tail are None (not missing) on elements without text, so the
        # getattr() default alone is not enough to avoid len(None).
        text = getattr(elem, 'text', None) or ''
        tail = getattr(elem, 'tail', None) or ''
        return len(text) + len(tail)
|
||||||
RENDER_VERSION = 1
|
RENDER_VERSION = 1
|
||||||
|
|
||||||
BLANK_JPEG = b'\xff\xd8\xff\xdb\x00C\x00\x03\x02\x02\x02\x02\x02\x03\x02\x02\x02\x03\x03\x03\x03\x04\x06\x04\x04\x04\x04\x04\x08\x06\x06\x05\x06\t\x08\n\n\t\x08\t\t\n\x0c\x0f\x0c\n\x0b\x0e\x0b\t\t\r\x11\r\x0e\x0f\x10\x10\x11\x10\n\x0c\x12\x13\x12\x10\x13\x0f\x10\x10\x10\xff\xc9\x00\x0b\x08\x00\x01\x00\x01\x01\x01\x11\x00\xff\xcc\x00\x06\x00\x10\x10\x05\xff\xda\x00\x08\x01\x01\x00\x00?\x00\xd2\xcf \xff\xd9' # noqa
|
BLANK_JPEG = b'\xff\xd8\xff\xdb\x00C\x00\x03\x02\x02\x02\x02\x02\x03\x02\x02\x02\x03\x03\x03\x03\x04\x06\x04\x04\x04\x04\x04\x08\x06\x06\x05\x06\t\x08\n\n\t\x08\t\t\n\x0c\x0f\x0c\n\x0b\x0e\x0b\t\t\r\x11\r\x0e\x0f\x10\x10\x11\x10\n\x0c\x12\x13\x12\x10\x13\x0f\x10\x10\x10\xff\xc9\x00\x0b\x08\x00\x01\x00\x01\x01\x01\x11\x00\xff\xcc\x00\x06\x00\x10\x10\x05\xff\xda\x00\x08\x01\x01\x00\x00?\x00\xd2\xcf \xff\xd9' # noqa
|
||||||
@ -142,17 +146,10 @@ def anchor_map(root):
|
|||||||
|
|
||||||
def get_length(root):
    '''
    Return the number of significant characters under the <body> children of
    root, as reported by get_num_of_significant_chars() for each body element
    and every one of its descendants.
    '''
    total = 0
    for body in root.iterchildren(XHTML('body')):
        # The body element itself first, then everything nested inside it
        total += get_num_of_significant_chars(body)
        total += sum(map(get_num_of_significant_chars, body.iterdescendants()))
    return total
|
||||||
|
|
||||||
|
|
||||||
|
@ -10,6 +10,20 @@ from calibre_extensions.fast_css_transform import parse_css_number, transform_pr
|
|||||||
|
|
||||||
class TestTransform(SimpleTest):
|
class TestTransform(SimpleTest):
|
||||||
|
|
||||||
|
def test_counting_chars_in_elems(self):
    from lxml import etree

    from calibre.ebooks.oeb.polish.parsing import parse
    from calibre.srv.render_book import get_length

    # (markup, expected length) pairs, checked in order
    cases = (
        ('<p>abc<span>def</span>x yz<svg>howdy', 1014),
        ('<p>abc<span>def</span>x yz', 9),
        ('<p>abc<span>def</span><script>x yz', 6),
        ('<p>abc<span>def</span><style>x yz', 6),
        ('<p>abc<span>def</span>x yz<img>howdy', 1014),
    )
    for markup, expected in cases:
        tree = parse(markup, force_html5_parse=True)
        # The serialized tree is used as the failure message to show what was actually parsed
        self.assertEqual(expected, get_length(tree), etree.tostring(tree, encoding=str))
|
||||||
|
|
||||||
def test_number_parsing(self):
|
def test_number_parsing(self):
|
||||||
for x in '.314 -.314 0.314 0 2 +2 -1 1e2 -3.14E+2 2e-2'.split():
|
for x in '.314 -.314 0.314 0 2 +2 -1 1e2 -3.14E+2 2e-2'.split():
|
||||||
self.ae(parse_css_number(x), ast.literal_eval(x))
|
self.ae(parse_css_number(x), ast.literal_eval(x))
|
||||||
|
@ -489,43 +489,79 @@ set_thread_name(PyObject *self, PyObject *args) {
|
|||||||
|
|
||||||
/* True for code points that do not count as significant characters:
 * everything up to and including the space character (32). The argument is
 * parenthesized so that expressions can be passed safely. */
#define char_is_ignored(ch) ((ch) <= 32)
|
||||||
|
|
||||||
|
typedef struct udata {
|
||||||
|
void *data; int kind; Py_ssize_t len;
|
||||||
|
} udata;
|
||||||
|
|
||||||
static size_t
|
static size_t
|
||||||
count_chars_in(PyObject *text) {
|
count_chars_in(udata *text) {
|
||||||
size_t ans = 0;
|
size_t ans = text->len;
|
||||||
if (PyUnicode_READY(text) != 0) return 0;
|
for (Py_ssize_t i = 0; i < text->len; i++) if (char_is_ignored(PyUnicode_READ(text->kind, text->data, i))) ans--;
|
||||||
int kind = PyUnicode_KIND(text);
|
|
||||||
void *data = PyUnicode_DATA(text);
|
|
||||||
Py_ssize_t len = PyUnicode_GET_LENGTH(text);
|
|
||||||
ans = len;
|
|
||||||
for (Py_ssize_t i = 0; i < len; i++) {
|
|
||||||
if (char_is_ignored(PyUnicode_READ(kind, data, i))) ans--;
|
|
||||||
}
|
|
||||||
return ans;
|
return ans;
|
||||||
}
|
}
|
||||||
|
|
||||||
static PyObject*
|
static size_t
|
||||||
get_element_char_length(PyObject *self, PyObject *args) {
|
count_chars(const char *tag_name, Py_ssize_t tag_len, udata *text, udata *tail) {
|
||||||
(void)(self);
|
|
||||||
const char *tag_name;
|
|
||||||
PyObject *text, *tail;
|
|
||||||
if (!PyArg_ParseTuple(args, "sOO", &tag_name, &text, &tail)) return NULL;
|
|
||||||
const char *b = strrchr(tag_name, '}');
|
|
||||||
if (b) tag_name = b + 1;
|
|
||||||
char ltagname[16];
|
|
||||||
const size_t tag_name_len = strnlen(tag_name, sizeof(ltagname)-1);
|
|
||||||
for (size_t i = 0; i < tag_name_len; i++) {
|
|
||||||
if ('A' <= tag_name[i] && tag_name[i] <= 'Z') ltagname[i] = 32 + tag_name[i];
|
|
||||||
else ltagname[i] = tag_name[i];
|
|
||||||
}
|
|
||||||
int is_ignored_tag = 0;
|
|
||||||
size_t ans = 0;
|
size_t ans = 0;
|
||||||
#define EQ(x) memcmp(ltagname, #x, sizeof(#x) - 1) == 0
|
int is_ignored_tag = 0;
|
||||||
if (EQ(script) || EQ(noscript) || EQ(style) || EQ(title)) {
|
char ltagname[16];
|
||||||
is_ignored_tag = 1;
|
if (tag_name) {
|
||||||
} else if (EQ(img) || EQ(svg)) ans += 1000;
|
const char *b = memchr(tag_name, '}', tag_len);
|
||||||
|
if (b) {
|
||||||
|
b++;
|
||||||
|
tag_len -= b - tag_name;
|
||||||
|
tag_name = b;
|
||||||
|
}
|
||||||
|
if (tag_len < sizeof(ltagname)) {
|
||||||
|
memcpy(ltagname, tag_name, tag_len);
|
||||||
|
for (size_t i = 0; i < tag_len; i++) if ('A' <= ltagname[i] && ltagname[i] <= 'Z') ltagname[i] += 32;
|
||||||
|
#define EQ(x) (memcmp(ltagname, #x, tag_len) == 0)
|
||||||
|
switch(ltagname[0]) {
|
||||||
|
case 's':
|
||||||
|
if (EQ(script) || EQ(style)) is_ignored_tag = 1;
|
||||||
|
else if (EQ(svg)) ans += 1000;
|
||||||
|
break;
|
||||||
|
case 'n':
|
||||||
|
if (EQ(noscript)) is_ignored_tag = 1;
|
||||||
|
break;
|
||||||
|
case 't':
|
||||||
|
if (EQ(title)) is_ignored_tag = 1;
|
||||||
|
break;
|
||||||
|
case 'i':
|
||||||
|
if (EQ(img)) ans += 1000;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
#undef EQ
|
#undef EQ
|
||||||
if (tail != Py_None) ans += count_chars_in(tail);
|
ans += count_chars_in(tail);
|
||||||
if (text != Py_None && !is_ignored_tag) ans += count_chars_in(text);
|
if (!is_ignored_tag) ans += count_chars_in(text);
|
||||||
|
return ans;
|
||||||
|
}
|
||||||
|
|
||||||
|
static PyObject*
|
||||||
|
get_num_of_significant_chars(PyObject *self, PyObject *elem) {
|
||||||
|
(void)(self);
|
||||||
|
const char *tag_name = NULL;
|
||||||
|
Py_ssize_t tag_len = 0;
|
||||||
|
PyObject *ptn = PyObject_GetAttrString(elem, "tag"), *text = NULL;
|
||||||
|
if (ptn && PyUnicode_Check(ptn)) tag_name = PyUnicode_AsUTF8AndSize(ptn, &tag_len);
|
||||||
|
udata xdata = {0}, tdata = {0};
|
||||||
|
if (tag_name) {
|
||||||
|
text = PyObject_GetAttrString(elem, "text");
|
||||||
|
if (text && PyUnicode_Check(text)) {
|
||||||
|
xdata.len = PyUnicode_GET_LENGTH(text); xdata.kind = PyUnicode_KIND(text); xdata.data = PyUnicode_DATA(text);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
PyObject *tail = PyObject_GetAttrString(elem, "tail");
|
||||||
|
if (tail && PyUnicode_Check(tail)) {
|
||||||
|
tdata.len = PyUnicode_GET_LENGTH(tail); tdata.kind = PyUnicode_KIND(tail); tdata.data = PyUnicode_DATA(tail);
|
||||||
|
}
|
||||||
|
size_t ans;
|
||||||
|
Py_BEGIN_ALLOW_THREADS
|
||||||
|
ans = count_chars(tag_name, tag_len, &xdata, &tdata);
|
||||||
|
Py_END_ALLOW_THREADS;
|
||||||
|
Py_XDECREF(ptn); Py_XDECREF(text); Py_XDECREF(tail);
|
||||||
return PyLong_FromSize_t(ans);
|
return PyLong_FromSize_t(ans);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -693,8 +729,8 @@ static PyMethodDef speedup_methods[] = {
|
|||||||
"set_thread_name(name)\n\nWrapper for pthread_setname_np"
|
"set_thread_name(name)\n\nWrapper for pthread_setname_np"
|
||||||
},
|
},
|
||||||
|
|
||||||
{"get_element_char_length", get_element_char_length, METH_VARARGS,
|
{"get_num_of_significant_chars", get_num_of_significant_chars, METH_O,
|
||||||
"get_element_char_length(tag_name, text, tail)\n\nGet the number of chars in specified tag"
|
"get_num_of_significant_chars(elem)\n\nGet the number of chars in specified tag"
|
||||||
},
|
},
|
||||||
|
|
||||||
{NULL, NULL, 0, NULL}
|
{NULL, NULL, 0, NULL}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user