Speed up counting of chars in tags

2025-08-30 23:00:21 -04:00 · 2024-09-18 15:47:06 +05:30 · 2024-09-18 15:47:06 +05:30 · 07037ddeb3
commit 07037ddeb3
parent df6e5863ec
3 changed files with 90 additions and 43 deletions
--- a/src/calibre/srv/render_book.py
+++ b/src/calibre/srv/render_book.py
@ -34,13 +34,17 @@ from calibre.utils.logging import default_log
 from calibre.utils.serialize import json_dumps, json_loads, msgpack_dumps, msgpack_loads
 from calibre.utils.short_uuid import uuid4
 from calibre_extensions.fast_css_transform import transform_properties
 from calibre_extensions.speedup import get_element_char_length
 from polyglot.binary import as_base64_unicode as encode_component
 from polyglot.binary import from_base64_bytes
 from polyglot.binary import from_base64_unicode as decode_component
 from polyglot.builtins import as_bytes, iteritems
 from polyglot.urllib import quote, urlparse
 try:
    from calibre_extensions.speedup import get_num_of_significant_chars
 except ImportError:  # running from source without updated binary
    def get_num_of_significant_chars(elem):
        '''
        Pure Python stand-in for the native speedup function of the same name.

        Follows the counting rules of the native implementation: characters
        with a code point of 32 or less are not counted, the text of script,
        noscript, style and title elements is skipped, and img and svg
        elements add a flat 1000. The tail is always counted.

        :param elem: Object exposing ``tag``, ``text`` and ``tail`` attributes.
        :return: Number of significant characters contributed by ``elem``.
        '''
        def significant(value):
            # text/tail are None when absent; anything that is not a str counts as empty
            if not isinstance(value, str):
                return 0
            return sum(1 for ch in value if ord(ch) > 32)

        ans = significant(getattr(elem, 'tail', None))
        tag = getattr(elem, 'tag', None)
        if not isinstance(tag, str):
            # No string tag: only the tail contributes
            return ans
        # Drop a leading '{namespace}' and compare case-insensitively
        name = tag.rpartition('}')[2].lower()
        if name in ('img', 'svg'):
            ans += 1000
        if name not in ('script', 'noscript', 'style', 'title'):
            ans += significant(getattr(elem, 'text', None))
        return ans
 RENDER_VERSION = 1
 BLANK_JPEG = b'\xff\xd8\xff\xdb\x00C\x00\x03\x02\x02\x02\x02\x02\x03\x02\x02\x02\x03\x03\x03\x03\x04\x06\x04\x04\x04\x04\x04\x08\x06\x06\x05\x06\t\x08\n\n\t\x08\t\t\n\x0c\x0f\x0c\n\x0b\x0e\x0b\t\t\r\x11\r\x0e\x0f\x10\x10\x11\x10\n\x0c\x12\x13\x12\x10\x13\x0f\x10\x10\x10\xff\xc9\x00\x0b\x08\x00\x01\x00\x01\x01\x01\x11\x00\xff\xcc\x00\x06\x00\x10\x10\x05\xff\xda\x00\x08\x01\x01\x00\x00?\x00\xd2\xcf \xff\xd9'  # noqa
@ -142,17 +146,10 @@ def anchor_map(root):
 def get_length(root):
    # Total the per-element character counts for every child of root that
    # matches XHTML('body'), plus all descendants of each such child.
    ans = 0
    # count(): per-element adapter around get_element_char_length. When the
    # tag is callable (or missing, in which case the default `count` is used)
    # only the tail is passed on; otherwise tag, text and tail are all passed.
    def count(elem):
        tag = getattr(elem, 'tag', count)
        if callable(tag):
            return get_element_char_length('', None, getattr(elem, 'tail', None))
        return get_element_char_length(tag, elem.text, elem.tail)
    for body in root.iterchildren(XHTML('body')):
-        ans += count(body)
+        ans += get_num_of_significant_chars(body)
        for elem in body.iterdescendants():
-            ans += count(elem)
+            ans += get_num_of_significant_chars(elem)
    return ans
--- a/src/calibre/srv/tests/fast_css_transform.py
+++ b/src/calibre/srv/tests/fast_css_transform.py
@ -10,6 +10,20 @@ from calibre_extensions.fast_css_transform import parse_css_number, transform_pr
 class TestTransform(SimpleTest):
    def test_counting_chars_in_elems(self):
        from lxml import etree
        from calibre.ebooks.oeb.polish.parsing import parse
        from calibre.srv.render_book import get_length
        # (markup, expected length) pairs. Whitespace is not counted, text
        # inside <script>/<style> is skipped and <svg>/<img> each add 1000.
        cases = (
            ('<p>abc<span>def</span>x yz<svg>howdy', 1014),
            ('<p>abc<span>def</span>x yz', 9),
            ('<p>abc<span>def</span><script>x yz', 6),
            ('<p>abc<span>def</span><style>x yz', 6),
            ('<p>abc<span>def</span>x yz<img>howdy', 1014),
        )
        for markup, expected in cases:
            root = parse(markup, force_html5_parse=True)
            # The serialized tree is the failure message, to show how the markup was parsed
            serialized = etree.tostring(root, encoding=str)
            self.assertEqual(expected, get_length(root), serialized)
    def test_number_parsing(self):
        for x in '.314 -.314 0.314 0 2 +2 -1 1e2 -3.14E+2 2e-2'.split():
            self.ae(parse_css_number(x), ast.literal_eval(x))
--- a/src/calibre/utils/speedup.c
+++ b/src/calibre/utils/speedup.c
@ -489,43 +489,79 @@ set_thread_name(PyObject *self, PyObject *args) {
 /* True for code points that are not counted as significant characters:
  * everything up to and including the space character (<= 32).
  * The argument is parenthesised so that an expression containing
  * lower-precedence operators (e.g. a conditional) expands correctly. */
 #define char_is_ignored(ch) ((ch) <= 32)
 /* Raw view of one Python str's storage, as filled in by
  * get_num_of_significant_chars(): `data` from PyUnicode_DATA, `kind` from
  * PyUnicode_KIND and `len` from PyUnicode_GET_LENGTH. The struct holds no
  * reference of its own. A zero-initialised udata has len 0 and therefore
  * contributes no characters when counted. */
 typedef struct udata {
    void *data; int kind; Py_ssize_t len;
 } udata;
 /* Number of code points in `text` for which char_is_ignored() is false.
  * Starts from the full length and subtracts one per ignored code point. */
 static size_t
-count_chars_in(PyObject *text) {
+count_chars_in(udata *text) {
-	size_t ans = 0;
+	size_t ans = text->len;
-	if (PyUnicode_READY(text) != 0) return 0;
+	for (Py_ssize_t i = 0; i < text->len; i++) if (char_is_ignored(PyUnicode_READ(text->kind, text->data, i))) ans--;
 	int kind = PyUnicode_KIND(text);
 	void *data = PyUnicode_DATA(text);
 	Py_ssize_t len = PyUnicode_GET_LENGTH(text);
 	ans = len;
 	for (Py_ssize_t i = 0; i < len; i++) {
 		if (char_is_ignored(PyUnicode_READ(kind, data, i))) ans--;
 	}
 	return ans;
 }
 /* count_chars(): significant characters contributed by one element.
  * The tail is always counted. The text is counted unless the tag is
  * script, noscript, style or title. img and svg add a flat 1000.
  * Anything up to and including the first '}' in tag_name is skipped and the
  * remainder is lower-cased (ASCII only) before comparison. Tag handling is
  * skipped entirely when tag_name is NULL or does not fit in ltagname. */
-static PyObject*
+static size_t
-get_element_char_length(PyObject *self, PyObject *args) {
+count_chars(const char *tag_name, Py_ssize_t tag_len, udata *text, udata *tail) {
 	(void)(self);
 	const char *tag_name;
 	PyObject *text, *tail;
 	if (!PyArg_ParseTuple(args, "sOO", &tag_name, &text, &tail)) return NULL;
 	const char *b = strrchr(tag_name, '}');
 	if (b) tag_name = b + 1;
 	char ltagname[16];
 	const size_t tag_name_len = strnlen(tag_name, sizeof(ltagname)-1);
 	for (size_t i = 0; i < tag_name_len; i++) {
 		if ('A' <= tag_name[i] && tag_name[i] <= 'Z') ltagname[i] = 32 + tag_name[i];
 		else ltagname[i] = tag_name[i];
 	}
 	int is_ignored_tag = 0;
 	size_t ans = 0;
-#define EQ(x) memcmp(ltagname, #x, sizeof(#x) - 1) == 0
+    int is_ignored_tag = 0;
-	if (EQ(script) || EQ(noscript) || EQ(style) || EQ(title)) {
+    char ltagname[16];
-        is_ignored_tag = 1;
+    if (tag_name) {
-    } else if (EQ(img) || EQ(svg)) ans += 1000;
+        const char *b = memchr(tag_name, '}', tag_len);
        if (b) {
            b++;
            tag_len -= b - tag_name;
            tag_name = b;
        }
        if (tag_len < sizeof(ltagname)) {
            memcpy(ltagname, tag_name, tag_len);
            for (size_t i = 0; i < tag_len; i++) if ('A' <= ltagname[i] && ltagname[i] <= 'Z') ltagname[i] += 32;
 /* NOTE(review): EQ() compares only tag_len bytes, so a tag shorter than the
  * literal matches as a prefix: "i" satisfies EQ(img) and "s" satisfies
  * EQ(script). When tag_len is larger than the literal, memcmp is asked to
  * read past the end of the literal. This probably also needs a
  * tag_len == sizeof(#x) - 1 check -- confirm. */
 #define EQ(x) (memcmp(ltagname, #x, tag_len) == 0)
            switch(ltagname[0]) {
                case 's':
                    if (EQ(script) || EQ(style)) is_ignored_tag = 1;
                    else if (EQ(svg)) ans += 1000;
                    break;
                case 'n':
                    if (EQ(noscript)) is_ignored_tag = 1;
                    break;
                case 't':
                    if (EQ(title)) is_ignored_tag = 1;
                    break;
                case 'i':
                    if (EQ(img)) ans += 1000;
                    break;
            }
        }
    }
 #undef EQ
-	if (tail != Py_None) ans += count_chars_in(tail);
+	ans += count_chars_in(tail);
-	if (text != Py_None && !is_ignored_tag) ans += count_chars_in(text);
+	if (!is_ignored_tag) ans += count_chars_in(text);
    return ans;
 }
 /*
  * get_num_of_significant_chars(elem) -> int
  *
  * Python entry point (METH_O). Reads the "tag", "text" and "tail" attributes
  * of elem and returns the result of count_chars() for them as a Python int.
  *
  * Only str attributes are counted. "text" is looked up only when "tag" is a
  * str that converts to UTF-8, so an element with any other kind of tag
  * contributes just its tail. A missing attribute is treated like a
  * non-string one instead of raising.
  */
 static PyObject*
 get_num_of_significant_chars(PyObject *self, PyObject *elem) {
    (void)(self);
    const char *tag_name = NULL;
    Py_ssize_t tag_len = 0;
    udata text_data = {0}, tail_data = {0};
    PyObject *text = NULL;

    PyObject *tag = PyObject_GetAttrString(elem, "tag");
    /* tag_name points into the UTF-8 cache owned by tag, valid while tag is referenced */
    if (tag && PyUnicode_Check(tag)) tag_name = PyUnicode_AsUTF8AndSize(tag, &tag_len);
    if (!tag_name) {
        /* A failed lookup or UTF-8 conversion leaves an exception set.
         * Returning a value while it is pending makes the interpreter raise
         * SystemError, so discard it and carry on without a tag. */
        PyErr_Clear();
        tag_len = 0;
    } else {
        text = PyObject_GetAttrString(elem, "text");
        if (!text) PyErr_Clear();
        else if (PyUnicode_Check(text)) {
            text_data.len = PyUnicode_GET_LENGTH(text);
            text_data.kind = PyUnicode_KIND(text);
            text_data.data = PyUnicode_DATA(text);
        }
    }

    PyObject *tail = PyObject_GetAttrString(elem, "tail");
    if (!tail) PyErr_Clear();
    else if (PyUnicode_Check(tail)) {
        tail_data.len = PyUnicode_GET_LENGTH(tail);
        tail_data.kind = PyUnicode_KIND(tail);
        tail_data.data = PyUnicode_DATA(tail);
    }

    size_t ans;
    /* count_chars() reads only the raw buffers captured above. The references
     * held in tag/text/tail keep those buffers alive until the decrefs below. */
    Py_BEGIN_ALLOW_THREADS
        ans = count_chars(tag_name, tag_len, &text_data, &tail_data);
    Py_END_ALLOW_THREADS;
    Py_XDECREF(tag); Py_XDECREF(text); Py_XDECREF(tail);
    return PyLong_FromSize_t(ans);
 }
@ -693,8 +729,8 @@ static PyMethodDef speedup_methods[] = {
 		"set_thread_name(name)\n\nWrapper for pthread_setname_np"
 	},
-	{"get_element_char_length", get_element_char_length, METH_VARARGS,
+	{"get_num_of_significant_chars", get_num_of_significant_chars, METH_O,
-		"get_element_char_length(tag_name, text, tail)\n\nGet the number of chars in specified tag"
+		"get_num_of_significant_chars(elem)\n\nGet the number of chars in specified tag"
 	},
    {NULL, NULL, 0, NULL}