Speed up char counting when preparing book

2025-07-09 03:04:10 -04:00 · 2019-10-27 12:38:01 +05:30 · 2019-10-27 12:38:01 +05:30 · a687204ec3
commit a687204ec3
parent 8582154527
3 changed files with 96 additions and 14 deletions
--- a/src/calibre/srv/render_book.py
+++ b/src/calibre/srv/render_book.py
@@ -20,7 +20,7 @@ from css_parser import replaceUrls
 from css_parser.css import CSSRule

 from calibre import detect_ncpus, force_unicode, prepare_string_for_xml
-from calibre.constants import iswindows
+from calibre.constants import iswindows, plugins
 from calibre.customize.ui import plugin_for_input_format
 from calibre.ebooks import parse_css_length
 from calibre.ebooks.css_transform_rules import StyleDeclaration
@@ -57,6 +57,7 @@ from polyglot.urllib import quote, urlparse
 RENDER_VERSION = 1

 BLANK_JPEG = b'\xff\xd8\xff\xdb\x00C\x00\x03\x02\x02\x02\x02\x02\x03\x02\x02\x02\x03\x03\x03\x03\x04\x06\x04\x04\x04\x04\x04\x08\x06\x06\x05\x06\t\x08\n\n\t\x08\t\t\n\x0c\x0f\x0c\n\x0b\x0e\x0b\t\t\r\x11\r\x0e\x0f\x10\x10\x11\x10\n\x0c\x12\x13\x12\x10\x13\x0f\x10\x10\x10\xff\xc9\x00\x0b\x08\x00\x01\x00\x01\x01\x01\x11\x00\xff\xcc\x00\x06\x00\x10\x10\x05\xff\xda\x00\x08\x01\x01\x00\x00?\x00\xd2\xcf \xff\xd9'  # noqa
+speedup = plugins['speedup'][0]


 def XPath(expr):
@@ -192,22 +193,29 @@ def anchor_map(root):


 def get_length(root):
-    strip_space = re.compile(r'\s+')
    ans = 0
-    ignore_tags = frozenset('script style title noscript'.split())

-    def count(elem):
-        num = 0
-        tname = elem.tag.rpartition('}')[-1].lower()
-        if elem.text and tname not in ignore_tags:
-            num += len(strip_space.sub('', elem.text))
-        if elem.tail:
-            num += len(strip_space.sub('', elem.tail))
-        if tname in 'img svg':
-            num += 1000
-        return num
+    fast = getattr(speedup, 'get_element_char_length', None)
+    if fast is None:
+        ignore_tags = frozenset('script style title noscript'.split())
+        img_tags = ('img', 'svg')
+        strip_space = re.compile(r'\s+')

-    for body in root.iterdescendants(XHTML('body')):
+        def count(elem):
+            num = 0
+            tname = elem.tag.rpartition('}')[-1].lower()
+            if elem.text and tname not in ignore_tags:
+                num += len(strip_space.sub('', elem.text))
+            if elem.tail:
+                num += len(strip_space.sub('', elem.tail))
+            if tname in img_tags:
+                num += 1000
+            return num
+    else:
+        def count(elem):
+            return fast(elem.tag, elem.text, elem.tail)
+
+    for body in root.iterchildren(XHTML('body')):
        ans += count(body)
        for elem in body.iterdescendants('*'):
            ans += count(elem)
--- a/src/calibre/srv/tests/content.py
+++ b/src/calibre/srv/tests/content.py
@@ -225,3 +225,13 @@ class ContentTest(LibraryBaseTest):
            self.ae(zlib.decompress(raw, 16+zlib.MAX_WBITS), data)

    # }}}
+
+    def test_char_count(self):  # {{{
+        from calibre.srv.render_book import get_length
+        from calibre.ebooks.oeb.parse_utils import html5_parse
+
+        root = html5_parse('<p>a b\nc\td\re')  # five letters separated by space, \n, \t and \r
+        self.ae(get_length(root), 5)  # only the five letters are expected to count
+        root = html5_parse('<script>xyz</script>a<iMg>b')  # script content plus a mixed-case img tag
+        self.ae(get_length(root), 1002)  # 'a' + 'b' + 1000 for the img; 'xyz' must not be counted
+    # }}}
--- a/src/calibre/utils/speedup.c
+++ b/src/calibre/utils/speedup.c
@@ -543,6 +543,66 @@ set_thread_name(PyObject *self, PyObject *args) {
 #endif
 }

+#define char_is_ignored(ch) ((ch) <= 32)  /* value <= 32: ASCII control chars and space */
+
+#if PY_MAJOR_VERSION > 2
+/* Count the code points of text that are not ignored; anything that is not a ready str counts as 0. */
+static size_t
+count_chars_in(PyObject *text) {
+	if (!PyUnicode_Check(text) || PyUnicode_READY(text) != 0) { PyErr_Clear(); return 0; }  /* str-only accessors follow; leave no exception pending */
+	int kind = PyUnicode_KIND(text);
+	void *data = PyUnicode_DATA(text);
+	Py_ssize_t len = PyUnicode_GET_LENGTH(text);
+	size_t ans = (size_t)len;  /* start from the full length, then subtract each ignored char */
+	for (Py_ssize_t i = 0; i < len; i++) {
+		if (char_is_ignored(PyUnicode_READ(kind, data, i))) ans--;
+	}
+	return ans;
+}
+#else  /* Python 2: text may be unicode or a byte string; any other type counts as 0 */
+static size_t
+count_chars_in(PyObject *text) {
+	size_t ans = 0;
+#define L(data, sz) { \
+	ans = sz; \
+	for (Py_ssize_t i = 0; i < sz; i++) { if (char_is_ignored((data)[i])) ans--; } \
+}
+	if (PyUnicode_Check(text)) {
+		L(PyUnicode_AS_UNICODE(text), PyUnicode_GET_SIZE(text));
+	} else if (PyBytes_Check(text)) {  /* read as unsigned: with a signed char, bytes >= 0x80 would compare as <= 32 */
+		L((const unsigned char*)PyBytes_AS_STRING(text), PyBytes_GET_SIZE(text));
+	}
+	return ans;
+#undef L
+}
+#endif
+
+/* Char count of one element: non-ignored chars of tail, plus those of text unless the tag is script, noscript, style or title; img and svg add 1000. */
+static PyObject*
+get_element_char_length(PyObject *self, PyObject *args) {
+	(void)(self);
+	const char *tag_name;
+	PyObject *text, *tail;
+	if (!PyArg_ParseTuple(args, "sOO", &tag_name, &text, &tail)) return NULL;
+	const char *b = strrchr(tag_name, '}');  /* keep only what follows the last '}' (drops a {namespace} prefix) */
+	if (b) tag_name = b + 1;
+	char ltagname[16] = {0};  /* zero-filled so no byte of it is ever read uninitialized */
+	const size_t tag_name_len = strnlen(tag_name, sizeof(ltagname)-1);  /* capped at 15; no compared name is that long */
+	for (size_t i = 0; i < tag_name_len; i++) {  /* ASCII-lowercased copy of the (possibly truncated) name */
+		ltagname[i] = ('A' <= tag_name[i] && tag_name[i] <= 'Z') ? (char)(tag_name[i] + 32) : tag_name[i];
+	}
+	int is_ignored_tag = 0;
+	size_t ans = 0;
+#define EQ(x) (tag_name_len == sizeof(#x) - 1 && memcmp(ltagname, #x, sizeof(#x) - 1) == 0)  /* exact match, not a prefix match */
+	if (EQ(script) || EQ(noscript) || EQ(style) || EQ(title)) is_ignored_tag = 1;
+	if (EQ(img) || EQ(svg)) ans += 1000;
+#undef EQ
+	if (tail != Py_None) ans += count_chars_in(tail);
+	if (text != Py_None && !is_ignored_tag) ans += count_chars_in(text);
+	return PyLong_FromSize_t(ans);
+}
+
+
 static PyMethodDef speedup_methods[] = {
    {"parse_date", speedup_parse_date, METH_VARARGS,
        "parse_date()\n\nParse ISO dates faster (specialized for dates stored in the calibre db)."
@@ -590,6 +650,10 @@ static PyMethodDef speedup_methods[] = {
 		"set_thread_name(name)\n\nWrapper for pthread_setname_np"
 	},

+	{"get_element_char_length", get_element_char_length, METH_VARARGS,
+		"get_element_char_length(tag_name, text, tail)\n\nGet the number of chars in specified tag"
+	},
+
    {NULL, NULL, 0, NULL}
 };