diff --git a/src/calibre/srv/render_book.py b/src/calibre/srv/render_book.py index 90890ac213..6f7762a894 100644 --- a/src/calibre/srv/render_book.py +++ b/src/calibre/srv/render_book.py @@ -20,7 +20,7 @@ from css_parser import replaceUrls from css_parser.css import CSSRule from calibre import detect_ncpus, force_unicode, prepare_string_for_xml -from calibre.constants import iswindows +from calibre.constants import iswindows, plugins from calibre.customize.ui import plugin_for_input_format from calibre.ebooks import parse_css_length from calibre.ebooks.css_transform_rules import StyleDeclaration @@ -57,6 +57,7 @@ from polyglot.urllib import quote, urlparse RENDER_VERSION = 1 BLANK_JPEG = b'\xff\xd8\xff\xdb\x00C\x00\x03\x02\x02\x02\x02\x02\x03\x02\x02\x02\x03\x03\x03\x03\x04\x06\x04\x04\x04\x04\x04\x08\x06\x06\x05\x06\t\x08\n\n\t\x08\t\t\n\x0c\x0f\x0c\n\x0b\x0e\x0b\t\t\r\x11\r\x0e\x0f\x10\x10\x11\x10\n\x0c\x12\x13\x12\x10\x13\x0f\x10\x10\x10\xff\xc9\x00\x0b\x08\x00\x01\x00\x01\x01\x01\x11\x00\xff\xcc\x00\x06\x00\x10\x10\x05\xff\xda\x00\x08\x01\x01\x00\x00?\x00\xd2\xcf \xff\xd9' # noqa +speedup = plugins['speedup'][0] def XPath(expr): @@ -192,22 +193,29 @@ def anchor_map(root): def get_length(root): - strip_space = re.compile(r'\s+') ans = 0 - ignore_tags = frozenset('script style title noscript'.split()) - def count(elem): - num = 0 - tname = elem.tag.rpartition('}')[-1].lower() - if elem.text and tname not in ignore_tags: - num += len(strip_space.sub('', elem.text)) - if elem.tail: - num += len(strip_space.sub('', elem.tail)) - if tname in 'img svg': - num += 1000 - return num + fast = getattr(speedup, 'get_element_char_length', None) + if fast is None: + ignore_tags = frozenset('script style title noscript'.split()) + img_tags = ('img', 'svg') + strip_space = re.compile(r'\s+') - for body in root.iterdescendants(XHTML('body')): + def count(elem): + num = 0 + tname = elem.tag.rpartition('}')[-1].lower() + if elem.text and tname not in ignore_tags: + num += 
len(strip_space.sub('', elem.text)) + if elem.tail: + num += len(strip_space.sub('', elem.tail)) + if tname in img_tags: + num += 1000 + return num + else: + def count(elem): + return fast(elem.tag, elem.text, elem.tail) + + for body in root.iterchildren(XHTML('body')): ans += count(body) for elem in body.iterdescendants('*'): ans += count(elem) diff --git a/src/calibre/srv/tests/content.py b/src/calibre/srv/tests/content.py index b36bc37cd5..9cc3122519 100644 --- a/src/calibre/srv/tests/content.py +++ b/src/calibre/srv/tests/content.py @@ -225,3 +225,13 @@ class ContentTest(LibraryBaseTest): self.ae(zlib.decompress(raw, 16+zlib.MAX_WBITS), data) # }}} + + def test_char_count(self): # {{{ + from calibre.srv.render_book import get_length + from calibre.ebooks.oeb.parse_utils import html5_parse + + root = html5_parse('
a b\nc\td\re')
+ self.ae(get_length(root), 5)
+ root = html5_parse('ab')
+ self.ae(get_length(root), 1002)
+ # }}}
diff --git a/src/calibre/utils/speedup.c b/src/calibre/utils/speedup.c
index 86767d5aa0..ca990150d1 100644
--- a/src/calibre/utils/speedup.c
+++ b/src/calibre/utils/speedup.c
@@ -543,6 +543,66 @@ set_thread_name(PyObject *self, PyObject *args) {
#endif
}
+/*
+ * Whitespace test used by count_chars_in(): a code unit is skipped when its
+ * value is 32 (ASCII space) or lower. The argument is parenthesized and
+ * converted to an unsigned type before the comparison, so a plain `char`
+ * holding a byte >= 0x80 (negative where `char` is signed) is not mistaken
+ * for whitespace.
+ *
+ * NOTE: this is an approximation of the regex `\s` used by the pure-Python
+ * fallback in get_length(): control characters below 32 are skipped too, and
+ * whitespace above 32 (for example U+00A0) is counted.
+ */
+#define char_is_ignored(ch) (((unsigned long)(ch)) <= 32UL)
+
+#if PY_MAJOR_VERSION > 2
+/*
+ * Count the code points in `text` that are not skipped by char_is_ignored().
+ *
+ * text: expected to be a str object; any other object yields 0 instead of
+ *       being handed to the unchecked PyUnicode_* accessor macros.
+ *
+ * Returns the number of counted code points. Also returns 0 when
+ * PyUnicode_READY() fails; in that case the Python exception it raised is
+ * left set, so the caller should check PyErr_Occurred().
+ */
+static size_t
+count_chars_in(PyObject *text) {
+    if (!PyUnicode_Check(text)) return 0;
+    if (PyUnicode_READY(text) != 0) return 0;
+    int kind = PyUnicode_KIND(text);
+    void *data = PyUnicode_DATA(text);
+    Py_ssize_t len = PyUnicode_GET_LENGTH(text);
+    size_t ans = 0;
+    for (Py_ssize_t i = 0; i < len; i++) {
+        if (!char_is_ignored(PyUnicode_READ(kind, data, i))) ans++;
+    }
+    return ans;
+}
+#else
+/*
+ * Count the code units in `text` that are not skipped by char_is_ignored().
+ *
+ * text: a unicode object (Py_UNICODE units are examined) or a bytes/str
+ *       object (individual bytes are examined). Any other object yields 0
+ *       instead of being read through the unchecked PyBytes_* macros.
+ */
+static size_t
+count_chars_in(PyObject *text) {
+    size_t ans = 0;
+/* Walk `sz` items of type `type` starting at `data`; both arguments are
+ * evaluated once, before the loop. */
+#define COUNT(type, data, sz) { \
+    const type *items = (data); \
+    const Py_ssize_t num = (sz); \
+    for (Py_ssize_t i = 0; i < num; i++) { if (!char_is_ignored(items[i])) ans++; } \
+}
+    if (PyUnicode_Check(text)) {
+        COUNT(Py_UNICODE, PyUnicode_AS_UNICODE(text), PyUnicode_GET_SIZE(text));
+    } else if (PyBytes_Check(text)) {
+        COUNT(char, PyBytes_AS_STRING(text), PyBytes_GET_SIZE(text));
+    }
+    return ans;
+#undef COUNT
+}
+#endif
+
+/*
+ * Return non-zero when `obj` is a text object that count_chars_in() accepts:
+ * str on Python 3, unicode or bytes/str on Python 2.
+ */
+static int
+char_length_arg_is_text(PyObject *obj) {
+#if PY_MAJOR_VERSION > 2
+    return PyUnicode_Check(obj);
+#else
+    return PyUnicode_Check(obj) || PyBytes_Check(obj);
+#endif
+}
+
+/*
+ * get_element_char_length(tag_name, text, tail) -> int
+ *
+ * Compute the "length" contributed by a single element:
+ *   - 1000 if the tag is img or svg,
+ *   - plus the counted characters of `tail`,
+ *   - plus the counted characters of `text`, unless the tag is one of
+ *     script, noscript, style or title.
+ *
+ * tag_name: element tag, optionally in "{namespace}name" form; only the part
+ *           after the last '}' is used and it is compared case-insensitively
+ *           (ASCII only).
+ * text, tail: None or a string object.
+ *
+ * Raises TypeError when `text` or `tail` is neither None nor a string, and
+ * propagates any exception left set while counting.
+ */
+static PyObject*
+get_element_char_length(PyObject *self, PyObject *args) {
+    (void)(self);
+    const char *tag_name;
+    PyObject *text, *tail;
+    if (!PyArg_ParseTuple(args, "sOO", &tag_name, &text, &tail)) return NULL;
+    /* count_chars_in() uses unchecked accessors, so validate the types here. */
+    if (text != Py_None && !char_length_arg_is_text(text)) {
+        PyErr_SetString(PyExc_TypeError, "text must be a string or None");
+        return NULL;
+    }
+    if (tail != Py_None && !char_length_arg_is_text(tail)) {
+        PyErr_SetString(PyExc_TypeError, "tail must be a string or None");
+        return NULL;
+    }
+
+    /* Strip a leading "{namespace}" prefix, if any. */
+    const char *b = strrchr(tag_name, '}');
+    if (b) tag_name = b + 1;
+
+    /* Lower-cased copy of the local name, truncated to fit the buffer. The
+     * longest name compared against below ("noscript") is far shorter than
+     * the buffer, so a truncated name can never compare equal to one. */
+    char ltagname[16];
+    const size_t tag_name_len = strnlen(tag_name, sizeof(ltagname)-1);
+    for (size_t i = 0; i < tag_name_len; i++) {
+        if ('A' <= tag_name[i] && tag_name[i] <= 'Z') ltagname[i] = tag_name[i] - 'A' + 'a';
+        else ltagname[i] = tag_name[i];
+    }
+    /* Terminate the copy: without this, comparisons would read stale stack
+     * bytes past the name (possibly the previous call's tag). */
+    ltagname[tag_name_len] = 0;
+
+    int is_ignored_tag = 0;
+    size_t ans = 0;
+    /* Exact comparison, matching the pure-Python fallback; a prefix match
+     * would treat e.g. "scripts" as "script". */
+#define EQ(x) (strcmp(ltagname, #x) == 0)
+    if (EQ(script) || EQ(noscript) || EQ(style) || EQ(title)) is_ignored_tag = 1;
+    if (EQ(img) || EQ(svg)) ans += 1000;
+#undef EQ
+    if (tail != Py_None) ans += count_chars_in(tail);
+    if (text != Py_None && !is_ignored_tag) ans += count_chars_in(text);
+    /* count_chars_in() cannot report errors through its return value. */
+    if (PyErr_Occurred()) return NULL;
+    return PyLong_FromSize_t(ans);
+}
+
+
static PyMethodDef speedup_methods[] = {
{"parse_date", speedup_parse_date, METH_VARARGS,
"parse_date()\n\nParse ISO dates faster (specialized for dates stored in the calibre db)."
@@ -590,6 +650,10 @@ static PyMethodDef speedup_methods[] = {
"set_thread_name(name)\n\nWrapper for pthread_setname_np"
},
+ {"get_element_char_length", get_element_char_length, METH_VARARGS,
+ "get_element_char_length(tag_name, text, tail)\n\nGet the number of chars in specified tag"
+ },
+
{NULL, NULL, 0, NULL}
};