diff --git a/src/calibre/srv/render_book.py b/src/calibre/srv/render_book.py index 90890ac213..6f7762a894 100644 --- a/src/calibre/srv/render_book.py +++ b/src/calibre/srv/render_book.py @@ -20,7 +20,7 @@ from css_parser import replaceUrls from css_parser.css import CSSRule from calibre import detect_ncpus, force_unicode, prepare_string_for_xml -from calibre.constants import iswindows +from calibre.constants import iswindows, plugins from calibre.customize.ui import plugin_for_input_format from calibre.ebooks import parse_css_length from calibre.ebooks.css_transform_rules import StyleDeclaration @@ -57,6 +57,7 @@ from polyglot.urllib import quote, urlparse RENDER_VERSION = 1 BLANK_JPEG = b'\xff\xd8\xff\xdb\x00C\x00\x03\x02\x02\x02\x02\x02\x03\x02\x02\x02\x03\x03\x03\x03\x04\x06\x04\x04\x04\x04\x04\x08\x06\x06\x05\x06\t\x08\n\n\t\x08\t\t\n\x0c\x0f\x0c\n\x0b\x0e\x0b\t\t\r\x11\r\x0e\x0f\x10\x10\x11\x10\n\x0c\x12\x13\x12\x10\x13\x0f\x10\x10\x10\xff\xc9\x00\x0b\x08\x00\x01\x00\x01\x01\x01\x11\x00\xff\xcc\x00\x06\x00\x10\x10\x05\xff\xda\x00\x08\x01\x01\x00\x00?\x00\xd2\xcf \xff\xd9' # noqa +speedup = plugins['speedup'][0] def XPath(expr): @@ -192,22 +193,29 @@ def anchor_map(root): def get_length(root): - strip_space = re.compile(r'\s+') ans = 0 - ignore_tags = frozenset('script style title noscript'.split()) - def count(elem): - num = 0 - tname = elem.tag.rpartition('}')[-1].lower() - if elem.text and tname not in ignore_tags: - num += len(strip_space.sub('', elem.text)) - if elem.tail: - num += len(strip_space.sub('', elem.tail)) - if tname in 'img svg': - num += 1000 - return num + fast = getattr(speedup, 'get_element_char_length', None) + if fast is None: + ignore_tags = frozenset('script style title noscript'.split()) + img_tags = ('img', 'svg') + strip_space = re.compile(r'\s+') - for body in root.iterdescendants(XHTML('body')): + def count(elem): + num = 0 + tname = elem.tag.rpartition('}')[-1].lower() + if elem.text and tname not in ignore_tags: + num += len(strip_space.sub('', elem.text)) + if elem.tail: + num += len(strip_space.sub('', elem.tail)) + if tname in img_tags: + num += 1000 + return num + else: + def count(elem): + return fast(elem.tag, elem.text, elem.tail) + + for body in root.iterchildren(XHTML('body')): ans += count(body) for elem in body.iterdescendants('*'): ans += count(elem) diff --git a/src/calibre/srv/tests/content.py b/src/calibre/srv/tests/content.py index b36bc37cd5..9cc3122519 100644 --- a/src/calibre/srv/tests/content.py +++ b/src/calibre/srv/tests/content.py @@ -225,3 +225,13 @@ class ContentTest(LibraryBaseTest): self.ae(zlib.decompress(raw, 16+zlib.MAX_WBITS), data) # }}} + + def test_char_count(self): # {{{ + from calibre.srv.render_book import get_length + from calibre.ebooks.oeb.parse_utils import html5_parse + + root = html5_parse('

a b\nc\td\re') + self.ae(get_length(root), 5) + root = html5_parse('ab') + self.ae(get_length(root), 1002) + # }}} diff --git a/src/calibre/utils/speedup.c b/src/calibre/utils/speedup.c index 86767d5aa0..ca990150d1 100644 --- a/src/calibre/utils/speedup.c +++ b/src/calibre/utils/speedup.c @@ -543,6 +543,66 @@ set_thread_name(PyObject *self, PyObject *args) { #endif } +#define char_is_ignored(ch) (ch <= 32) + +#if PY_MAJOR_VERSION > 2 +static size_t +count_chars_in(PyObject *text) { + size_t ans = 0; + if (PyUnicode_READY(text) != 0) return 0; + int kind = PyUnicode_KIND(text); + void *data = PyUnicode_DATA(text); + Py_ssize_t len = PyUnicode_GET_LENGTH(text); + ans = len; + for (Py_ssize_t i = 0; i < len; i++) { + if (char_is_ignored(PyUnicode_READ(kind, data, i))) ans--; + } + return ans; +} +#else +static size_t +count_chars_in(PyObject *text) { + size_t ans = 0; +#define L(data, sz) { \ + ans = sz; \ + for (Py_ssize_t i = 0; i < sz; i++) { if (char_is_ignored((data)[i])) ans--; } \ +} + if (PyUnicode_Check(text)) { + L(PyUnicode_AS_UNICODE(text), PyUnicode_GET_SIZE(text)); + } else { + L(PyBytes_AS_STRING(text), PyBytes_GET_SIZE(text)); + } + return ans; +#undef L +} +#endif + +static PyObject* +get_element_char_length(PyObject *self, PyObject *args) { + (void)(self); + const char *tag_name; + PyObject *text, *tail; + if (!PyArg_ParseTuple(args, "sOO", &tag_name, &text, &tail)) return NULL; + const char *b = strrchr(tag_name, '}'); + if (b) tag_name = b + 1; + char ltagname[16]; + const size_t tag_name_len = strnlen(tag_name, sizeof(ltagname)-1); + for (size_t i = 0; i < tag_name_len; i++) { + if ('A' <= tag_name[i] && tag_name[i] <= 'Z') ltagname[i] = 32 + tag_name[i]; + else ltagname[i] = tag_name[i]; + } + int is_ignored_tag = 0; + size_t ans = 0; +#define EQ(x) memcmp(ltagname, #x, sizeof(#x) - 1) == 0 + if (EQ(script) || EQ(noscript) || EQ(style) || EQ(title)) is_ignored_tag = 1; + if (EQ(img) || EQ(svg)) ans += 1000; +#undef EQ + if (tail != Py_None) ans += count_chars_in(tail); + if (text != Py_None && !is_ignored_tag) ans += count_chars_in(text); + return PyLong_FromSize_t(ans); +} + + static PyMethodDef speedup_methods[] = { {"parse_date", speedup_parse_date, METH_VARARGS, "parse_date()\n\nParse ISO dates faster (specialized for dates stored in the calibre db)." @@ -590,6 +650,10 @@ static PyMethodDef speedup_methods[] = { "set_thread_name(name)\n\nWrapper for pthread_setname_np" }, + {"get_element_char_length", get_element_char_length, METH_VARARGS, + "get_element_char_length(tag_name, text, tail)\n\nGet the number of chars in specified tag" + }, + {NULL, NULL, 0, NULL} };