Speed up char counting when preparing book

This commit is contained in:
Kovid Goyal 2019-10-27 12:38:01 +05:30
parent 8582154527
commit a687204ec3
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
3 changed files with 96 additions and 14 deletions

View File

@ -20,7 +20,7 @@ from css_parser import replaceUrls
from css_parser.css import CSSRule
from calibre import detect_ncpus, force_unicode, prepare_string_for_xml
from calibre.constants import iswindows
from calibre.constants import iswindows, plugins
from calibre.customize.ui import plugin_for_input_format
from calibre.ebooks import parse_css_length
from calibre.ebooks.css_transform_rules import StyleDeclaration
@ -57,6 +57,7 @@ from polyglot.urllib import quote, urlparse
RENDER_VERSION = 1
BLANK_JPEG = b'\xff\xd8\xff\xdb\x00C\x00\x03\x02\x02\x02\x02\x02\x03\x02\x02\x02\x03\x03\x03\x03\x04\x06\x04\x04\x04\x04\x04\x08\x06\x06\x05\x06\t\x08\n\n\t\x08\t\t\n\x0c\x0f\x0c\n\x0b\x0e\x0b\t\t\r\x11\r\x0e\x0f\x10\x10\x11\x10\n\x0c\x12\x13\x12\x10\x13\x0f\x10\x10\x10\xff\xc9\x00\x0b\x08\x00\x01\x00\x01\x01\x01\x11\x00\xff\xcc\x00\x06\x00\x10\x10\x05\xff\xda\x00\x08\x01\x01\x00\x00?\x00\xd2\xcf \xff\xd9' # noqa
speedup = plugins['speedup'][0]
def XPath(expr):
@ -192,22 +193,29 @@ def anchor_map(root):
def get_length(root):
strip_space = re.compile(r'\s+')
ans = 0
ignore_tags = frozenset('script style title noscript'.split())
def count(elem):
num = 0
tname = elem.tag.rpartition('}')[-1].lower()
if elem.text and tname not in ignore_tags:
num += len(strip_space.sub('', elem.text))
if elem.tail:
num += len(strip_space.sub('', elem.tail))
if tname in 'img svg':
num += 1000
return num
fast = getattr(speedup, 'get_element_char_length', None)
if fast is None:
ignore_tags = frozenset('script style title noscript'.split())
img_tags = ('img', 'svg')
strip_space = re.compile(r'\s+')
for body in root.iterdescendants(XHTML('body')):
def count(elem):
num = 0
tname = elem.tag.rpartition('}')[-1].lower()
if elem.text and tname not in ignore_tags:
num += len(strip_space.sub('', elem.text))
if elem.tail:
num += len(strip_space.sub('', elem.tail))
if tname in img_tags:
num += 1000
return num
else:
def count(elem):
return fast(elem.tag, elem.text, elem.tail)
for body in root.iterchildren(XHTML('body')):
ans += count(body)
for elem in body.iterdescendants('*'):
ans += count(elem)

View File

@ -225,3 +225,13 @@ class ContentTest(LibraryBaseTest):
self.ae(zlib.decompress(raw, 16+zlib.MAX_WBITS), data)
# }}}
def test_char_count(self):  # {{{
    """Check get_length() on small parsed HTML snippets.

    The first snippet holds five letters separated only by whitespace;
    the second mixes <script> text, body text and an <iMg> element.
    """
    from calibre.srv.render_book import get_length
    from calibre.ebooks.oeb.parse_utils import html5_parse
    # (markup, expected length) pairs, checked in order
    cases = (
        ('<p>a b\nc\td\re', 5),
        ('<script>xyz</script>a<iMg>b', 1002),
    )
    for markup, expected in cases:
        self.ae(get_length(html5_parse(markup)), expected)
# }}}

View File

@ -543,6 +543,66 @@ set_thread_name(PyObject *self, PyObject *args) {
#endif
}
/* Characters with a code point <= 32 (ASCII control characters and space)
 * are not counted. The argument is parenthesised so the macro stays correct
 * when it is handed a compound expression. */
#define char_is_ignored(ch) ((ch) <= 32)

#if PY_MAJOR_VERSION > 2
/*
 * Return the number of code points in the unicode object `text` for which
 * char_is_ignored() is false.
 *
 * Returns 0 when `text` is not a unicode object or when PyUnicode_READY()
 * reports failure.
 */
static size_t
count_chars_in(PyObject *text) {
    size_t ans = 0;
    /* The PyUnicode_* accessors below are only valid on unicode objects and
     * the caller forwards arbitrary Python objects, so check the type first. */
    if (!PyUnicode_Check(text)) return 0;
    if (PyUnicode_READY(text) != 0) return 0;
    int kind = PyUnicode_KIND(text);
    void *data = PyUnicode_DATA(text);
    Py_ssize_t len = PyUnicode_GET_LENGTH(text);
    /* Start from the full length and subtract every ignored character */
    ans = len;
    for (Py_ssize_t i = 0; i < len; i++) {
        if (char_is_ignored(PyUnicode_READ(kind, data, i))) ans--;
    }
    return ans;
}
#else
/*
 * Return the number of characters in `text` (a unicode or bytes object) for
 * which char_is_ignored() is false.
 *
 * Returns 0 when `text` is neither a unicode nor a bytes object.
 */
static size_t
count_chars_in(PyObject *text) {
    size_t ans = 0;
/* Count the non-ignored items in the buffer `data` holding `sz` items */
#define L(data, sz) { \
    ans = sz; \
    for (Py_ssize_t i = 0; i < sz; i++) { if (char_is_ignored((data)[i])) ans--; } \
}
    if (PyUnicode_Check(text)) {
        L(PyUnicode_AS_UNICODE(text), PyUnicode_GET_SIZE(text));
    } else if (PyBytes_Check(text)) {
        /* Read the bytes as unsigned: where plain char is signed, bytes
         * >= 0x80 would be negative and wrongly compare as <= 32. */
        L((const unsigned char*)PyBytes_AS_STRING(text), PyBytes_GET_SIZE(text));
    }
    return ans;
#undef L
}
#endif
/*
 * get_element_char_length(tag_name, text, tail)
 *
 * Return, as a Python int, the character count contributed by one element:
 *   - the non-ignored characters of `tail` (unless it is None),
 *   - the non-ignored characters of `text` (unless it is None or the tag is
 *     one of script, noscript, style, title),
 *   - plus 1000 when the tag is img or svg.
 *
 * The tag name is compared case-insensitively (ASCII only) after dropping
 * everything up to and including the last '}' in it.
 * Returns NULL with an exception set if the arguments cannot be parsed.
 */
static PyObject*
get_element_char_length(PyObject *self, PyObject *args) {
    (void)(self);
    const char *tag_name;
    PyObject *text, *tail;
    if (!PyArg_ParseTuple(args, "sOO", &tag_name, &text, &tail)) return NULL;
    /* Keep only the part of the name after the last '}' */
    const char *b = strrchr(tag_name, '}');
    if (b) tag_name = b + 1;
    /* Lower-cased copy of the tag name, truncated to fit the buffer. Every
     * name compared against below is much shorter than the buffer, so a
     * truncated name can never compare equal to one of them. */
    char ltagname[16];
    const size_t tag_name_len = strnlen(tag_name, sizeof(ltagname)-1);
    for (size_t i = 0; i < tag_name_len; i++) {
        if ('A' <= tag_name[i] && tag_name[i] <= 'Z') ltagname[i] = 32 + tag_name[i];
        else ltagname[i] = tag_name[i];
    }
    /* NUL-terminate so the comparisons below never read uninitialised bytes */
    ltagname[tag_name_len] = 0;
    int is_ignored_tag = 0;
    size_t ans = 0;
/* Whole-string comparison: a name that merely starts with x must not match */
#define EQ(x) (strcmp(ltagname, #x) == 0)
    if (EQ(script) || EQ(noscript) || EQ(style) || EQ(title)) is_ignored_tag = 1;
    if (EQ(img) || EQ(svg)) ans += 1000;
#undef EQ
    if (tail != Py_None) ans += count_chars_in(tail);
    if (text != Py_None && !is_ignored_tag) ans += count_chars_in(text);
    return PyLong_FromSize_t(ans);
}
static PyMethodDef speedup_methods[] = {
{"parse_date", speedup_parse_date, METH_VARARGS,
"parse_date()\n\nParse ISO dates faster (specialized for dates stored in the calibre db)."
@ -590,6 +650,10 @@ static PyMethodDef speedup_methods[] = {
"set_thread_name(name)\n\nWrapper for pthread_setname_np"
},
{"get_element_char_length", get_element_char_length, METH_VARARGS,
"get_element_char_length(tag_name, text, tail)\n\nGet the number of chars in specified tag"
},
{NULL, NULL, 0, NULL}
};