mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Speedup char counting when preparing book
This commit is contained in:
parent
8582154527
commit
a687204ec3
@ -20,7 +20,7 @@ from css_parser import replaceUrls
|
||||
from css_parser.css import CSSRule
|
||||
|
||||
from calibre import detect_ncpus, force_unicode, prepare_string_for_xml
|
||||
from calibre.constants import iswindows
|
||||
from calibre.constants import iswindows, plugins
|
||||
from calibre.customize.ui import plugin_for_input_format
|
||||
from calibre.ebooks import parse_css_length
|
||||
from calibre.ebooks.css_transform_rules import StyleDeclaration
|
||||
@ -57,6 +57,7 @@ from polyglot.urllib import quote, urlparse
|
||||
RENDER_VERSION = 1
|
||||
|
||||
BLANK_JPEG = b'\xff\xd8\xff\xdb\x00C\x00\x03\x02\x02\x02\x02\x02\x03\x02\x02\x02\x03\x03\x03\x03\x04\x06\x04\x04\x04\x04\x04\x08\x06\x06\x05\x06\t\x08\n\n\t\x08\t\t\n\x0c\x0f\x0c\n\x0b\x0e\x0b\t\t\r\x11\r\x0e\x0f\x10\x10\x11\x10\n\x0c\x12\x13\x12\x10\x13\x0f\x10\x10\x10\xff\xc9\x00\x0b\x08\x00\x01\x00\x01\x01\x01\x11\x00\xff\xcc\x00\x06\x00\x10\x10\x05\xff\xda\x00\x08\x01\x01\x00\x00?\x00\xd2\xcf \xff\xd9' # noqa
|
||||
speedup = plugins['speedup'][0]
|
||||
|
||||
|
||||
def XPath(expr):
|
||||
@ -192,22 +193,29 @@ def anchor_map(root):
|
||||
|
||||
|
||||
def get_length(root):
|
||||
strip_space = re.compile(r'\s+')
|
||||
ans = 0
|
||||
ignore_tags = frozenset('script style title noscript'.split())
|
||||
|
||||
def count(elem):
|
||||
num = 0
|
||||
tname = elem.tag.rpartition('}')[-1].lower()
|
||||
if elem.text and tname not in ignore_tags:
|
||||
num += len(strip_space.sub('', elem.text))
|
||||
if elem.tail:
|
||||
num += len(strip_space.sub('', elem.tail))
|
||||
if tname in 'img svg':
|
||||
num += 1000
|
||||
return num
|
||||
fast = getattr(speedup, 'get_element_char_length', None)
|
||||
if fast is None:
|
||||
ignore_tags = frozenset('script style title noscript'.split())
|
||||
img_tags = ('img', 'svg')
|
||||
strip_space = re.compile(r'\s+')
|
||||
|
||||
for body in root.iterdescendants(XHTML('body')):
|
||||
def count(elem):
|
||||
num = 0
|
||||
tname = elem.tag.rpartition('}')[-1].lower()
|
||||
if elem.text and tname not in ignore_tags:
|
||||
num += len(strip_space.sub('', elem.text))
|
||||
if elem.tail:
|
||||
num += len(strip_space.sub('', elem.tail))
|
||||
if tname in img_tags:
|
||||
num += 1000
|
||||
return num
|
||||
else:
|
||||
def count(elem):
|
||||
return fast(elem.tag, elem.text, elem.tail)
|
||||
|
||||
for body in root.iterchildren(XHTML('body')):
|
||||
ans += count(body)
|
||||
for elem in body.iterdescendants('*'):
|
||||
ans += count(elem)
|
||||
|
@ -225,3 +225,13 @@ class ContentTest(LibraryBaseTest):
|
||||
self.ae(zlib.decompress(raw, 16+zlib.MAX_WBITS), data)
|
||||
|
||||
# }}}
|
||||
|
||||
def test_char_count(self):  # {{{
    '''get_length() must skip whitespace and ignored tags, and weight images.'''
    from calibre.srv.render_book import get_length
    from calibre.ebooks.oeb.parse_utils import html5_parse

    cases = (
        # Only the five letters count; space, \n, \t and \r are skipped.
        ('<p>a b\nc\td\re', 5),
        # <script> text is ignored, the image counts as 1000, plus "a" and "b".
        ('<script>xyz</script>a<iMg>b', 1002),
    )
    for markup, expected in cases:
        self.ae(get_length(html5_parse(markup)), expected)
# }}}
|
||||
|
@ -543,6 +543,66 @@ set_thread_name(PyObject *self, PyObject *args) {
|
||||
#endif
|
||||
}
|
||||
|
||||
/* True for code points that are not counted as characters: the ASCII space
 * (32) and every value below it (tab, newlines, other control characters).
 * The argument is parenthesised so that any expression can be passed safely. */
#define char_is_ignored(ch) ((ch) <= 32)
|
||||
|
||||
#if PY_MAJOR_VERSION > 2
|
||||
static size_t
|
||||
count_chars_in(PyObject *text) {
|
||||
size_t ans = 0;
|
||||
if (PyUnicode_READY(text) != 0) return 0;
|
||||
int kind = PyUnicode_KIND(text);
|
||||
void *data = PyUnicode_DATA(text);
|
||||
Py_ssize_t len = PyUnicode_GET_LENGTH(text);
|
||||
ans = len;
|
||||
for (Py_ssize_t i = 0; i < len; i++) {
|
||||
if (char_is_ignored(PyUnicode_READ(kind, data, i))) ans--;
|
||||
}
|
||||
return ans;
|
||||
}
|
||||
#else
|
||||
static size_t
|
||||
count_chars_in(PyObject *text) {
|
||||
size_t ans = 0;
|
||||
#define L(data, sz) { \
|
||||
ans = sz; \
|
||||
for (Py_ssize_t i = 0; i < sz; i++) { if (char_is_ignored((data)[i])) ans--; } \
|
||||
}
|
||||
if (PyUnicode_Check(text)) {
|
||||
L(PyUnicode_AS_UNICODE(text), PyUnicode_GET_SIZE(text));
|
||||
} else {
|
||||
L(PyBytes_AS_STRING(text), PyBytes_GET_SIZE(text));
|
||||
}
|
||||
return ans;
|
||||
#undef L
|
||||
}
|
||||
#endif
|
||||
|
||||
/* get_element_char_length(tag_name, text, tail)
 *
 * Return, as a Python int, the length contribution of one element:
 *   - the counted characters of tail (always, unless it is None),
 *   - the counted characters of text, unless it is None or the tag is one of
 *     script, noscript, style or title,
 *   - plus 1000 if the tag is img or svg.
 * The tag name is compared case-insensitively (ASCII only) after removing any
 * "{namespace}" prefix. Returns NULL with an exception set if the arguments
 * cannot be parsed or if counting left an exception pending. */
static PyObject*
get_element_char_length(PyObject *self, PyObject *args) {
    (void)(self);
    const char *tag_name;
    PyObject *text, *tail;
    if (!PyArg_ParseTuple(args, "sOO", &tag_name, &text, &tail)) return NULL;

    /* Keep only the local name: everything after the last '}' */
    const char *b = strrchr(tag_name, '}');
    if (b) tag_name = b + 1;

    /* Lower-cased, NUL-terminated copy of the tag name. Names longer than the
     * buffer are truncated; since every name tested below is much shorter
     * than the buffer, a truncated name can never compare equal to one. */
    char ltagname[16];
    size_t n = 0;
    for (; n < sizeof(ltagname) - 1 && tag_name[n]; n++) {
        const char c = tag_name[n];
        ltagname[n] = ('A' <= c && c <= 'Z') ? (char)(c + ('a' - 'A')) : c;
    }
    ltagname[n] = 0;

    int is_ignored_tag = 0;
    size_t ans = 0;
    /* Exact comparison: a prefix test would also match tags such as
     * "titles" or "svgfoo" and would read past the copied characters. */
#define EQ(x) (strcmp(ltagname, #x) == 0)
    if (EQ(script) || EQ(noscript) || EQ(style) || EQ(title)) is_ignored_tag = 1;
    if (EQ(img) || EQ(svg)) ans += 1000;
#undef EQ

    if (tail != Py_None) ans += count_chars_in(tail);
    if (text != Py_None && !is_ignored_tag) ans += count_chars_in(text);
    /* count_chars_in() cannot report failure through its return value, so
     * propagate any exception it left pending instead of returning a result. */
    if (PyErr_Occurred()) return NULL;
    return PyLong_FromSize_t(ans);
}
|
||||
|
||||
|
||||
static PyMethodDef speedup_methods[] = {
|
||||
{"parse_date", speedup_parse_date, METH_VARARGS,
|
||||
"parse_date()\n\nParse ISO dates faster (specialized for dates stored in the calibre db)."
|
||||
@ -590,6 +650,10 @@ static PyMethodDef speedup_methods[] = {
|
||||
"set_thread_name(name)\n\nWrapper for pthread_setname_np"
|
||||
},
|
||||
|
||||
{"get_element_char_length", get_element_char_length, METH_VARARGS,
|
||||
"get_element_char_length(tag_name, text, tail)\n\nGet the number of chars in specified tag"
|
||||
},
|
||||
|
||||
{NULL, NULL, 0, NULL}
|
||||
};
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user