More work on fast entity conversion

This commit is contained in:
Kovid Goyal 2024-09-12 20:48:20 +05:30
parent 806b6657a3
commit aa3b5398e1
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
5 changed files with 5435 additions and 24 deletions

1
.gitattributes vendored
View File

@ -54,3 +54,4 @@ resources/content-server/index-generated.html linguist-generated=true
resources/editor.js linguist-generated=true resources/editor.js linguist-generated=true
resources/viewer.js linguist-generated=true resources/viewer.js linguist-generated=true
resources/viewer.html linguist-generated=true resources/viewer.html linguist-generated=true
src/calibre/ebooks/html_entities.h linguist-generated=true

View File

@ -140,6 +140,7 @@
}, },
{ {
"name": "fast_html_entities", "name": "fast_html_entities",
"headers": "calibre/ebooks/html_entities.h",
"sources": "calibre/ebooks/html_entities.c" "sources": "calibre/ebooks/html_entities.c"
}, },
{ {

View File

@ -10,6 +10,7 @@
#define _UNICODE #define _UNICODE
#include <Python.h> #include <Python.h>
#include <stdbool.h> #include <stdbool.h>
#include "html_entities.h"
unsigned int unsigned int
encode_utf8(uint32_t ch, char* dest) { encode_utf8(uint32_t ch, char* dest) {
@ -38,11 +39,41 @@ encode_utf8(uint32_t ch, char* dest) {
return 0; return 0;
} }
/*
 * Parse exactly `sz` bytes starting at `input` as an unsigned decimal number.
 * The input does not need to be NUL terminated.
 *
 * On success *ok is set to true and the parsed value is returned. An empty
 * input (sz == 0) is reported as success with a value of 0.
 * On failure (a byte outside '0'..'9', or a value that does not fit in
 * 32 bits) *ok is set to false and 0 is returned.
 */
static uint32_t
parse_base10_integer(const char *input, size_t sz, bool *ok) {
    uint32_t ans = 0;
    *ok = true;
    for (size_t i = 0; i < sz; i++) {
        char ch = input[i];
        if (ch < '0' || ch > '9') { *ok = false; return 0; }
        uint32_t digit = (uint32_t)(ch - '0');
        // Refuse to wrap around: ans * 10 + digit must stay <= UINT32_MAX,
        // otherwise an over-long number would alias a small valid one.
        if (ans > (UINT32_MAX - digit) / 10) { *ok = false; return 0; }
        ans = ans * 10 + digit;
    }
    return ans;
}
/*
 * Parse exactly `sz` bytes starting at `input` as an unsigned hexadecimal
 * number (digits 0-9, a-f, A-F; no "0x" prefix). The input does not need to
 * be NUL terminated.
 *
 * On success *ok is set to true and the parsed value is returned. An empty
 * input (sz == 0) is reported as success with a value of 0.
 * On failure (a non-hex byte, or a value that does not fit in 32 bits)
 * *ok is set to false and 0 is returned.
 */
static uint32_t
parse_base16_integer(const char *input, size_t sz, bool *ok) {
    uint32_t ans = 0;
    *ok = true;
    for (size_t i = 0; i < sz; i++) {
        char ch = input[i];
        uint32_t digit;
        if ('a' <= ch && ch <= 'f') digit = 10 + (uint32_t)(ch - 'a');
        else if ('A' <= ch && ch <= 'F') digit = 10 + (uint32_t)(ch - 'A');
        else {
            if (ch < '0' || ch > '9') { *ok = false; return 0; }
            digit = (uint32_t)(ch - '0');
        }
        // Refuse to wrap around: shifting in another hex digit must not
        // push any set bit out of the top of the 32 bit accumulator.
        if (ans > (UINT32_MAX >> 4)) { *ok = false; return 0; }
        // Each hex digit is worth a factor of 16, not 10.
        ans = ans * 16 + digit;
    }
    return ans;
}
static size_t static size_t
add_entity(const char *entity, size_t elen, char *output) { add_entity(const char *entity, const size_t elen, char *output) {
size_t ans = 0; size_t ans = 0;
char e[64]; if (elen > 64) {
if (elen > sizeof(e) - 1) {
bad_entity: bad_entity:
output[ans++] = '&'; output[ans++] = '&';
memcpy(output + ans, entity, elen); memcpy(output + ans, entity, elen);
@ -55,25 +86,27 @@ bad_entity:
output[ans++] = ';'; output[ans++] = ';';
return ans; return ans;
} }
memcpy(e, entity, elen); if (entity[0] == '#') {
e[elen] = 0;
if (e[0] == '#') {
if (elen < 2) goto bad_entity; if (elen < 2) goto bad_entity;
char *end; uint32_t codepoint = 0;
unsigned long codepoint = ULONG_MAX; bool ok;
if (e[1] == 'x' || e[1] == 'X') { if (entity[1] == 'x' || entity[1] == 'X') {
errno = 0; if (elen < 3) goto bad_entity;
codepoint = strtoul(e + 2, &end, 16); codepoint = parse_base16_integer(entity + 2, elen - 2, &ok);
if (errno || *end) goto bad_entity; if (!ok || !codepoint) goto bad_entity;
} else { } else {
errno = 0; codepoint = parse_base10_integer(entity + 1, elen - 1, &ok);
codepoint = strtoul(e + 1, &end, 10); if (!ok || !codepoint) goto bad_entity;
if (errno || *end) goto bad_entity;
} }
unsigned num = encode_utf8(codepoint, output); unsigned num = encode_utf8(codepoint, output);
if (!num) goto bad_entity; if (!num) goto bad_entity;
return num; return num;
} else { } else {
struct html_entity *s = in_word_set(entity, elen);
if (!s) goto bad_entity;
ans = strlen(s->val);
memcpy(output, s->val, ans);
return ans;
} }
goto bad_entity; goto bad_entity;
} }
@ -136,9 +169,9 @@ replace_entities(PyObject *self, PyObject *const *args, Py_ssize_t nargs) {
size_t output_sz = replace(input, input_sz, output, keep_xml_entities); size_t output_sz = replace(input, input_sz, output, keep_xml_entities);
PyObject *retval; PyObject *retval;
if (PyErr_Occurred()) retval = NULL; if (PyErr_Occurred()) retval = NULL;
if (!output_sz) retval = Py_NewRef(args[0]); else if (!output_sz) retval = Py_NewRef(args[0]);
if (PyUnicode_Check(args[0])) retval = PyUnicode_FromStringAndSize(output, output_sz); else if (PyUnicode_Check(args[0])) retval = PyUnicode_FromStringAndSize(output, output_sz);
retval = PyBytes_FromStringAndSize(output, output_sz); else retval = PyBytes_FromStringAndSize(output, output_sz);
free(output); free(output);
return retval; return retval;
} }

5351
src/calibre/ebooks/html_entities.h generated Normal file

File diff suppressed because it is too large Load Diff

View File

@ -1105,6 +1105,7 @@ html5_entities = {
'hearts': '', 'hearts': '',
'heartsuit': '', 'heartsuit': '',
'hellip': '', 'hellip': '',
'hellips': '',
'hercon': '', 'hercon': '',
'hfr': '𝔥', 'hfr': '𝔥',
'hksearow': '', 'hksearow': '',
@ -1857,6 +1858,7 @@ html5_entities = {
'square': '', 'square': '',
'squarf': '', 'squarf': '',
'squf': '', 'squf': '',
'squot': "'",
'srarr': '', 'srarr': '',
'sscr': '𝓈', 'sscr': '𝓈',
'ssetmn': '', 'ssetmn': '',
@ -2135,16 +2137,39 @@ html5_entities = {
def generate_entity_lists(): def generate_entity_lists():
import re import re
from html import entities from html import entities as e
entities = {k.rstrip(';'): entities.html5[k] for k in entities.html5} entities = {k.rstrip(';'): e.name2codepoint[k] for k in e.name2codepoint}
entities.update({k.rstrip(';'): e.html5[k] for k in e.html5})
# common misspelled entity names
for k, v in {'apos': "'", 'squot': "'", 'hellips': entities['hellip']}.items():
if k not in entities:
entities[k] = v
lines = [] lines = []
native_lines = '''\
struct html_entity { const char *name, *val; }
%%
'''.splitlines()
def esc_for_c(x):
if x == '\n':
return '\\n'
if x in '''"\\''':
return '\\' + x
return x
for k in sorted(entities): for k in sorted(entities):
lines.append(f" '{k}': {entities[k]!r},") v = entities[k]
lines.append(f" '{k}': {v!r},")
native_lines.append(f'"{esc_for_c(k)}","{esc_for_c(v)}"')
with open(__file__, 'r+b') as f: with open(__file__, 'r+b') as f:
raw = f.read().decode('utf-8') raw = f.read().decode('utf-8')
pat = re.compile(r'^# ENTITY_DATA {{{.+^# }}}', flags=re.M | re.DOTALL) pat = re.compile(r'^# ENTITY_DATA {{{.+?^# }}}', flags=re.M | re.DOTALL)
raw = pat.sub(lambda m: '# ENTITY_DATA {{{\n' + '\n'.join(lines) + '\n# }}}', raw) raw = pat.sub(lambda m: '# ENTITY_DATA {{{\n' + '\n'.join(lines) + '\n# }}}', raw)
f.seek(0), f.truncate() f.seek(0), f.truncate(), f.write(raw.encode('utf-8'))
f.write(raw.encode('utf-8'))
import subprocess
with open(__file__.replace('.py', '.h'), 'wb') as f:
cp = subprocess.run(['gperf', '-t'], input='\n'.join(native_lines).encode(), stdout=f)
if cp.returncode != 0:
raise SystemExit(cp.returncode)