More work on fast entity conversion

This commit is contained in:
Kovid Goyal 2024-09-12 20:48:20 +05:30
parent 806b6657a3
commit aa3b5398e1
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
5 changed files with 5435 additions and 24 deletions

1
.gitattributes vendored
View File

@ -54,3 +54,4 @@ resources/content-server/index-generated.html linguist-generated=true
resources/editor.js linguist-generated=true
resources/viewer.js linguist-generated=true
resources/viewer.html linguist-generated=true
src/calibre/ebooks/html_entities.h linguist-generated=true

View File

@ -140,6 +140,7 @@
},
{
"name": "fast_html_entities",
"headers": "calibre/ebooks/html_entities.h",
"sources": "calibre/ebooks/html_entities.c"
},
{

View File

@ -10,6 +10,7 @@
#define _UNICODE
#include <Python.h>
#include <stdbool.h>
#include "html_entities.h"
unsigned int
encode_utf8(uint32_t ch, char* dest) {
@ -38,11 +39,41 @@ encode_utf8(uint32_t ch, char* dest) {
return 0;
}
// Parse the first sz bytes of input as an unsigned decimal integer.
//
// input need not be NUL-terminated; exactly sz bytes are examined.
// *ok is set to false (and 0 is returned) when a byte is not an ASCII digit
// or when the value does not fit in a uint32_t. Otherwise *ok is set to true
// and the parsed value is returned. An empty input (sz == 0) yields 0 with
// *ok == true.
static uint32_t
parse_base10_integer(const char *input, size_t sz, bool *ok) {
    uint32_t ans = 0;
    *ok = true;
    for (size_t i = 0; i < sz; i++) {
        const char ch = input[i];
        if (ch < '0' || ch > '9') { *ok = false; return 0; }
        const uint32_t digit = (uint32_t)(ch - '0');
        // Unsigned arithmetic wraps silently, which would turn an absurdly
        // large number into some unrelated small value. Report it as a
        // failure instead.
        if (ans > (UINT32_MAX - digit) / 10) { *ok = false; return 0; }
        ans = ans * 10 + digit;
    }
    return ans;
}
// Parse the first sz bytes of input as an unsigned hexadecimal integer.
//
// input need not be NUL-terminated; exactly sz bytes are examined and both
// upper and lower case digits are accepted. No "0x" prefix is expected.
// *ok is set to false (and 0 is returned) when a byte is not a hex digit or
// when the value does not fit in a uint32_t. Otherwise *ok is set to true and
// the parsed value is returned. An empty input (sz == 0) yields 0 with
// *ok == true.
static uint32_t
parse_base16_integer(const char *input, size_t sz, bool *ok) {
    uint32_t ans = 0;
    *ok = true;
    for (size_t i = 0; i < sz; i++) {
        const char ch = input[i];
        uint32_t digit;
        if ('0' <= ch && ch <= '9') digit = (uint32_t)(ch - '0');
        else if ('a' <= ch && ch <= 'f') digit = 10 + (uint32_t)(ch - 'a');
        else if ('A' <= ch && ch <= 'F') digit = 10 + (uint32_t)(ch - 'A');
        else { *ok = false; return 0; }
        // The top four bits must be clear before shifting in another digit,
        // otherwise the value would silently wrap around.
        if (ans > (UINT32_MAX >> 4)) { *ok = false; return 0; }
        // Radix is 16: each digit contributes four bits.
        ans = (ans << 4) | digit;
    }
    return ans;
}
static size_t
add_entity(const char *entity, size_t elen, char *output) {
add_entity(const char *entity, const size_t elen, char *output) {
size_t ans = 0;
char e[64];
if (elen > sizeof(e) - 1) {
if (elen > 64) {
bad_entity:
output[ans++] = '&';
memcpy(output + ans, entity, elen);
@ -55,25 +86,27 @@ bad_entity:
output[ans++] = ';';
return ans;
}
memcpy(e, entity, elen);
e[elen] = 0;
if (e[0] == '#') {
if (entity[0] == '#') {
if (elen < 2) goto bad_entity;
char *end;
unsigned long codepoint = ULONG_MAX;
if (e[1] == 'x' || e[1] == 'X') {
errno = 0;
codepoint = strtoul(e + 2, &end, 16);
if (errno || *end) goto bad_entity;
uint32_t codepoint = 0;
bool ok;
if (entity[1] == 'x' || entity[1] == 'X') {
if (elen < 3) goto bad_entity;
codepoint = parse_base16_integer(entity + 2, elen - 2, &ok);
if (!ok || !codepoint) goto bad_entity;
} else {
errno = 0;
codepoint = strtoul(e + 1, &end, 10);
if (errno || *end) goto bad_entity;
codepoint = parse_base10_integer(entity + 1, elen - 1, &ok);
if (!ok || !codepoint) goto bad_entity;
}
unsigned num = encode_utf8(codepoint, output);
if (!num) goto bad_entity;
return num;
} else {
struct html_entity *s = in_word_set(entity, elen);
if (!s) goto bad_entity;
ans = strlen(s->val);
memcpy(output, s->val, ans);
return ans;
}
goto bad_entity;
}
@ -136,9 +169,9 @@ replace_entities(PyObject *self, PyObject *const *args, Py_ssize_t nargs) {
size_t output_sz = replace(input, input_sz, output, keep_xml_entities);
PyObject *retval;
if (PyErr_Occurred()) retval = NULL;
if (!output_sz) retval = Py_NewRef(args[0]);
if (PyUnicode_Check(args[0])) retval = PyUnicode_FromStringAndSize(output, output_sz);
retval = PyBytes_FromStringAndSize(output, output_sz);
else if (!output_sz) retval = Py_NewRef(args[0]);
else if (PyUnicode_Check(args[0])) retval = PyUnicode_FromStringAndSize(output, output_sz);
else retval = PyBytes_FromStringAndSize(output, output_sz);
free(output);
return retval;
}

5351
src/calibre/ebooks/html_entities.h generated Normal file

File diff suppressed because it is too large Load Diff

View File

@ -1105,6 +1105,7 @@ html5_entities = {
'hearts': '',
'heartsuit': '',
'hellip': '',
'hellips': '',
'hercon': '',
'hfr': '𝔥',
'hksearow': '',
@ -1857,6 +1858,7 @@ html5_entities = {
'square': '',
'squarf': '',
'squf': '',
'squot': "'",
'srarr': '',
'sscr': '𝓈',
'ssetmn': '',
@ -2135,16 +2137,39 @@ html5_entities = {
def generate_entity_lists():
import re
from html import entities
entities = {k.rstrip(';'): entities.html5[k] for k in entities.html5}
from html import entities as e
entities = {k.rstrip(';'): e.name2codepoint[k] for k in e.name2codepoint}
entities.update({k.rstrip(';'): e.html5[k] for k in e.html5})
# common misspelled entity names
for k, v in {'apos': "'", 'squot': "'", 'hellips': entities['hellip']}.items():
if k not in entities:
entities[k] = v
lines = []
native_lines = '''\
struct html_entity { const char *name, *val; }
%%
'''.splitlines()
def esc_for_c(x):
if x == '\n':
return '\\n'
if x in '''"\\''':
return '\\' + x
return x
for k in sorted(entities):
lines.append(f" '{k}': {entities[k]!r},")
v = entities[k]
lines.append(f" '{k}': {v!r},")
native_lines.append(f'"{esc_for_c(k)}","{esc_for_c(v)}"')
with open(__file__, 'r+b') as f:
raw = f.read().decode('utf-8')
pat = re.compile(r'^# ENTITY_DATA {{{.+^# }}}', flags=re.M | re.DOTALL)
pat = re.compile(r'^# ENTITY_DATA {{{.+?^# }}}', flags=re.M | re.DOTALL)
raw = pat.sub(lambda m: '# ENTITY_DATA {{{\n' + '\n'.join(lines) + '\n# }}}', raw)
f.seek(0), f.truncate()
f.write(raw.encode('utf-8'))
f.seek(0), f.truncate(), f.write(raw.encode('utf-8'))
import subprocess
with open(__file__.replace('.py', '.h'), 'wb') as f:
cp = subprocess.run(['gperf', '-t'], input='\n'.join(native_lines).encode(), stdout=f)
if cp.returncode != 0:
raise SystemExit(cp.returncode)