mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
More work on fast entity conversion
This commit is contained in:
parent
806b6657a3
commit
aa3b5398e1
1
.gitattributes
vendored
1
.gitattributes
vendored
@ -54,3 +54,4 @@ resources/content-server/index-generated.html linguist-generated=true
|
||||
resources/editor.js linguist-generated=true
|
||||
resources/viewer.js linguist-generated=true
|
||||
resources/viewer.html linguist-generated=true
|
||||
src/calibre/ebooks/html_entities.h linguist-generated=true
|
||||
|
@ -140,6 +140,7 @@
|
||||
},
|
||||
{
|
||||
"name": "fast_html_entities",
|
||||
"headers": "calibre/ebooks/html_entities.h",
|
||||
"sources": "calibre/ebooks/html_entities.c"
|
||||
},
|
||||
{
|
||||
|
@ -10,6 +10,7 @@
|
||||
#define _UNICODE
|
||||
#include <Python.h>
|
||||
#include <stdbool.h>
|
||||
#include "html_entities.h"
|
||||
|
||||
unsigned int
|
||||
encode_utf8(uint32_t ch, char* dest) {
|
||||
@ -38,11 +39,41 @@ encode_utf8(uint32_t ch, char* dest) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Parse the first sz bytes of input as an unsigned decimal integer.
 *
 * input does not need to be NUL-terminated; exactly sz bytes are examined.
 * On success *ok is set to true and the parsed value is returned.
 * On failure (a byte that is not an ASCII digit, or a value that does not
 * fit in 32 bits) *ok is set to false and 0 is returned.
 * An empty input (sz == 0) yields 0 with *ok == true.
 */
static uint32_t
parse_base10_integer(const char *input, size_t sz, bool *ok) {
    uint32_t ans = 0;
    *ok = true;
    for (size_t i = 0; i < sz; i++) {
        const char ch = input[i];
        if (ch < '0' || ch > '9') { *ok = false; return 0; }
        const uint32_t digit = (uint32_t)(ch - '0');
        // Without this check the accumulator would silently wrap modulo 2^32
        // and an over-long number could alias a small, valid value.
        if (ans > (UINT32_MAX - digit) / 10) { *ok = false; return 0; }
        ans = ans * 10 + digit;
    }
    return ans;
}
|
||||
|
||||
/*
 * Parse the first sz bytes of input as an unsigned hexadecimal integer.
 *
 * Accepts the ASCII digits 0-9, a-f and A-F, with no "0x" prefix. input does
 * not need to be NUL-terminated; exactly sz bytes are examined.
 * On success *ok is set to true and the parsed value is returned.
 * On failure (a byte that is not a hex digit, or a value that does not fit
 * in 32 bits) *ok is set to false and 0 is returned.
 * An empty input (sz == 0) yields 0 with *ok == true.
 */
static uint32_t
parse_base16_integer(const char *input, size_t sz, bool *ok) {
    uint32_t ans = 0;
    *ok = true;
    for (size_t i = 0; i < sz; i++) {
        const char ch = input[i];
        uint32_t digit;
        if ('0' <= ch && ch <= '9') digit = (uint32_t)(ch - '0');
        else if ('a' <= ch && ch <= 'f') digit = 10 + (uint32_t)(ch - 'a');
        else if ('A' <= ch && ch <= 'F') digit = 10 + (uint32_t)(ch - 'A');
        else { *ok = false; return 0; }
        // The top four bits must be clear before shifting in another digit,
        // otherwise the value would silently wrap modulo 2^32.
        if (ans > (UINT32_MAX >> 4)) { *ok = false; return 0; }
        // Each hex digit contributes four bits: multiply by 16, not 10.
        ans = (ans << 4) | digit;
    }
    return ans;
}
|
||||
|
||||
static size_t
|
||||
add_entity(const char *entity, size_t elen, char *output) {
|
||||
add_entity(const char *entity, const size_t elen, char *output) {
|
||||
size_t ans = 0;
|
||||
char e[64];
|
||||
if (elen > sizeof(e) - 1) {
|
||||
if (elen > 64) {
|
||||
bad_entity:
|
||||
output[ans++] = '&';
|
||||
memcpy(output + ans, entity, elen);
|
||||
@ -55,25 +86,27 @@ bad_entity:
|
||||
output[ans++] = ';';
|
||||
return ans;
|
||||
}
|
||||
memcpy(e, entity, elen);
|
||||
e[elen] = 0;
|
||||
if (e[0] == '#') {
|
||||
if (entity[0] == '#') {
|
||||
if (elen < 2) goto bad_entity;
|
||||
char *end;
|
||||
unsigned long codepoint = ULONG_MAX;
|
||||
if (e[1] == 'x' || e[1] == 'X') {
|
||||
errno = 0;
|
||||
codepoint = strtoul(e + 2, &end, 16);
|
||||
if (errno || *end) goto bad_entity;
|
||||
uint32_t codepoint = 0;
|
||||
bool ok;
|
||||
if (entity[1] == 'x' || entity[1] == 'X') {
|
||||
if (elen < 3) goto bad_entity;
|
||||
codepoint = parse_base16_integer(entity + 2, elen - 2, &ok);
|
||||
if (!ok || !codepoint) goto bad_entity;
|
||||
} else {
|
||||
errno = 0;
|
||||
codepoint = strtoul(e + 1, &end, 10);
|
||||
if (errno || *end) goto bad_entity;
|
||||
codepoint = parse_base10_integer(entity + 1, elen - 1, &ok);
|
||||
if (!ok || !codepoint) goto bad_entity;
|
||||
}
|
||||
unsigned num = encode_utf8(codepoint, output);
|
||||
if (!num) goto bad_entity;
|
||||
return num;
|
||||
} else {
|
||||
struct html_entity *s = in_word_set(entity, elen);
|
||||
if (!s) goto bad_entity;
|
||||
ans = strlen(s->val);
|
||||
memcpy(output, s->val, ans);
|
||||
return ans;
|
||||
}
|
||||
goto bad_entity;
|
||||
}
|
||||
@ -136,9 +169,9 @@ replace_entities(PyObject *self, PyObject *const *args, Py_ssize_t nargs) {
|
||||
size_t output_sz = replace(input, input_sz, output, keep_xml_entities);
|
||||
PyObject *retval;
|
||||
if (PyErr_Occurred()) retval = NULL;
|
||||
if (!output_sz) retval = Py_NewRef(args[0]);
|
||||
if (PyUnicode_Check(args[0])) retval = PyUnicode_FromStringAndSize(output, output_sz);
|
||||
retval = PyBytes_FromStringAndSize(output, output_sz);
|
||||
else if (!output_sz) retval = Py_NewRef(args[0]);
|
||||
else if (PyUnicode_Check(args[0])) retval = PyUnicode_FromStringAndSize(output, output_sz);
|
||||
else retval = PyBytes_FromStringAndSize(output, output_sz);
|
||||
free(output);
|
||||
return retval;
|
||||
}
|
||||
|
5351
src/calibre/ebooks/html_entities.h
generated
Normal file
5351
src/calibre/ebooks/html_entities.h
generated
Normal file
File diff suppressed because it is too large
Load Diff
@ -1105,6 +1105,7 @@ html5_entities = {
|
||||
'hearts': '♥',
|
||||
'heartsuit': '♥',
|
||||
'hellip': '…',
|
||||
'hellips': '…',
|
||||
'hercon': '⊹',
|
||||
'hfr': '𝔥',
|
||||
'hksearow': '⤥',
|
||||
@ -1857,6 +1858,7 @@ html5_entities = {
|
||||
'square': '□',
|
||||
'squarf': '▪',
|
||||
'squf': '▪',
|
||||
'squot': "'",
|
||||
'srarr': '→',
|
||||
'sscr': '𝓈',
|
||||
'ssetmn': '∖',
|
||||
@ -2135,16 +2137,39 @@ html5_entities = {
|
||||
|
||||
def generate_entity_lists():
|
||||
import re
|
||||
from html import entities
|
||||
entities = {k.rstrip(';'): entities.html5[k] for k in entities.html5}
|
||||
from html import entities as e
|
||||
entities = {k.rstrip(';'): e.name2codepoint[k] for k in e.name2codepoint}
|
||||
entities.update({k.rstrip(';'): e.html5[k] for k in e.html5})
|
||||
# common misspelled entity names
|
||||
for k, v in {'apos': "'", 'squot': "'", 'hellips': entities['hellip']}.items():
|
||||
if k not in entities:
|
||||
entities[k] = v
|
||||
lines = []
|
||||
native_lines = '''\
|
||||
struct html_entity { const char *name, *val; }
|
||||
%%
|
||||
'''.splitlines()
|
||||
|
||||
def esc_for_c(x):
    """Escape ``x`` so it can be placed between double quotes in the gperf input.

    Newlines become ``\\n`` and double quotes and backslashes are prefixed
    with a backslash; every other character is passed through unchanged.
    Escaping is applied per character, so values made of several characters
    are handled correctly and an empty value stays empty.
    """
    escapes = {'\n': '\\n', '"': '\\"', '\\': '\\\\'}
    return ''.join(escapes.get(ch, ch) for ch in x)
|
||||
|
||||
for k in sorted(entities):
|
||||
lines.append(f" '{k}': {entities[k]!r},")
|
||||
v = entities[k]
|
||||
lines.append(f" '{k}': {v!r},")
|
||||
native_lines.append(f'"{esc_for_c(k)}","{esc_for_c(v)}"')
|
||||
|
||||
with open(__file__, 'r+b') as f:
|
||||
raw = f.read().decode('utf-8')
|
||||
pat = re.compile(r'^# ENTITY_DATA {{{.+^# }}}', flags=re.M | re.DOTALL)
|
||||
pat = re.compile(r'^# ENTITY_DATA {{{.+?^# }}}', flags=re.M | re.DOTALL)
|
||||
raw = pat.sub(lambda m: '# ENTITY_DATA {{{\n' + '\n'.join(lines) + '\n# }}}', raw)
|
||||
f.seek(0), f.truncate()
|
||||
f.write(raw.encode('utf-8'))
|
||||
f.seek(0), f.truncate(), f.write(raw.encode('utf-8'))
|
||||
|
||||
import subprocess
|
||||
with open(__file__.replace('.py', '.h'), 'wb') as f:
|
||||
cp = subprocess.run(['gperf', '-t'], input='\n'.join(native_lines).encode(), stdout=f)
|
||||
if cp.returncode != 0:
|
||||
raise SystemExit(cp.returncode)
|
||||
|
Loading…
x
Reference in New Issue
Block a user