diff --git a/src/calibre/ebooks/html_entities.c b/src/calibre/ebooks/html_entities.c index 00181fb117..8b1b9ac9bd 100644 --- a/src/calibre/ebooks/html_entities.c +++ b/src/calibre/ebooks/html_entities.c @@ -65,60 +65,56 @@ parse_base16_integer(const char *input, size_t sz, bool *ok) { if (ch < '0' || ch > '9') { *ok = false; return 0; } digit = ch - '0'; } - ans = ans * 10 + digit; + ans = ans * 16 + digit; } return ans; } -static size_t -add_entity(const char *entity, const size_t elen, char *output) { - size_t ans = 0; - if (elen > 64) { -bad_entity: - output[ans++] = '&'; - memcpy(output + ans, entity, elen); - ans += elen; - output[ans++] = ';'; - return ans; - } - if (!elen) { - output[ans++] = '&'; - output[ans++] = ';'; - return ans; - } +static bool +is_xml_unsafe(uint32_t codepoint) { + return codepoint == '<' || codepoint == '>' || codepoint == '&' || codepoint == '"' || codepoint == '\''; +} + +static ssize_t +convert_entity(const char *entity, const size_t elen, char *output, bool keep_xml_entities) { if (entity[0] == '#') { - if (elen < 2) goto bad_entity; + if (elen < 2) return -1; uint32_t codepoint = 0; - bool ok; + bool ok = false; if (entity[1] == 'x' || entity[1] == 'X') { - if (elen < 3) goto bad_entity; - codepoint = parse_base16_integer(entity + 2, elen - 2, &ok); - if (!ok || !codepoint) goto bad_entity; + if (elen > 2) codepoint = parse_base16_integer(entity + 2, elen - 2, &ok); } else { codepoint = parse_base10_integer(entity + 1, elen - 1, &ok); - if (!ok || !codepoint) goto bad_entity; } - unsigned num = encode_utf8(codepoint, output); - if (!num) goto bad_entity; - return num; - } else { - const struct html_entity *s = in_word_set(entity, elen); - if (!s) goto bad_entity; - ans = strlen(s->val); - memcpy(output, s->val, ans); - return ans; + if (!ok || (keep_xml_entities && is_xml_unsafe(codepoint))) return -1; + return codepoint ? encode_utf8(codepoint, output) : 0; } - goto bad_entity; + const struct html_entity *s = in_word_set(entity, elen); + if (!s) return -1; + size_t ans = strlen(s->val); + if (keep_xml_entities && ans == 1 && is_xml_unsafe(s->val[0])) return -1; + memcpy(output, s->val, ans); + return ans; +} + +static size_t +add_entity(const char *entity, const size_t elen, char *output, bool keep_xml_entities) { + ssize_t ans; + if (elen > 64 || elen < 3 || (ans = convert_entity(entity + 1, elen - 2, output, keep_xml_entities)) < 0) { + memcpy(output, entity, elen); + return elen; + } + return ans; } static size_t -process_entity(const char *input, size_t input_sz, char *output, size_t *output_pos) { +process_entity(const char *input, size_t input_sz, char *output, size_t *output_pos, bool keep_xml_entities) { size_t input_pos = 1; // ignore leading & while (input_pos < input_sz) { char ch = input[input_pos++]; - if (('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z') || ('0' <= ch && ch <= '9') || (ch == '#' && input_pos == 1)); - else if (ch == ';') { *output_pos += add_entity(input, input_pos-1, output + *output_pos); return input_pos; } + if (('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z') || ('0' <= ch && ch <= '9') || (ch == '#' && input_pos == 2)); + else if (ch == ';') { *output_pos += add_entity(input, input_pos, output + *output_pos, keep_xml_entities); return input_pos; } else break; } memcpy(output + *output_pos, input, input_pos); @@ -132,7 +128,10 @@ replace(const char *input, size_t input_sz, char *output, int keep_xml_entities) while (input_pos < input_sz) { const char *p = (const char*)memchr(input + input_pos, '&', input_sz - input_pos); if (p) { - input_pos += process_entity(p, input_sz - (p - input), output, &output_pos); + size_t before_amp = p - (input + input_pos); + memcpy(output + output_pos, input + input_pos, before_amp); + output_pos += before_amp; input_pos += before_amp; + input_pos += process_entity(p, input_sz - (p - input), output, &output_pos, keep_xml_entities); } else { memcpy(output + output_pos, input + input_pos, input_sz - input_pos); output_pos += input_sz - input_pos; diff --git a/src/calibre/ebooks/html_entities.h b/src/calibre/ebooks/html_entities.h index 073326ab4c..42c559732a 100644 --- a/src/calibre/ebooks/html_entities.h +++ b/src/calibre/ebooks/html_entities.h @@ -1,5 +1,5 @@ /* ANSI-C code produced by gperf version 3.1 */ -/* Command-line: gperf --struct-type --readonly --includes */ +/* Command-line: gperf --struct-type --readonly --includes --compare-strncmp */ /* Computed positions: -k'1-7,10,12,$' */ #if !((' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35) \ @@ -5344,7 +5344,7 @@ in_word_set (register const char *str, register size_t len) { register const char *s = wordlist[key].name; - if (*str == *s && !strcmp (str + 1, s + 1)) + if (*str == *s && !strncmp (str + 1, s + 1, len - 1) && s[len] == '\0') return &wordlist[key]; } } diff --git a/src/calibre/ebooks/html_entities.py b/src/calibre/ebooks/html_entities.py index 2a1b88859d..8e60cac80b 100644 --- a/src/calibre/ebooks/html_entities.py +++ b/src/calibre/ebooks/html_entities.py @@ -2142,9 +2142,24 @@ def find_tests(): from calibre_extensions.fast_html_entities import replace_entities def t(inp, exp): self.assertEqual(exp, replace_entities(inp), f'Failed for input: {inp!r}') - t('&', '&') + def x(inp, exp): + self.assertEqual(exp, replace_entities(inp, True), f'Failed for input: {inp!r}') + t('aӒb', 'aӒb') t('', '') t('a', 'a') + t('&', '&') + t('&', '&') + t('&', '&') + t('a&;b &#;c', 'a&;b &#;c') + t('<', '<') + t('&<', '&<') + t('a&b<c', 'a&b