Speed up HTML entity replacement by delegating to C code

2025-07-09 03:04:10 -04:00 · 2024-09-13 13:08:22 +05:30 · 2024-09-13 13:08:22 +05:30 · b86ad609b5
commit b86ad609b5
parent f94fbc113a
12 changed files with 95 additions and 77 deletions
--- a/src/calibre/init.py
+++ b/src/calibre/init.py
@ -8,7 +8,7 @@ import re
 import sys
 import time
 import warnings
-from functools import partial
+from functools import lru_cache, partial
 from math import floor
 from polyglot.builtins import codepoint_to_chr, hasenv, native_string_type
@ -457,12 +457,11 @@ XML_ENTITIES = {
    '&' : '&amp;'
 }
-def entity_to_unicode(match, exceptions=(), encoding='cp1252',
+def entity_to_unicode(match, exceptions=(), encoding=None, result_exceptions={}):
        result_exceptions={}):
    '''
    :param match: A match object such that '&'+match.group(1)';' is the entity.
-    :param exceptions: A list of entities to not convert (Each entry is the name of the entity, e.g. 'apos' or '#1234'
+    :param exceptions: A list of entities to not convert (Each entry is the name of the entity, e.g. 'apos' or '#1234')
    :param encoding: The encoding to use to decode numeric entities between 128 and 256.
    If None, the Unicode UCS encoding is used. A common encoding is cp1252.
@ -472,56 +471,46 @@ def entity_to_unicode(match, exceptions=(), encoding='cp1252',
    Convenient way to specify exception for things like < or > that can be
    specified by various actual entities.
    '''
-    def check(ch):
+    from calibre.ebooks.html_entities import entity_to_unicode_in_python
        return result_exceptions.get(ch, ch)
    ent = match.group(1)
    if ent in exceptions:
        return '&'+ent+';'
    if ent in {'apos', 'squot'}:  # squot is generated by some broken CMS software
        return check("'")
    if ent == 'hellips':
        ent = 'hellip'
    if ent.startswith('#'):
        try:
            if ent[1] in ('x', 'X'):
                num = int(ent[2:], 16)
            else:
                num = int(ent[1:])
        except:
            return '&'+ent+';'
        if encoding is None or num > 255:
            return check(my_unichr(num))
        try:
            return check(bytes(bytearray((num,))).decode(encoding))
        except UnicodeDecodeError:
            return check(my_unichr(num))
    from calibre.ebooks.html_entities import html5_entities
    try:
-        return check(html5_entities[ent])
+        from calibre_extensions.fast_html_entities import replace_all_entities
-    except KeyError:
+    except ImportError:  # Running from source without updated binaries
-        pass
+        return entity_to_unicode_in_python(match, exceptions, encoding, result_exceptions)
-    from polyglot.html_entities import name2codepoint
+    if not encoding and not exceptions and (not result_exceptions or result_exceptions is XML_ENTITIES):
-    try:
+        return replace_all_entities(match.group(), result_exceptions is XML_ENTITIES)
-        return check(my_unichr(name2codepoint[ent]))
+    return entity_to_unicode_in_python(match, exceptions, encoding, result_exceptions)
    except KeyError:
        return '&'+ent+';'
 _ent_pat = re.compile(r'&(\S+?);')
 xml_entity_to_unicode = partial(entity_to_unicode, result_exceptions=XML_ENTITIES)
-def replace_entities(raw, encoding='cp1252'):
+@lru_cache(2)
-    return _ent_pat.sub(partial(entity_to_unicode, encoding=encoding), raw)
+def entity_regex():
    return re.compile(r'&(\S+?);')
-def xml_replace_entities(raw, encoding='cp1252'):
+def replace_entities(raw, encoding=None):
-    return _ent_pat.sub(partial(xml_entity_to_unicode, encoding=encoding), raw)
+    if encoding is None:
        try:
            from calibre_extensions.fast_html_entities import replace_all_entities
            replace_all_entities(raw)
        except ImportError:  # Running from source without updated binaries
            pass
    return entity_regex().sub(partial(entity_to_unicode, encoding=encoding), raw)
 def xml_replace_entities(raw, encoding=None):
    '''
    Replace the entities in ``raw``, using the XML flavour of the conversion.

    :param raw: The text in which entities are to be replaced.
    :param encoding: Forwarded to xml_entity_to_unicode for every match of
        entity_regex(). When it is None, the compiled fast_html_entities
        extension is used instead, if it can be imported.
    :return: ``raw`` with its entities replaced.
    '''
    if encoding is None:
        # Only the import is guarded, so that an error raised while actually
        # replacing entities is not mistaken for a missing extension.
        try:
            from calibre_extensions.fast_html_entities import replace_all_entities
        except ImportError:  # Running from source without updated binaries
            pass
        else:
            # Return the fast path result. Previously it was discarded and the
            # regex based replacement below was run as well, doing the work twice.
            # The second argument is the flag entity_to_unicode passes as
            # ``result_exceptions is XML_ENTITIES``.
            return replace_all_entities(raw, True)
    return entity_regex().sub(partial(xml_entity_to_unicode, encoding=encoding), raw)
 def prepare_string_for_xml(raw, attribute=False):
-    raw = _ent_pat.sub(entity_to_unicode, raw)
+    raw = replace_entities(raw)
    raw = raw.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
    if attribute:
        raw = raw.replace('"', '&quot;').replace("'", '&apos;')
--- a/src/calibre/ebooks/BeautifulSoup.py
+++ b/src/calibre/ebooks/BeautifulSoup.py
@ -7,11 +7,12 @@ from bs4 import CData, Comment, Declaration, NavigableString, ProcessingInstruct
 def parse_html(markup):
-    from calibre.ebooks.chardet import strip_encoding_declarations, substitute_entites, xml_to_unicode
+    from calibre import xml_replace_entities
    from calibre.ebooks.chardet import strip_encoding_declarations, xml_to_unicode
    from calibre.utils.cleantext import clean_xml_chars
    if isinstance(markup, str):
        markup = strip_encoding_declarations(markup)
-        markup = substitute_entites(markup)
+        markup = xml_replace_entities(markup)
    else:
        markup = xml_to_unicode(markup, strip_encoding_pats=True, resolve_entities=True)[0]
    markup = clean_xml_chars(markup)
--- a/src/calibre/ebooks/chardet.py
+++ b/src/calibre/ebooks/chardet.py
@ -9,6 +9,8 @@ import codecs
 import re
 import sys
 from calibre import xml_replace_entities
 _encoding_pats = (
    # XML declaration
    r'<\?[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>',
@ -38,7 +40,6 @@ class LazyEncodingPats:
 lazy_encoding_pats = LazyEncodingPats()
 ENTITY_PATTERN = re.compile(r'&(\S+?);')
 def strip_encoding_declarations(raw, limit=50*1024, preserve_newlines=False):
@ -98,11 +99,6 @@ def find_declared_encoding(raw, limit=50*1024):
                return ans
 def substitute_entites(raw):
    from calibre import xml_entity_to_unicode
    return ENTITY_PATTERN.sub(xml_entity_to_unicode, raw)
 _CHARSET_ALIASES = {"macintosh" : "mac-roman", "x-sjis" : "shift-jis"}
@ -191,6 +187,6 @@ def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
    if strip_encoding_pats:
        raw = strip_encoding_declarations(raw)
    if resolve_entities:
-        raw = substitute_entites(raw)
+        raw = xml_replace_entities(raw)
    return raw, encoding
--- a/src/calibre/ebooks/conversion/plugins/txt_input.py
+++ b/src/calibre/ebooks/conversion/plugins/txt_input.py
@ -4,7 +4,7 @@ __docformat__ = 'restructuredtext en'
 import os
-from calibre import _ent_pat, walk, xml_entity_to_unicode
+from calibre import walk, xml_replace_entities
 from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
 MD_EXTENSIONS = {
@ -228,7 +228,7 @@ class TXTInput(InputFormatPlugin):
        txt = txt.decode(ienc, 'replace')
        # Replace entities
-        txt = _ent_pat.sub(xml_entity_to_unicode, txt)
+        txt = xml_replace_entities(txt)
        # Normalize line endings
        txt = normalize_line_endings(txt)
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@ -9,7 +9,7 @@ import json
 import re
 from math import ceil
-from calibre import as_unicode
+from calibre import as_unicode, entity_regex, xml_replace_entities
 from calibre import xml_entity_to_unicode as convert_entities
 XMLDECL_RE    = re.compile(r'^\s*<[?]xml.*?[?]>')
@ -62,7 +62,6 @@ def wrap_lines(match):
 def smarten_punctuation(html, log=None):
    from calibre.ebooks.chardet import substitute_entites
    from calibre.ebooks.conversion.utils import HeuristicProcessor
    from calibre.utils.smartypants import smartyPants
    preprocessor = HeuristicProcessor(log=log)
@ -75,7 +74,7 @@ def smarten_punctuation(html, log=None):
    html = smartyPants(html)
    html = html.replace(start, '<!--')
    html = html.replace(stop, '-->')
-    return substitute_entites(html)
+    return xml_replace_entities(html)
 class DocAnalysis:
@ -375,7 +374,7 @@ def html_preprocess_rules():
        # Put all sorts of crap into <head>. This messes up lxml
        (re.compile(r'<head[^>]*>(.*?)</head>', re.IGNORECASE|re.DOTALL), sanitize_head),
        # Convert all entities, since lxml doesn't handle them well
-        (re.compile(r'&(\S+?);'), convert_entities),
+        (entity_regex(), convert_entities),
        # Remove the <![if/endif tags inserted by everybody's darling, MS Word
        (re.compile(r'</{0,1}!\[(end){0,1}if\]{0,1}>', re.IGNORECASE), ''),
    ]
--- a/src/calibre/ebooks/html_entities.py
+++ b/src/calibre/ebooks/html_entities.py
@ -1,6 +1,7 @@
 #!/usr/bin/env python
 # License: GPLv3 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>
 from calibre import my_unichr
 html5_entities = {
 # ENTITY_DATA {{{
@ -2135,6 +2136,43 @@ html5_entities = {
 }
 def entity_to_unicode_in_python(match, exceptions=(), encoding='cp1252', result_exceptions={}):
    '''
    Convert a single matched entity to text, using only Python code.

    :param match: A match object such that '&'+match.group(1)+';' is the entity.
    :param exceptions: Entity names (e.g. 'apos' or '#1234') that are returned
        unconverted.
    :param encoding: The encoding used to decode numeric entities that are not
        greater than 255. If it is None, or the number cannot be decoded with it,
        the number is passed to my_unichr instead.
    :param result_exceptions: A mapping from a converted character to the text to
        return in its place. It is only read here, never modified.
    :return: The replacement text, or the entity unchanged if it is listed in
        ``exceptions``, is a malformed numeric entity, or is not a known name.
    '''
    def check(ch):
        return result_exceptions.get(ch, ch)

    ent = match.group(1)
    if ent in exceptions:
        return '&'+ent+';'
    if ent in {'apos', 'squot'}:  # squot is generated by some broken CMS software
        return check("'")
    if ent == 'hellips':
        ent = 'hellip'
    if ent.startswith('#'):
        try:
            if ent[1] in ('x', 'X'):
                num = int(ent[2:], 16)
            else:
                num = int(ent[1:])
        except (ValueError, IndexError):
            # IndexError: nothing follows the '#'. ValueError: not a number.
            return '&'+ent+';'
        if encoding is None or num > 255:
            return check(my_unichr(num))
        try:
            return check(bytes(bytearray((num,))).decode(encoding))
        except ValueError:
            # UnicodeDecodeError is a ValueError, as is the error bytearray()
            # raises for a negative number. In both cases fall back to
            # my_unichr, as is done when no encoding is specified.
            return check(my_unichr(num))
    from calibre.ebooks.html_entities import html5_entities
    try:
        return check(html5_entities[ent])
    except KeyError:
        pass
    from polyglot.html_entities import name2codepoint
    try:
        return check(my_unichr(name2codepoint[ent]))
    except KeyError:
        return '&'+ent+';'
 def find_tests():
    import unittest
    class TestHTMLEntityReplacement(unittest.TestCase):
--- a/src/calibre/ebooks/lrf/html/convert_from.py
+++ b/src/calibre/ebooks/lrf/html/convert_from.py
@ -8,11 +8,10 @@ import re
 import sys
 import tempfile
 from collections import deque
 from functools import partial
 from itertools import chain
 from math import ceil, floor
-from calibre import __appname__, entity_to_unicode, fit_image, force_unicode, preferred_encoding
+from calibre import __appname__, entity_regex, entity_to_unicode, fit_image, force_unicode, preferred_encoding
 from calibre.constants import filesystem_encoding
 from calibre.devices.interface import DevicePlugin as Device
 from calibre.ebooks import ConversionError
@ -123,8 +122,7 @@ class HTMLConverter:
                                    re.IGNORECASE), lambda m: '<br />'),
                        # Replace entities
-                        (re.compile(r'&(\S+?);'), partial(entity_to_unicode,
+                        (entity_regex(), entity_to_unicode),
                                                           exceptions=['lt', 'gt', 'amp', 'quot'])),
                        # Remove comments from within style tags as they can mess up BeatifulSoup
                        (re.compile(r'(<style.*?</style>)', re.IGNORECASE|re.DOTALL),
                         strip_style_comments),
--- a/src/calibre/ebooks/lrf/objects.py
+++ b/src/calibre/ebooks/lrf/objects.py
@ -7,7 +7,8 @@ import re
 import struct
 import zlib
-from calibre import entity_to_unicode, prepare_string_for_xml
+from calibre import prepare_string_for_xml
 from calibre.ebooks.html_entities import entity_to_unicode_in_python
 from calibre.ebooks.lrf import PRS500_PROFILE, LRFParseError
 from calibre.ebooks.lrf.tags import Tag
@ -711,7 +712,7 @@ class Text(LRFStream):
        s = str(text, "utf-16-le")
        if s:
            s = s.translate(self.text_map)
-            self.content.append(self.entity_pattern.sub(entity_to_unicode, s))
+            self.content.append(self.entity_pattern.sub(entity_to_unicode_in_python, s))
    def end_container(self, tag, stream):
        self.content.append(None)
--- a/src/calibre/ebooks/lrf/pylrs/pylrs.py
+++ b/src/calibre/ebooks/lrf/pylrs/pylrs.py
@ -65,7 +65,7 @@ from .pylrf import (
 DEFAULT_SOURCE_ENCODING = "cp1252"      # default is us-windows character set
 DEFAULT_GENREADING      = "fs"          # default is yes to both lrf and lrs
-from calibre import __appname__, __version__, entity_to_unicode
+from calibre import __appname__, __version__, replace_entities
 from polyglot.builtins import iteritems, native_string_type, string_or_bytes
@ -752,7 +752,7 @@ class TableOfContents:
 class TocLabel:
    def __init__(self, label, textBlock):
-        self.label = escape(re.sub(r'&(\S+?);', entity_to_unicode, label))
+        self.label = escape(replace_entities(label))
        self.textBlock = textBlock
    def toElement(self, se):
--- a/src/calibre/ebooks/mobi/reader/mobi6.py
+++ b/src/calibre/ebooks/mobi/reader/mobi6.py
@ -14,7 +14,7 @@ import textwrap
 from lxml import etree, html
-from calibre import entity_to_unicode, guess_type, xml_entity_to_unicode
+from calibre import guess_type, replace_entities, xml_replace_entities
 from calibre.ebooks import DRMError, unit_convert
 from calibre.ebooks.chardet import strip_encoding_declarations
 from calibre.ebooks.compression.palmdoc import decompress_doc
@ -181,8 +181,7 @@ class MobiReader:
                self.processed_html)
        self.processed_html = strip_encoding_declarations(self.processed_html)
-        self.processed_html = re.sub(r'&(\S+?);', xml_entity_to_unicode,
+        self.processed_html = xml_replace_entities(self.processed_html)
            self.processed_html)
        image_name_map = self.extract_images(processed_records, output_dir)
        self.replace_page_breaks()
        self.cleanup_html()
@ -707,7 +706,6 @@ class MobiReader:
            ncx_manifest_entry = 'toc.ncx'
            elems = root.xpath('//*[@id="%s"]' % toc.partition('#')[-1])
            tocobj = None
            ent_pat = re.compile(r'&(\S+?);')
            if elems:
                tocobj = TOC()
                found = False
@ -724,7 +722,7 @@ class MobiReader:
                                    x.xpath('descendant::text()')])
                            except:
                                text = ''
-                            text = ent_pat.sub(entity_to_unicode, text)
+                            text = replace_entities(text)
                            item = tocobj.add_item(toc.partition('#')[0], href[1:],
                                text)
                            item.left_space = int(self.get_left_whitespace(x))
--- a/src/calibre/library/catalogs/epub_mobi_builder.py
+++ b/src/calibre/library/catalogs/epub_mobi_builder.py
@ -19,7 +19,6 @@ from calibre.constants import cache_dir, ismacos
 from calibre.customize.conversion import DummyReporter
 from calibre.customize.ui import output_profiles
 from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, prettify
 from calibre.ebooks.chardet import substitute_entites
 from calibre.ebooks.metadata import author_to_author_sort
 from calibre.ebooks.oeb.polish.pretty import pretty_opf, pretty_xml_tree
 from calibre.library.catalogs import AuthorSortMismatchException, EmptyCatalogException, InvalidGenresSourceFieldException
@ -2670,7 +2669,7 @@ class CatalogBuilder:
                    args[k] = v.decode('utf-8')
            generated_html = P('catalog/template.xhtml',
                    data=True).decode('utf-8').format(**args)
-            generated_html = substitute_entites(generated_html)
+            generated_html = xml_replace_entities(generated_html)
            return BeautifulSoup(generated_html)
        # Generate the template arguments
--- a/src/calibre/web/feeds/init.py
+++ b/src/calibre/web/feeds/init.py
@ -12,7 +12,7 @@ import time
 import traceback
 from builtins import _
-from calibre import entity_to_unicode, force_unicode, strftime
+from calibre import force_unicode, replace_entities, strftime
 from calibre.utils.cleantext import clean_ascii_chars, clean_xml_chars
 from calibre.utils.date import dt_factory, local_tz, utcnow
 from calibre.utils.logging import default_log
@ -30,9 +30,8 @@ class Article:
        title = force_unicode(title, 'utf-8')
        self._title = clean_xml_chars(title).strip()
        try:
-            self._title = re.sub(r'&(\S+?);',
+            self._title = replace_entities(self._title)
-                entity_to_unicode, self._title)
+        except Exception:
        except:
            pass
        self._title = clean_ascii_chars(self._title)
        self.url = url