Speedup HTML entity replacement by delegating to C code

2025-07-09 03:04:10 -04:00 · 2024-09-13 13:08:22 +05:30 · 2024-09-13 13:08:22 +05:30 · b86ad609b5
commit b86ad609b5
parent f94fbc113a
12 changed files with 95 additions and 77 deletions
--- a/src/calibre/init.py
+++ b/src/calibre/init.py
@ -8,7 +8,7 @@ import re
 import sys
 import time
 import warnings
-from functools import partial
+from functools import lru_cache, partial
 from math import floor

 from polyglot.builtins import codepoint_to_chr, hasenv, native_string_type
@ -457,12 +457,11 @@ XML_ENTITIES = {
    '&' : '&amp;'
 }

-def entity_to_unicode(match, exceptions=(), encoding='cp1252',
-        result_exceptions={}):
+def entity_to_unicode(match, exceptions=(), encoding=None, result_exceptions={}):
    '''
    :param match: A match object such that '&'+match.group(1)';' is the entity.

-    :param exceptions: A list of entities to not convert (Each entry is the name of the entity, e.g. 'apos' or '#1234'
+    :param exceptions: A list of entities to not convert (Each entry is the name of the entity, e.g. 'apos' or '#1234')

    :param encoding: The encoding to use to decode numeric entities between 128 and 256.
    If None, the Unicode UCS encoding is used. A common encoding is cp1252.
@ -472,56 +471,46 @@ def entity_to_unicode(match, exceptions=(), encoding='cp1252',
    Convenient way to specify exception for things like < or > that can be
    specified by various actual entities.
    '''
-    def check(ch):
-        return result_exceptions.get(ch, ch)
-
-    ent = match.group(1)
-    if ent in exceptions:
-        return '&'+ent+';'
-    if ent in {'apos', 'squot'}:  # squot is generated by some broken CMS software
-        return check("'")
-    if ent == 'hellips':
-        ent = 'hellip'
-    if ent.startswith('#'):
-        try:
-            if ent[1] in ('x', 'X'):
-                num = int(ent[2:], 16)
-            else:
-                num = int(ent[1:])
-        except:
-            return '&'+ent+';'
-        if encoding is None or num > 255:
-            return check(my_unichr(num))
-        try:
-            return check(bytes(bytearray((num,))).decode(encoding))
-        except UnicodeDecodeError:
-            return check(my_unichr(num))
-    from calibre.ebooks.html_entities import html5_entities
+    from calibre.ebooks.html_entities import entity_to_unicode_in_python
    try:
-        return check(html5_entities[ent])
-    except KeyError:
-        pass
-    from polyglot.html_entities import name2codepoint
-    try:
-        return check(my_unichr(name2codepoint[ent]))
-    except KeyError:
-        return '&'+ent+';'
+        from calibre_extensions.fast_html_entities import replace_all_entities
+    except ImportError:  # Running from source without updated binaries
+        return entity_to_unicode_in_python(match, exceptions, encoding, result_exceptions)
+    if not encoding and not exceptions and (not result_exceptions or result_exceptions is XML_ENTITIES):
+        return replace_all_entities(match.group(), result_exceptions is XML_ENTITIES)
+    return entity_to_unicode_in_python(match, exceptions, encoding, result_exceptions)


-_ent_pat = re.compile(r'&(\S+?);')
 xml_entity_to_unicode = partial(entity_to_unicode, result_exceptions=XML_ENTITIES)


-def replace_entities(raw, encoding='cp1252'):
-    return _ent_pat.sub(partial(entity_to_unicode, encoding=encoding), raw)
+@lru_cache(2)
+def entity_regex():
+    '''
+    Return the compiled pattern that matches ``&...;`` entities. Group 1 is the
+    run of non-whitespace text between the ``&`` and the first following ``;``.
+
+    The result is memoized by lru_cache, so the pattern is compiled on first
+    use rather than at import time.
+    '''
+    return re.compile(r'&(\S+?);')


-def xml_replace_entities(raw, encoding='cp1252'):
-    return _ent_pat.sub(partial(xml_entity_to_unicode, encoding=encoding), raw)
+def replace_entities(raw, encoding=None):
+    '''
+    Replace every ``&...;`` entity in ``raw`` with the text it stands for.
+
+    :param raw: The text to process.
+
+    :param encoding: Passed through to :func:`entity_to_unicode`. Only when it
+    is None is the whole string handed to the compiled implementation.
+
+    :return: ``raw`` with its entities replaced.
+    '''
+    if encoding is None:
+        try:
+            from calibre_extensions.fast_html_entities import replace_all_entities
+        except ImportError:  # Running from source without updated binaries
+            pass
+        else:
+            # The converted string must be returned. Otherwise the result of
+            # the C code is thrown away and the regex substitution below runs
+            # for every call anyway.
+            return replace_all_entities(raw)
+    return entity_regex().sub(partial(entity_to_unicode, encoding=encoding), raw)
+
+
+def xml_replace_entities(raw, encoding=None):
+    '''
+    Like replacing entities with :func:`xml_entity_to_unicode`: entities in
+    ``raw`` are converted, except that results listed in XML_ENTITIES are
+    kept in their escaped form.
+
+    :param raw: The text to process.
+
+    :param encoding: Passed through to :func:`xml_entity_to_unicode`. Only when
+    it is None is the whole string handed to the compiled implementation.
+
+    :return: ``raw`` with its entities replaced.
+    '''
+    if encoding is None:
+        try:
+            from calibre_extensions.fast_html_entities import replace_all_entities
+        except ImportError:  # Running from source without updated binaries
+            pass
+        else:
+            # The converted string must be returned. Otherwise the result of
+            # the C code is thrown away and the regex substitution below runs
+            # for every call anyway. True selects the XML-preserving mode, the
+            # same flag entity_to_unicode passes for XML_ENTITIES.
+            return replace_all_entities(raw, True)
+    return entity_regex().sub(partial(xml_entity_to_unicode, encoding=encoding), raw)


 def prepare_string_for_xml(raw, attribute=False):
-    raw = _ent_pat.sub(entity_to_unicode, raw)
+    raw = replace_entities(raw)
    raw = raw.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
    if attribute:
        raw = raw.replace('"', '&quot;').replace("'", '&apos;')
--- a/src/calibre/ebooks/BeautifulSoup.py
+++ b/src/calibre/ebooks/BeautifulSoup.py
@ -7,11 +7,12 @@ from bs4 import CData, Comment, Declaration, NavigableString, ProcessingInstruct


 def parse_html(markup):
-    from calibre.ebooks.chardet import strip_encoding_declarations, substitute_entites, xml_to_unicode
+    from calibre import xml_replace_entities
+    from calibre.ebooks.chardet import strip_encoding_declarations, xml_to_unicode
    from calibre.utils.cleantext import clean_xml_chars
    if isinstance(markup, str):
        markup = strip_encoding_declarations(markup)
-        markup = substitute_entites(markup)
+        markup = xml_replace_entities(markup)
    else:
        markup = xml_to_unicode(markup, strip_encoding_pats=True, resolve_entities=True)[0]
    markup = clean_xml_chars(markup)
--- a/src/calibre/ebooks/chardet.py
+++ b/src/calibre/ebooks/chardet.py
@ -9,6 +9,8 @@ import codecs
 import re
 import sys

+from calibre import xml_replace_entities
+
 _encoding_pats = (
    # XML declaration
    r'<\?[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>',
@ -38,7 +40,6 @@ class LazyEncodingPats:


 lazy_encoding_pats = LazyEncodingPats()
-ENTITY_PATTERN = re.compile(r'&(\S+?);')


 def strip_encoding_declarations(raw, limit=50*1024, preserve_newlines=False):
@ -98,11 +99,6 @@ def find_declared_encoding(raw, limit=50*1024):
                return ans


-def substitute_entites(raw):
-    from calibre import xml_entity_to_unicode
-    return ENTITY_PATTERN.sub(xml_entity_to_unicode, raw)
-
-
 _CHARSET_ALIASES = {"macintosh" : "mac-roman", "x-sjis" : "shift-jis"}


@ -191,6 +187,6 @@ def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
    if strip_encoding_pats:
        raw = strip_encoding_declarations(raw)
    if resolve_entities:
-        raw = substitute_entites(raw)
+        raw = xml_replace_entities(raw)

    return raw, encoding
--- a/src/calibre/ebooks/conversion/plugins/txt_input.py
+++ b/src/calibre/ebooks/conversion/plugins/txt_input.py
@ -4,7 +4,7 @@ __docformat__ = 'restructuredtext en'

 import os

-from calibre import _ent_pat, walk, xml_entity_to_unicode
+from calibre import walk, xml_replace_entities
 from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation

 MD_EXTENSIONS = {
@ -228,7 +228,7 @@ class TXTInput(InputFormatPlugin):
        txt = txt.decode(ienc, 'replace')

        # Replace entities
-        txt = _ent_pat.sub(xml_entity_to_unicode, txt)
+        txt = xml_replace_entities(txt)

        # Normalize line endings
        txt = normalize_line_endings(txt)
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@ -9,7 +9,7 @@ import json
 import re
 from math import ceil

-from calibre import as_unicode
+from calibre import as_unicode, entity_regex, xml_replace_entities
 from calibre import xml_entity_to_unicode as convert_entities

 XMLDECL_RE    = re.compile(r'^\s*<[?]xml.*?[?]>')
@ -62,7 +62,6 @@ def wrap_lines(match):


 def smarten_punctuation(html, log=None):
-    from calibre.ebooks.chardet import substitute_entites
    from calibre.ebooks.conversion.utils import HeuristicProcessor
    from calibre.utils.smartypants import smartyPants
    preprocessor = HeuristicProcessor(log=log)
@ -75,7 +74,7 @@ def smarten_punctuation(html, log=None):
    html = smartyPants(html)
    html = html.replace(start, '<!--')
    html = html.replace(stop, '-->')
-    return substitute_entites(html)
+    return xml_replace_entities(html)


 class DocAnalysis:
@ -375,7 +374,7 @@ def html_preprocess_rules():
        # Put all sorts of crap into <head>. This messes up lxml
        (re.compile(r'<head[^>]*>(.*?)</head>', re.IGNORECASE|re.DOTALL), sanitize_head),
        # Convert all entities, since lxml doesn't handle them well
-        (re.compile(r'&(\S+?);'), convert_entities),
+        (entity_regex(), convert_entities),
        # Remove the <![if/endif tags inserted by everybody's darling, MS Word
        (re.compile(r'</{0,1}!\[(end){0,1}if\]{0,1}>', re.IGNORECASE), ''),
    ]
--- a/src/calibre/ebooks/html_entities.py
+++ b/src/calibre/ebooks/html_entities.py
@ -1,6 +1,7 @@
 #!/usr/bin/env python
 # License: GPLv3 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>

+from calibre import my_unichr

 html5_entities = {
 # ENTITY_DATA {{{
@ -2135,6 +2136,43 @@ html5_entities = {
 }


+def entity_to_unicode_in_python(match, exceptions=(), encoding='cp1252', result_exceptions={}):
+    '''
+    Pure Python conversion of a single matched entity to the text it represents.
+
+    :param match: A match object whose group(1) is the entity name, without the
+    leading ``&`` and the trailing ``;``.
+
+    :param exceptions: Entity names (e.g. 'apos' or '#1234') to leave unconverted.
+
+    :param encoding: The encoding used to decode numeric entities whose value
+    is at most 255. If None, the code point itself is used.
+
+    :param result_exceptions: A mapping from a converted character to the text
+    to return in its place. It is only read, never modified.
+
+    :return: The replacement text, or the entity unchanged when it is listed in
+    ``exceptions``, is a malformed numeric entity, or is not a known name.
+    '''
+    def check(ch):
+        return result_exceptions.get(ch, ch)
+
+    ent = match.group(1)
+    if ent in exceptions:
+        return '&'+ent+';'
+    if ent in {'apos', 'squot'}:  # squot is generated by some broken CMS software
+        return check("'")
+    if ent == 'hellips':
+        ent = 'hellip'
+    if ent.startswith('#'):
+        try:
+            if ent[1] in ('x', 'X'):
+                num = int(ent[2:], 16)
+            else:
+                num = int(ent[1:])
+        except (ValueError, IndexError):
+            # IndexError: a bare '#'. ValueError: the digits are missing or invalid
+            return '&'+ent+';'
+        if encoding is None or num > 255:
+            return check(my_unichr(num))
+        try:
+            return check(bytes(bytearray((num,))).decode(encoding))
+        except UnicodeDecodeError:
+            # The byte is not defined in the encoding, fall back to the code point
+            return check(my_unichr(num))
+    # html5_entities is defined at the top of this module, no import needed
+    try:
+        return check(html5_entities[ent])
+    except KeyError:
+        pass
+    from polyglot.html_entities import name2codepoint
+    try:
+        return check(my_unichr(name2codepoint[ent]))
+    except KeyError:
+        return '&'+ent+';'
+
+
 def find_tests():
    import unittest
    class TestHTMLEntityReplacement(unittest.TestCase):
--- a/src/calibre/ebooks/lrf/html/convert_from.py
+++ b/src/calibre/ebooks/lrf/html/convert_from.py
@ -8,11 +8,10 @@ import re
 import sys
 import tempfile
 from collections import deque
-from functools import partial
 from itertools import chain
 from math import ceil, floor

-from calibre import __appname__, entity_to_unicode, fit_image, force_unicode, preferred_encoding
+from calibre import __appname__, entity_regex, entity_to_unicode, fit_image, force_unicode, preferred_encoding
 from calibre.constants import filesystem_encoding
 from calibre.devices.interface import DevicePlugin as Device
 from calibre.ebooks import ConversionError
@ -123,8 +122,7 @@ class HTMLConverter:
                                    re.IGNORECASE), lambda m: '<br />'),

                        # Replace entities
-                        (re.compile(r'&(\S+?);'), partial(entity_to_unicode,
-                                                           exceptions=['lt', 'gt', 'amp', 'quot'])),
+                        # Keep lt, gt, amp and quot escaped, as before this change, so that
+                        # replacing entities cannot introduce new markup into the HTML
+                        (entity_regex(), lambda m: entity_to_unicode(m, exceptions=['lt', 'gt', 'amp', 'quot'])),
                        # Remove comments from within style tags as they can mess up BeatifulSoup
                        (re.compile(r'(<style.*?</style>)', re.IGNORECASE|re.DOTALL),
                         strip_style_comments),
--- a/src/calibre/ebooks/lrf/objects.py
+++ b/src/calibre/ebooks/lrf/objects.py
@ -7,7 +7,8 @@ import re
 import struct
 import zlib

-from calibre import entity_to_unicode, prepare_string_for_xml
+from calibre import prepare_string_for_xml
+from calibre.ebooks.html_entities import entity_to_unicode_in_python
 from calibre.ebooks.lrf import PRS500_PROFILE, LRFParseError
 from calibre.ebooks.lrf.tags import Tag

@ -711,7 +712,7 @@ class Text(LRFStream):
        s = str(text, "utf-16-le")
        if s:
            s = s.translate(self.text_map)
-            self.content.append(self.entity_pattern.sub(entity_to_unicode, s))
+            self.content.append(self.entity_pattern.sub(entity_to_unicode_in_python, s))

    def end_container(self, tag, stream):
        self.content.append(None)
--- a/src/calibre/ebooks/lrf/pylrs/pylrs.py
+++ b/src/calibre/ebooks/lrf/pylrs/pylrs.py
@ -65,7 +65,7 @@ from .pylrf import (
 DEFAULT_SOURCE_ENCODING = "cp1252"      # default is us-windows character set
 DEFAULT_GENREADING      = "fs"          # default is yes to both lrf and lrs

-from calibre import __appname__, __version__, entity_to_unicode
+from calibre import __appname__, __version__, replace_entities
 from polyglot.builtins import iteritems, native_string_type, string_or_bytes


@ -752,7 +752,7 @@ class TableOfContents:
 class TocLabel:

    def __init__(self, label, textBlock):
-        self.label = escape(re.sub(r'&(\S+?);', entity_to_unicode, label))
+        self.label = escape(replace_entities(label))
        self.textBlock = textBlock

    def toElement(self, se):
--- a/src/calibre/ebooks/mobi/reader/mobi6.py
+++ b/src/calibre/ebooks/mobi/reader/mobi6.py
@ -14,7 +14,7 @@ import textwrap

 from lxml import etree, html

-from calibre import entity_to_unicode, guess_type, xml_entity_to_unicode
+from calibre import guess_type, replace_entities, xml_replace_entities
 from calibre.ebooks import DRMError, unit_convert
 from calibre.ebooks.chardet import strip_encoding_declarations
 from calibre.ebooks.compression.palmdoc import decompress_doc
@ -181,8 +181,7 @@ class MobiReader:
                self.processed_html)

        self.processed_html = strip_encoding_declarations(self.processed_html)
-        self.processed_html = re.sub(r'&(\S+?);', xml_entity_to_unicode,
-            self.processed_html)
+        self.processed_html = xml_replace_entities(self.processed_html)
        image_name_map = self.extract_images(processed_records, output_dir)
        self.replace_page_breaks()
        self.cleanup_html()
@ -707,7 +706,6 @@ class MobiReader:
            ncx_manifest_entry = 'toc.ncx'
            elems = root.xpath('//*[@id="%s"]' % toc.partition('#')[-1])
            tocobj = None
-            ent_pat = re.compile(r'&(\S+?);')
            if elems:
                tocobj = TOC()
                found = False
@ -724,7 +722,7 @@ class MobiReader:
                                    x.xpath('descendant::text()')])
                            except:
                                text = ''
-                            text = ent_pat.sub(entity_to_unicode, text)
+                            text = replace_entities(text)
                            item = tocobj.add_item(toc.partition('#')[0], href[1:],
                                text)
                            item.left_space = int(self.get_left_whitespace(x))
--- a/src/calibre/library/catalogs/epub_mobi_builder.py
+++ b/src/calibre/library/catalogs/epub_mobi_builder.py
@ -19,7 +19,6 @@ from calibre.constants import cache_dir, ismacos
 from calibre.customize.conversion import DummyReporter
 from calibre.customize.ui import output_profiles
 from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, prettify
-from calibre.ebooks.chardet import substitute_entites
 from calibre.ebooks.metadata import author_to_author_sort
 from calibre.ebooks.oeb.polish.pretty import pretty_opf, pretty_xml_tree
 from calibre.library.catalogs import AuthorSortMismatchException, EmptyCatalogException, InvalidGenresSourceFieldException
@ -2670,7 +2669,7 @@ class CatalogBuilder:
                    args[k] = v.decode('utf-8')
            generated_html = P('catalog/template.xhtml',
                    data=True).decode('utf-8').format(**args)
-            generated_html = substitute_entites(generated_html)
+            generated_html = xml_replace_entities(generated_html)
            return BeautifulSoup(generated_html)

        # Generate the template arguments
--- a/src/calibre/web/feeds/init.py
+++ b/src/calibre/web/feeds/init.py
@ -12,7 +12,7 @@ import time
 import traceback
 from builtins import _

-from calibre import entity_to_unicode, force_unicode, strftime
+from calibre import force_unicode, replace_entities, strftime
 from calibre.utils.cleantext import clean_ascii_chars, clean_xml_chars
 from calibre.utils.date import dt_factory, local_tz, utcnow
 from calibre.utils.logging import default_log
@ -30,9 +30,8 @@ class Article:
        title = force_unicode(title, 'utf-8')
        self._title = clean_xml_chars(title).strip()
        try:
-            self._title = re.sub(r'&(\S+?);',
-                entity_to_unicode, self._title)
-        except:
+            self._title = replace_entities(self._title)
+        except Exception:
            pass
        self._title = clean_ascii_chars(self._title)
        self.url = url