Speed up HTML entity replacement by delegating to C code

This commit is contained in:
Kovid Goyal 2024-09-13 13:08:22 +05:30
parent f94fbc113a
commit b86ad609b5
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
12 changed files with 95 additions and 77 deletions

View File

@ -8,7 +8,7 @@ import re
import sys import sys
import time import time
import warnings import warnings
from functools import partial from functools import lru_cache, partial
from math import floor from math import floor
from polyglot.builtins import codepoint_to_chr, hasenv, native_string_type from polyglot.builtins import codepoint_to_chr, hasenv, native_string_type
@ -457,12 +457,11 @@ XML_ENTITIES = {
'&' : '&' '&' : '&'
} }
def entity_to_unicode(match, exceptions=(), encoding='cp1252', def entity_to_unicode(match, exceptions=(), encoding=None, result_exceptions={}):
result_exceptions={}):
''' '''
:param match: A match object such that '&'+match.group(1)';' is the entity. :param match: A match object such that '&'+match.group(1)';' is the entity.
:param exceptions: A list of entities to not convert (Each entry is the name of the entity, e.g. 'apos' or '#1234' :param exceptions: A list of entities to not convert (Each entry is the name of the entity, e.g. 'apos' or '#1234')
:param encoding: The encoding to use to decode numeric entities between 128 and 256. :param encoding: The encoding to use to decode numeric entities between 128 and 256.
If None, the Unicode UCS encoding is used. A common encoding is cp1252. If None, the Unicode UCS encoding is used. A common encoding is cp1252.
@ -472,56 +471,46 @@ def entity_to_unicode(match, exceptions=(), encoding='cp1252',
Convenient way to specify exception for things like < or > that can be Convenient way to specify exception for things like < or > that can be
specified by various actual entities. specified by various actual entities.
''' '''
def check(ch): from calibre.ebooks.html_entities import entity_to_unicode_in_python
return result_exceptions.get(ch, ch)
ent = match.group(1)
if ent in exceptions:
return '&'+ent+';'
if ent in {'apos', 'squot'}: # squot is generated by some broken CMS software
return check("'")
if ent == 'hellips':
ent = 'hellip'
if ent.startswith('#'):
try:
if ent[1] in ('x', 'X'):
num = int(ent[2:], 16)
else:
num = int(ent[1:])
except:
return '&'+ent+';'
if encoding is None or num > 255:
return check(my_unichr(num))
try:
return check(bytes(bytearray((num,))).decode(encoding))
except UnicodeDecodeError:
return check(my_unichr(num))
from calibre.ebooks.html_entities import html5_entities
try: try:
return check(html5_entities[ent]) from calibre_extensions.fast_html_entities import replace_all_entities
except KeyError: except ImportError: # Running from source without updated binaries
pass return entity_to_unicode_in_python(match, exceptions, encoding, result_exceptions)
from polyglot.html_entities import name2codepoint if not encoding and not exceptions and (not result_exceptions or result_exceptions is XML_ENTITIES):
try: return replace_all_entities(match.group(), result_exceptions is XML_ENTITIES)
return check(my_unichr(name2codepoint[ent])) return entity_to_unicode_in_python(match, exceptions, encoding, result_exceptions)
except KeyError:
return '&'+ent+';'
_ent_pat = re.compile(r'&(\S+?);')
xml_entity_to_unicode = partial(entity_to_unicode, result_exceptions=XML_ENTITIES) xml_entity_to_unicode = partial(entity_to_unicode, result_exceptions=XML_ENTITIES)
@lru_cache(maxsize=2)
def entity_regex():
    '''
    Return the compiled regular expression used to find ``&name;`` style
    entities in text.

    Group 1 of each match is the entity name without the leading ``&`` and
    the trailing ``;``. The result is cached, so repeated calls hand back
    the same compiled pattern object.
    '''
    entity_pattern = r'&(\S+?);'
    return re.compile(entity_pattern)
def replace_entities(raw, encoding=None):
    '''
    Replace the entities in ``raw`` with the text they stand for.

    :param raw: The text in which entities are to be replaced.
    :param encoding: Passed on to :func:`entity_to_unicode` for every match.
                     When it is None and the compiled ``fast_html_entities``
                     extension can be imported, the whole string is converted
                     by ``replace_all_entities`` in a single call instead.
    :return: ``raw`` with its entities replaced.
    '''
    if encoding is None:
        # Only the import is guarded, so that an ImportError raised from
        # inside the helper itself is not silently swallowed.
        try:
            from calibre_extensions.fast_html_entities import replace_all_entities
        except ImportError:  # Running from source without updated binaries
            pass
        else:
            # The result must be returned: strings are immutable, so calling
            # the helper and dropping its result only wastes the work and
            # then repeats it with the slower regex path below.
            return replace_all_entities(raw)
    return entity_regex().sub(partial(entity_to_unicode, encoding=encoding), raw)
def xml_replace_entities(raw, encoding=None):
    '''
    Replace the entities in ``raw`` using the XML flavour of the conversion,
    i.e. the one performed by :func:`xml_entity_to_unicode`.

    :param raw: The text in which entities are to be replaced.
    :param encoding: Passed on to :func:`xml_entity_to_unicode` for every
                     match. When it is None and the compiled
                     ``fast_html_entities`` extension can be imported, the
                     whole string is converted by ``replace_all_entities``
                     in a single call instead.
    :return: ``raw`` with its entities replaced.
    '''
    if encoding is None:
        # Only the import is guarded, so that an ImportError raised from
        # inside the helper itself is not silently swallowed.
        try:
            from calibre_extensions.fast_html_entities import replace_all_entities
        except ImportError:  # Running from source without updated binaries
            pass
        else:
            # The result must be returned: strings are immutable, so calling
            # the helper and dropping its result only wastes the work and
            # then repeats it with the slower regex path below.
            # True is the same flag entity_to_unicode passes when its
            # result_exceptions is XML_ENTITIES.
            return replace_all_entities(raw, True)
    return entity_regex().sub(partial(xml_entity_to_unicode, encoding=encoding), raw)
def prepare_string_for_xml(raw, attribute=False): def prepare_string_for_xml(raw, attribute=False):
raw = _ent_pat.sub(entity_to_unicode, raw) raw = replace_entities(raw)
raw = raw.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;') raw = raw.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
if attribute: if attribute:
raw = raw.replace('"', '&quot;').replace("'", '&apos;') raw = raw.replace('"', '&quot;').replace("'", '&apos;')

View File

@ -7,11 +7,12 @@ from bs4 import CData, Comment, Declaration, NavigableString, ProcessingInstruct
def parse_html(markup): def parse_html(markup):
from calibre.ebooks.chardet import strip_encoding_declarations, substitute_entites, xml_to_unicode from calibre import xml_replace_entities
from calibre.ebooks.chardet import strip_encoding_declarations, xml_to_unicode
from calibre.utils.cleantext import clean_xml_chars from calibre.utils.cleantext import clean_xml_chars
if isinstance(markup, str): if isinstance(markup, str):
markup = strip_encoding_declarations(markup) markup = strip_encoding_declarations(markup)
markup = substitute_entites(markup) markup = xml_replace_entities(markup)
else: else:
markup = xml_to_unicode(markup, strip_encoding_pats=True, resolve_entities=True)[0] markup = xml_to_unicode(markup, strip_encoding_pats=True, resolve_entities=True)[0]
markup = clean_xml_chars(markup) markup = clean_xml_chars(markup)

View File

@ -9,6 +9,8 @@ import codecs
import re import re
import sys import sys
from calibre import xml_replace_entities
_encoding_pats = ( _encoding_pats = (
# XML declaration # XML declaration
r'<\?[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>', r'<\?[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>',
@ -38,7 +40,6 @@ class LazyEncodingPats:
lazy_encoding_pats = LazyEncodingPats() lazy_encoding_pats = LazyEncodingPats()
ENTITY_PATTERN = re.compile(r'&(\S+?);')
def strip_encoding_declarations(raw, limit=50*1024, preserve_newlines=False): def strip_encoding_declarations(raw, limit=50*1024, preserve_newlines=False):
@ -98,11 +99,6 @@ def find_declared_encoding(raw, limit=50*1024):
return ans return ans
def substitute_entites(raw):
from calibre import xml_entity_to_unicode
return ENTITY_PATTERN.sub(xml_entity_to_unicode, raw)
_CHARSET_ALIASES = {"macintosh" : "mac-roman", "x-sjis" : "shift-jis"} _CHARSET_ALIASES = {"macintosh" : "mac-roman", "x-sjis" : "shift-jis"}
@ -191,6 +187,6 @@ def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
if strip_encoding_pats: if strip_encoding_pats:
raw = strip_encoding_declarations(raw) raw = strip_encoding_declarations(raw)
if resolve_entities: if resolve_entities:
raw = substitute_entites(raw) raw = xml_replace_entities(raw)
return raw, encoding return raw, encoding

View File

@ -4,7 +4,7 @@ __docformat__ = 'restructuredtext en'
import os import os
from calibre import _ent_pat, walk, xml_entity_to_unicode from calibre import walk, xml_replace_entities
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
MD_EXTENSIONS = { MD_EXTENSIONS = {
@ -228,7 +228,7 @@ class TXTInput(InputFormatPlugin):
txt = txt.decode(ienc, 'replace') txt = txt.decode(ienc, 'replace')
# Replace entities # Replace entities
txt = _ent_pat.sub(xml_entity_to_unicode, txt) txt = xml_replace_entities(txt)
# Normalize line endings # Normalize line endings
txt = normalize_line_endings(txt) txt = normalize_line_endings(txt)

View File

@ -9,7 +9,7 @@ import json
import re import re
from math import ceil from math import ceil
from calibre import as_unicode from calibre import as_unicode, entity_regex, xml_replace_entities
from calibre import xml_entity_to_unicode as convert_entities from calibre import xml_entity_to_unicode as convert_entities
XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>') XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
@ -62,7 +62,6 @@ def wrap_lines(match):
def smarten_punctuation(html, log=None): def smarten_punctuation(html, log=None):
from calibre.ebooks.chardet import substitute_entites
from calibre.ebooks.conversion.utils import HeuristicProcessor from calibre.ebooks.conversion.utils import HeuristicProcessor
from calibre.utils.smartypants import smartyPants from calibre.utils.smartypants import smartyPants
preprocessor = HeuristicProcessor(log=log) preprocessor = HeuristicProcessor(log=log)
@ -75,7 +74,7 @@ def smarten_punctuation(html, log=None):
html = smartyPants(html) html = smartyPants(html)
html = html.replace(start, '<!--') html = html.replace(start, '<!--')
html = html.replace(stop, '-->') html = html.replace(stop, '-->')
return substitute_entites(html) return xml_replace_entities(html)
class DocAnalysis: class DocAnalysis:
@ -375,7 +374,7 @@ def html_preprocess_rules():
# Put all sorts of crap into <head>. This messes up lxml # Put all sorts of crap into <head>. This messes up lxml
(re.compile(r'<head[^>]*>(.*?)</head>', re.IGNORECASE|re.DOTALL), sanitize_head), (re.compile(r'<head[^>]*>(.*?)</head>', re.IGNORECASE|re.DOTALL), sanitize_head),
# Convert all entities, since lxml doesn't handle them well # Convert all entities, since lxml doesn't handle them well
(re.compile(r'&(\S+?);'), convert_entities), (entity_regex(), convert_entities),
# Remove the <![if/endif tags inserted by everybody's darling, MS Word # Remove the <![if/endif tags inserted by everybody's darling, MS Word
(re.compile(r'</{0,1}!\[(end){0,1}if\]{0,1}>', re.IGNORECASE), ''), (re.compile(r'</{0,1}!\[(end){0,1}if\]{0,1}>', re.IGNORECASE), ''),
] ]

View File

@ -1,6 +1,7 @@
#!/usr/bin/env python #!/usr/bin/env python
# License: GPLv3 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net> # License: GPLv3 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>
from calibre import my_unichr
html5_entities = { html5_entities = {
# ENTITY_DATA {{{ # ENTITY_DATA {{{
@ -2135,6 +2136,43 @@ html5_entities = {
} }
def entity_to_unicode_in_python(match, exceptions=(), encoding='cp1252', result_exceptions={}):
    '''
    Pure Python conversion of a single matched entity into text.

    :param match: A match object such that '&'+match.group(1)+';' is the entity.
    :param exceptions: Entity names (e.g. 'apos' or '#1234') that are returned
                       unconverted.
    :param encoding: The encoding used to decode numeric entities from 0 to 255.
                     If None, or if the byte cannot be decoded in that
                     encoding, the number is treated as a Unicode code point.
    :param result_exceptions: A mapping from converted text to the text that
                              should be returned in its place. It is only
                              read here, never modified.
    :return: The replacement text, or the entity unchanged when it is listed
             in ``exceptions``, is not a valid number, or is not a known name.
    '''
    def check(ch):
        return result_exceptions.get(ch, ch)

    ent = match.group(1)
    if ent in exceptions:
        return '&' + ent + ';'
    if ent in {'apos', 'squot'}:  # squot is generated by some broken CMS software
        return check("'")
    if ent == 'hellips':
        ent = 'hellip'
    if ent.startswith('#'):
        try:
            if ent[1] in ('x', 'X'):
                num = int(ent[2:], 16)
            else:
                num = int(ent[1:])
        except (ValueError, IndexError):
            # ValueError: not a number. IndexError: the entity is just '&#;'
            return '&' + ent + ';'
        if num < 0:
            # int() accepts a leading minus sign, but a negative value is not
            # a character reference and cannot be turned into a byte below,
            # so leave the text untouched instead of raising
            return '&' + ent + ';'
        if encoding is None or num > 255:
            return check(my_unichr(num))
        try:
            return check(bytes((num,)).decode(encoding))
        except UnicodeDecodeError:
            return check(my_unichr(num))
    # html5_entities is the table defined at the top of this module
    try:
        return check(html5_entities[ent])
    except KeyError:
        pass
    from polyglot.html_entities import name2codepoint
    try:
        return check(my_unichr(name2codepoint[ent]))
    except KeyError:
        return '&' + ent + ';'
def find_tests(): def find_tests():
import unittest import unittest
class TestHTMLEntityReplacement(unittest.TestCase): class TestHTMLEntityReplacement(unittest.TestCase):

View File

@ -8,11 +8,10 @@ import re
import sys import sys
import tempfile import tempfile
from collections import deque from collections import deque
from functools import partial
from itertools import chain from itertools import chain
from math import ceil, floor from math import ceil, floor
from calibre import __appname__, entity_to_unicode, fit_image, force_unicode, preferred_encoding from calibre import __appname__, entity_regex, entity_to_unicode, fit_image, force_unicode, preferred_encoding
from calibre.constants import filesystem_encoding from calibre.constants import filesystem_encoding
from calibre.devices.interface import DevicePlugin as Device from calibre.devices.interface import DevicePlugin as Device
from calibre.ebooks import ConversionError from calibre.ebooks import ConversionError
@ -123,8 +122,7 @@ class HTMLConverter:
re.IGNORECASE), lambda m: '<br />'), re.IGNORECASE), lambda m: '<br />'),
# Replace entities # Replace entities
(re.compile(r'&(\S+?);'), partial(entity_to_unicode, (entity_regex(), entity_to_unicode),
exceptions=['lt', 'gt', 'amp', 'quot'])),
# Remove comments from within style tags as they can mess up BeautifulSoup # Remove comments from within style tags as they can mess up BeautifulSoup
(re.compile(r'(<style.*?</style>)', re.IGNORECASE|re.DOTALL), (re.compile(r'(<style.*?</style>)', re.IGNORECASE|re.DOTALL),
strip_style_comments), strip_style_comments),

View File

@ -7,7 +7,8 @@ import re
import struct import struct
import zlib import zlib
from calibre import entity_to_unicode, prepare_string_for_xml from calibre import prepare_string_for_xml
from calibre.ebooks.html_entities import entity_to_unicode_in_python
from calibre.ebooks.lrf import PRS500_PROFILE, LRFParseError from calibre.ebooks.lrf import PRS500_PROFILE, LRFParseError
from calibre.ebooks.lrf.tags import Tag from calibre.ebooks.lrf.tags import Tag
@ -711,7 +712,7 @@ class Text(LRFStream):
s = str(text, "utf-16-le") s = str(text, "utf-16-le")
if s: if s:
s = s.translate(self.text_map) s = s.translate(self.text_map)
self.content.append(self.entity_pattern.sub(entity_to_unicode, s)) self.content.append(self.entity_pattern.sub(entity_to_unicode_in_python, s))
def end_container(self, tag, stream): def end_container(self, tag, stream):
self.content.append(None) self.content.append(None)

View File

@ -65,7 +65,7 @@ from .pylrf import (
DEFAULT_SOURCE_ENCODING = "cp1252" # default is us-windows character set DEFAULT_SOURCE_ENCODING = "cp1252" # default is us-windows character set
DEFAULT_GENREADING = "fs" # default is yes to both lrf and lrs DEFAULT_GENREADING = "fs" # default is yes to both lrf and lrs
from calibre import __appname__, __version__, entity_to_unicode from calibre import __appname__, __version__, replace_entities
from polyglot.builtins import iteritems, native_string_type, string_or_bytes from polyglot.builtins import iteritems, native_string_type, string_or_bytes
@ -752,7 +752,7 @@ class TableOfContents:
class TocLabel: class TocLabel:
def __init__(self, label, textBlock): def __init__(self, label, textBlock):
self.label = escape(re.sub(r'&(\S+?);', entity_to_unicode, label)) self.label = escape(replace_entities(label))
self.textBlock = textBlock self.textBlock = textBlock
def toElement(self, se): def toElement(self, se):

View File

@ -14,7 +14,7 @@ import textwrap
from lxml import etree, html from lxml import etree, html
from calibre import entity_to_unicode, guess_type, xml_entity_to_unicode from calibre import guess_type, replace_entities, xml_replace_entities
from calibre.ebooks import DRMError, unit_convert from calibre.ebooks import DRMError, unit_convert
from calibre.ebooks.chardet import strip_encoding_declarations from calibre.ebooks.chardet import strip_encoding_declarations
from calibre.ebooks.compression.palmdoc import decompress_doc from calibre.ebooks.compression.palmdoc import decompress_doc
@ -181,8 +181,7 @@ class MobiReader:
self.processed_html) self.processed_html)
self.processed_html = strip_encoding_declarations(self.processed_html) self.processed_html = strip_encoding_declarations(self.processed_html)
self.processed_html = re.sub(r'&(\S+?);', xml_entity_to_unicode, self.processed_html = xml_replace_entities(self.processed_html)
self.processed_html)
image_name_map = self.extract_images(processed_records, output_dir) image_name_map = self.extract_images(processed_records, output_dir)
self.replace_page_breaks() self.replace_page_breaks()
self.cleanup_html() self.cleanup_html()
@ -707,7 +706,6 @@ class MobiReader:
ncx_manifest_entry = 'toc.ncx' ncx_manifest_entry = 'toc.ncx'
elems = root.xpath('//*[@id="%s"]' % toc.partition('#')[-1]) elems = root.xpath('//*[@id="%s"]' % toc.partition('#')[-1])
tocobj = None tocobj = None
ent_pat = re.compile(r'&(\S+?);')
if elems: if elems:
tocobj = TOC() tocobj = TOC()
found = False found = False
@ -724,7 +722,7 @@ class MobiReader:
x.xpath('descendant::text()')]) x.xpath('descendant::text()')])
except: except:
text = '' text = ''
text = ent_pat.sub(entity_to_unicode, text) text = replace_entities(text)
item = tocobj.add_item(toc.partition('#')[0], href[1:], item = tocobj.add_item(toc.partition('#')[0], href[1:],
text) text)
item.left_space = int(self.get_left_whitespace(x)) item.left_space = int(self.get_left_whitespace(x))

View File

@ -19,7 +19,6 @@ from calibre.constants import cache_dir, ismacos
from calibre.customize.conversion import DummyReporter from calibre.customize.conversion import DummyReporter
from calibre.customize.ui import output_profiles from calibre.customize.ui import output_profiles
from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, prettify from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, prettify
from calibre.ebooks.chardet import substitute_entites
from calibre.ebooks.metadata import author_to_author_sort from calibre.ebooks.metadata import author_to_author_sort
from calibre.ebooks.oeb.polish.pretty import pretty_opf, pretty_xml_tree from calibre.ebooks.oeb.polish.pretty import pretty_opf, pretty_xml_tree
from calibre.library.catalogs import AuthorSortMismatchException, EmptyCatalogException, InvalidGenresSourceFieldException from calibre.library.catalogs import AuthorSortMismatchException, EmptyCatalogException, InvalidGenresSourceFieldException
@ -2670,7 +2669,7 @@ class CatalogBuilder:
args[k] = v.decode('utf-8') args[k] = v.decode('utf-8')
generated_html = P('catalog/template.xhtml', generated_html = P('catalog/template.xhtml',
data=True).decode('utf-8').format(**args) data=True).decode('utf-8').format(**args)
generated_html = substitute_entites(generated_html) generated_html = xml_replace_entities(generated_html)
return BeautifulSoup(generated_html) return BeautifulSoup(generated_html)
# Generate the template arguments # Generate the template arguments

View File

@ -12,7 +12,7 @@ import time
import traceback import traceback
from builtins import _ from builtins import _
from calibre import entity_to_unicode, force_unicode, strftime from calibre import force_unicode, replace_entities, strftime
from calibre.utils.cleantext import clean_ascii_chars, clean_xml_chars from calibre.utils.cleantext import clean_ascii_chars, clean_xml_chars
from calibre.utils.date import dt_factory, local_tz, utcnow from calibre.utils.date import dt_factory, local_tz, utcnow
from calibre.utils.logging import default_log from calibre.utils.logging import default_log
@ -30,9 +30,8 @@ class Article:
title = force_unicode(title, 'utf-8') title = force_unicode(title, 'utf-8')
self._title = clean_xml_chars(title).strip() self._title = clean_xml_chars(title).strip()
try: try:
self._title = re.sub(r'&(\S+?);', self._title = replace_entities(self._title)
entity_to_unicode, self._title) except Exception:
except:
pass pass
self._title = clean_ascii_chars(self._title) self._title = clean_ascii_chars(self._title)
self.url = url self.url = url