Speedup HTML entity replacement by delegating to C code

This commit is contained in:
Kovid Goyal 2024-09-13 13:08:22 +05:30
parent f94fbc113a
commit b86ad609b5
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
12 changed files with 95 additions and 77 deletions

View File

@ -8,7 +8,7 @@ import re
import sys
import time
import warnings
from functools import partial
from functools import lru_cache, partial
from math import floor
from polyglot.builtins import codepoint_to_chr, hasenv, native_string_type
@ -457,12 +457,11 @@ XML_ENTITIES = {
'&' : '&'
}
def entity_to_unicode(match, exceptions=(), encoding='cp1252',
result_exceptions={}):
def entity_to_unicode(match, exceptions=(), encoding=None, result_exceptions={}):
'''
:param match: A match object such that '&'+match.group(1)+';' is the entity.
:param exceptions: A list of entities to not convert (Each entry is the name of the entity, e.g. 'apos' or '#1234'
:param exceptions: A list of entities to not convert (Each entry is the name of the entity, e.g. 'apos' or '#1234')
:param encoding: The encoding to use to decode numeric entities between 128 and 256.
If None, the Unicode UCS encoding is used. A common encoding is cp1252.
@ -472,56 +471,46 @@ def entity_to_unicode(match, exceptions=(), encoding='cp1252',
Convenient way to specify exception for things like < or > that can be
specified by various actual entities.
'''
def check(ch):
return result_exceptions.get(ch, ch)
ent = match.group(1)
if ent in exceptions:
return '&'+ent+';'
if ent in {'apos', 'squot'}: # squot is generated by some broken CMS software
return check("'")
if ent == 'hellips':
ent = 'hellip'
if ent.startswith('#'):
try:
if ent[1] in ('x', 'X'):
num = int(ent[2:], 16)
else:
num = int(ent[1:])
except:
return '&'+ent+';'
if encoding is None or num > 255:
return check(my_unichr(num))
try:
return check(bytes(bytearray((num,))).decode(encoding))
except UnicodeDecodeError:
return check(my_unichr(num))
from calibre.ebooks.html_entities import html5_entities
from calibre.ebooks.html_entities import entity_to_unicode_in_python
try:
return check(html5_entities[ent])
except KeyError:
pass
from polyglot.html_entities import name2codepoint
try:
return check(my_unichr(name2codepoint[ent]))
except KeyError:
return '&'+ent+';'
from calibre_extensions.fast_html_entities import replace_all_entities
except ImportError: # Running from source without updated binaries
return entity_to_unicode_in_python(match, exceptions, encoding, result_exceptions)
if not encoding and not exceptions and (not result_exceptions or result_exceptions is XML_ENTITIES):
return replace_all_entities(match.group(), result_exceptions is XML_ENTITIES)
return entity_to_unicode_in_python(match, exceptions, encoding, result_exceptions)
_ent_pat = re.compile(r'&(\S+?);')
xml_entity_to_unicode = partial(entity_to_unicode, result_exceptions=XML_ENTITIES)
def replace_entities(raw, encoding='cp1252'):
return _ent_pat.sub(partial(entity_to_unicode, encoding=encoding), raw)
@lru_cache(maxsize=2)
def entity_regex():
    '''
    Return the compiled regular expression used to locate entities.

    The pattern matches an ampersand, the shortest possible run of
    non-whitespace characters (captured as group 1) and a terminating
    semicolon. The compiled object is cached, so repeated calls return the
    same pattern instance.
    '''
    entity_pattern = r'&(\S+?);'
    return re.compile(entity_pattern)
def xml_replace_entities(raw, encoding='cp1252'):
return _ent_pat.sub(partial(xml_entity_to_unicode, encoding=encoding), raw)
def replace_entities(raw, encoding=None):
    '''
    Replace all entities in ``raw`` with the text they represent.

    :param raw: The text in which entities are to be replaced.
    :param encoding: The encoding used to decode numeric entities between
        128 and 256. When None, the compiled ``fast_html_entities`` extension
        is used if it can be imported.
    :return: ``raw`` with its entities replaced.
    '''
    if encoding is None:
        # Only the import is guarded: an ImportError here just means the
        # compiled extension is unavailable and the regex fallback is used.
        try:
            from calibre_extensions.fast_html_entities import replace_all_entities
        except ImportError:  # Running from source without updated binaries
            pass
        else:
            # Return the converted text. Previously the result was discarded
            # and the slower regex based replacement below always ran as well.
            return replace_all_entities(raw)
    return entity_regex().sub(partial(entity_to_unicode, encoding=encoding), raw)
def xml_replace_entities(raw, encoding=None):
    '''
    Replace all entities in ``raw``, using the XML flavour of the conversion.

    :param raw: The text in which entities are to be replaced.
    :param encoding: The encoding used to decode numeric entities between
        128 and 256. When None, the compiled ``fast_html_entities`` extension
        is used if it can be imported.
    :return: ``raw`` with its entities replaced.
    '''
    if encoding is None:
        # Only the import is guarded: an ImportError here just means the
        # compiled extension is unavailable and the regex fallback is used.
        try:
            from calibre_extensions.fast_html_entities import replace_all_entities
        except ImportError:  # Running from source without updated binaries
            pass
        else:
            # The second argument presumably asks the extension to keep XML
            # special characters escaped, mirroring xml_entity_to_unicode --
            # confirm against the extension. The result is returned; it used
            # to be discarded, so the regex fallback always ran as well.
            return replace_all_entities(raw, True)
    return entity_regex().sub(partial(xml_entity_to_unicode, encoding=encoding), raw)
def prepare_string_for_xml(raw, attribute=False):
raw = _ent_pat.sub(entity_to_unicode, raw)
raw = replace_entities(raw)
raw = raw.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
if attribute:
raw = raw.replace('"', '&quot;').replace("'", '&apos;')

View File

@ -7,11 +7,12 @@ from bs4 import CData, Comment, Declaration, NavigableString, ProcessingInstruct
def parse_html(markup):
from calibre.ebooks.chardet import strip_encoding_declarations, substitute_entites, xml_to_unicode
from calibre import xml_replace_entities
from calibre.ebooks.chardet import strip_encoding_declarations, xml_to_unicode
from calibre.utils.cleantext import clean_xml_chars
if isinstance(markup, str):
markup = strip_encoding_declarations(markup)
markup = substitute_entites(markup)
markup = xml_replace_entities(markup)
else:
markup = xml_to_unicode(markup, strip_encoding_pats=True, resolve_entities=True)[0]
markup = clean_xml_chars(markup)

View File

@ -9,6 +9,8 @@ import codecs
import re
import sys
from calibre import xml_replace_entities
_encoding_pats = (
# XML declaration
r'<\?[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>',
@ -38,7 +40,6 @@ class LazyEncodingPats:
lazy_encoding_pats = LazyEncodingPats()
ENTITY_PATTERN = re.compile(r'&(\S+?);')
def strip_encoding_declarations(raw, limit=50*1024, preserve_newlines=False):
@ -98,11 +99,6 @@ def find_declared_encoding(raw, limit=50*1024):
return ans
def substitute_entites(raw):
from calibre import xml_entity_to_unicode
return ENTITY_PATTERN.sub(xml_entity_to_unicode, raw)
_CHARSET_ALIASES = {"macintosh" : "mac-roman", "x-sjis" : "shift-jis"}
@ -191,6 +187,6 @@ def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
if strip_encoding_pats:
raw = strip_encoding_declarations(raw)
if resolve_entities:
raw = substitute_entites(raw)
raw = xml_replace_entities(raw)
return raw, encoding

View File

@ -4,7 +4,7 @@ __docformat__ = 'restructuredtext en'
import os
from calibre import _ent_pat, walk, xml_entity_to_unicode
from calibre import walk, xml_replace_entities
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
MD_EXTENSIONS = {
@ -228,7 +228,7 @@ class TXTInput(InputFormatPlugin):
txt = txt.decode(ienc, 'replace')
# Replace entities
txt = _ent_pat.sub(xml_entity_to_unicode, txt)
txt = xml_replace_entities(txt)
# Normalize line endings
txt = normalize_line_endings(txt)

View File

@ -9,7 +9,7 @@ import json
import re
from math import ceil
from calibre import as_unicode
from calibre import as_unicode, entity_regex, xml_replace_entities
from calibre import xml_entity_to_unicode as convert_entities
XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
@ -62,7 +62,6 @@ def wrap_lines(match):
def smarten_punctuation(html, log=None):
from calibre.ebooks.chardet import substitute_entites
from calibre.ebooks.conversion.utils import HeuristicProcessor
from calibre.utils.smartypants import smartyPants
preprocessor = HeuristicProcessor(log=log)
@ -75,7 +74,7 @@ def smarten_punctuation(html, log=None):
html = smartyPants(html)
html = html.replace(start, '<!--')
html = html.replace(stop, '-->')
return substitute_entites(html)
return xml_replace_entities(html)
class DocAnalysis:
@ -375,7 +374,7 @@ def html_preprocess_rules():
# Put all sorts of crap into <head>. This messes up lxml
(re.compile(r'<head[^>]*>(.*?)</head>', re.IGNORECASE|re.DOTALL), sanitize_head),
# Convert all entities, since lxml doesn't handle them well
(re.compile(r'&(\S+?);'), convert_entities),
(entity_regex(), convert_entities),
# Remove the <![if/endif tags inserted by everybody's darling, MS Word
(re.compile(r'</{0,1}!\[(end){0,1}if\]{0,1}>', re.IGNORECASE), ''),
]

View File

@ -1,6 +1,7 @@
#!/usr/bin/env python
# License: GPLv3 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>
from calibre import my_unichr
html5_entities = {
# ENTITY_DATA {{{
@ -2135,6 +2136,43 @@ html5_entities = {
}
def entity_to_unicode_in_python(match, exceptions=(), encoding='cp1252', result_exceptions={}):
    '''
    Pure Python conversion of a single matched entity to the text it represents.

    :param match: A match object such that '&'+match.group(1)+';' is the entity.
    :param exceptions: Entity names (e.g. 'apos' or '#1234') that are returned unconverted.
    :param encoding: The encoding used to decode numeric entities in the range 0-255.
        If None, or if the byte cannot be decoded, the number is converted with my_unichr instead.
    :param result_exceptions: A mapping from a converted character to the text that should be
        returned in its place. It is only read here, never modified.
    :return: The replacement text, or the entity itself when it cannot be converted.
    '''
    def check(ch):
        return result_exceptions.get(ch, ch)

    ent = match.group(1)
    if ent in exceptions:
        return '&'+ent+';'
    if ent in {'apos', 'squot'}:  # squot is generated by some broken CMS software
        return check("'")
    if ent == 'hellips':
        ent = 'hellip'
    if ent.startswith('#'):
        try:
            if ent[1] in ('x', 'X'):
                num = int(ent[2:], 16)
            else:
                num = int(ent[1:])
        except (ValueError, IndexError):
            # IndexError: a bare '#'. ValueError: the digits are not a valid number.
            return '&'+ent+';'
        # Only values that fit in a single byte can be decoded with ``encoding``.
        # Negative numbers (e.g. '#-5') would make bytearray() raise ValueError,
        # so they take the same path as numbers above 255.
        if encoding is None or not 0 <= num <= 255:
            return check(my_unichr(num))
        try:
            return check(bytes(bytearray((num,))).decode(encoding))
        except UnicodeDecodeError:
            return check(my_unichr(num))
    # html5_entities is the module level table defined in this file, so it
    # does not need to be imported again here.
    try:
        return check(html5_entities[ent])
    except KeyError:
        pass
    from polyglot.html_entities import name2codepoint
    try:
        return check(my_unichr(name2codepoint[ent]))
    except KeyError:
        return '&'+ent+';'
def find_tests():
import unittest
class TestHTMLEntityReplacement(unittest.TestCase):

View File

@ -8,11 +8,10 @@ import re
import sys
import tempfile
from collections import deque
from functools import partial
from itertools import chain
from math import ceil, floor
from calibre import __appname__, entity_to_unicode, fit_image, force_unicode, preferred_encoding
from calibre import __appname__, entity_regex, entity_to_unicode, fit_image, force_unicode, preferred_encoding
from calibre.constants import filesystem_encoding
from calibre.devices.interface import DevicePlugin as Device
from calibre.ebooks import ConversionError
@ -123,8 +122,7 @@ class HTMLConverter:
re.IGNORECASE), lambda m: '<br />'),
# Replace entities
(re.compile(r'&(\S+?);'), partial(entity_to_unicode,
exceptions=['lt', 'gt', 'amp', 'quot'])),
(entity_regex(), entity_to_unicode),
# Remove comments from within style tags as they can mess up BeatifulSoup
(re.compile(r'(<style.*?</style>)', re.IGNORECASE|re.DOTALL),
strip_style_comments),

View File

@ -7,7 +7,8 @@ import re
import struct
import zlib
from calibre import entity_to_unicode, prepare_string_for_xml
from calibre import prepare_string_for_xml
from calibre.ebooks.html_entities import entity_to_unicode_in_python
from calibre.ebooks.lrf import PRS500_PROFILE, LRFParseError
from calibre.ebooks.lrf.tags import Tag
@ -711,7 +712,7 @@ class Text(LRFStream):
s = str(text, "utf-16-le")
if s:
s = s.translate(self.text_map)
self.content.append(self.entity_pattern.sub(entity_to_unicode, s))
self.content.append(self.entity_pattern.sub(entity_to_unicode_in_python, s))
def end_container(self, tag, stream):
self.content.append(None)

View File

@ -65,7 +65,7 @@ from .pylrf import (
DEFAULT_SOURCE_ENCODING = "cp1252" # default is us-windows character set
DEFAULT_GENREADING = "fs" # default is yes to both lrf and lrs
from calibre import __appname__, __version__, entity_to_unicode
from calibre import __appname__, __version__, replace_entities
from polyglot.builtins import iteritems, native_string_type, string_or_bytes
@ -752,7 +752,7 @@ class TableOfContents:
class TocLabel:
def __init__(self, label, textBlock):
self.label = escape(re.sub(r'&(\S+?);', entity_to_unicode, label))
self.label = escape(replace_entities(label))
self.textBlock = textBlock
def toElement(self, se):

View File

@ -14,7 +14,7 @@ import textwrap
from lxml import etree, html
from calibre import entity_to_unicode, guess_type, xml_entity_to_unicode
from calibre import guess_type, replace_entities, xml_replace_entities
from calibre.ebooks import DRMError, unit_convert
from calibre.ebooks.chardet import strip_encoding_declarations
from calibre.ebooks.compression.palmdoc import decompress_doc
@ -181,8 +181,7 @@ class MobiReader:
self.processed_html)
self.processed_html = strip_encoding_declarations(self.processed_html)
self.processed_html = re.sub(r'&(\S+?);', xml_entity_to_unicode,
self.processed_html)
self.processed_html = xml_replace_entities(self.processed_html)
image_name_map = self.extract_images(processed_records, output_dir)
self.replace_page_breaks()
self.cleanup_html()
@ -707,7 +706,6 @@ class MobiReader:
ncx_manifest_entry = 'toc.ncx'
elems = root.xpath('//*[@id="%s"]' % toc.partition('#')[-1])
tocobj = None
ent_pat = re.compile(r'&(\S+?);')
if elems:
tocobj = TOC()
found = False
@ -724,7 +722,7 @@ class MobiReader:
x.xpath('descendant::text()')])
except:
text = ''
text = ent_pat.sub(entity_to_unicode, text)
text = replace_entities(text)
item = tocobj.add_item(toc.partition('#')[0], href[1:],
text)
item.left_space = int(self.get_left_whitespace(x))

View File

@ -19,7 +19,6 @@ from calibre.constants import cache_dir, ismacos
from calibre.customize.conversion import DummyReporter
from calibre.customize.ui import output_profiles
from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, prettify
from calibre.ebooks.chardet import substitute_entites
from calibre.ebooks.metadata import author_to_author_sort
from calibre.ebooks.oeb.polish.pretty import pretty_opf, pretty_xml_tree
from calibre.library.catalogs import AuthorSortMismatchException, EmptyCatalogException, InvalidGenresSourceFieldException
@ -2670,7 +2669,7 @@ class CatalogBuilder:
args[k] = v.decode('utf-8')
generated_html = P('catalog/template.xhtml',
data=True).decode('utf-8').format(**args)
generated_html = substitute_entites(generated_html)
generated_html = xml_replace_entities(generated_html)
return BeautifulSoup(generated_html)
# Generate the template arguments

View File

@ -12,7 +12,7 @@ import time
import traceback
from builtins import _
from calibre import entity_to_unicode, force_unicode, strftime
from calibre import force_unicode, replace_entities, strftime
from calibre.utils.cleantext import clean_ascii_chars, clean_xml_chars
from calibre.utils.date import dt_factory, local_tz, utcnow
from calibre.utils.logging import default_log
@ -30,9 +30,8 @@ class Article:
title = force_unicode(title, 'utf-8')
self._title = clean_xml_chars(title).strip()
try:
self._title = re.sub(r'&(\S+?);',
entity_to_unicode, self._title)
except:
self._title = replace_entities(self._title)
except Exception:
pass
self._title = clean_ascii_chars(self._title)
self.url = url