diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py index 5eaf6f69b4..41c650fd88 100644 --- a/src/calibre/__init__.py +++ b/src/calibre/__init__.py @@ -8,7 +8,7 @@ import re import sys import time import warnings -from functools import partial +from functools import lru_cache, partial from math import floor from polyglot.builtins import codepoint_to_chr, hasenv, native_string_type @@ -457,12 +457,11 @@ XML_ENTITIES = { '&' : '&' } -def entity_to_unicode(match, exceptions=(), encoding='cp1252', - result_exceptions={}): +def entity_to_unicode(match, exceptions=(), encoding=None, result_exceptions={}): ''' :param match: A match object such that '&'+match.group(1)';' is the entity. - :param exceptions: A list of entities to not convert (Each entry is the name of the entity, e.g. 'apos' or '#1234' + :param exceptions: A list of entities to not convert (Each entry is the name of the entity, e.g. 'apos' or '#1234)' :param encoding: The encoding to use to decode numeric entities between 128 and 256. If None, the Unicode UCS encoding is used. A common encoding is cp1252. @@ -472,56 +471,46 @@ def entity_to_unicode(match, exceptions=(), encoding='cp1252', Convenient way to specify exception for things like < or > that can be specified by various actual entities. ''' - def check(ch): - return result_exceptions.get(ch, ch) - - ent = match.group(1) - if ent in exceptions: - return '&'+ent+';' - if ent in {'apos', 'squot'}: # squot is generated by some broken CMS software - return check("'") - if ent == 'hellips': - ent = 'hellip' - if ent.startswith('#'): - try: - if ent[1] in ('x', 'X'): - num = int(ent[2:], 16) - else: - num = int(ent[1:]) - except: - return '&'+ent+';' - if encoding is None or num > 255: - return check(my_unichr(num)) - try: - return check(bytes(bytearray((num,))).decode(encoding)) - except UnicodeDecodeError: - return check(my_unichr(num)) - from calibre.ebooks.html_entities import html5_entities + from calibre.ebooks.html_entities import entity_to_unicode_in_python try: - return check(html5_entities[ent]) - except KeyError: - pass - from polyglot.html_entities import name2codepoint - try: - return check(my_unichr(name2codepoint[ent])) - except KeyError: - return '&'+ent+';' + from calibre_extensions.fast_html_entities import replace_all_entities + except ImportError: # Running from source without updated binaries + return entity_to_unicode_in_python(match, exceptions, encoding, result_exceptions) + if not encoding and not exceptions and (not result_exceptions or result_exceptions is XML_ENTITIES): + return replace_all_entities(match.group(), result_exceptions is XML_ENTITIES) + return entity_to_unicode_in_python(match, exceptions, encoding, result_exceptions) -_ent_pat = re.compile(r'&(\S+?);') xml_entity_to_unicode = partial(entity_to_unicode, result_exceptions=XML_ENTITIES) -def replace_entities(raw, encoding='cp1252'): - return _ent_pat.sub(partial(entity_to_unicode, encoding=encoding), raw) +@lru_cache(2) +def entity_regex(): + return re.compile(r'&(\S+?);') -def xml_replace_entities(raw, encoding='cp1252'): - return _ent_pat.sub(partial(xml_entity_to_unicode, encoding=encoding), raw) +def replace_entities(raw, encoding=None): + if encoding is None: + try: + from calibre_extensions.fast_html_entities import replace_all_entities + replace_all_entities(raw) + except ImportError: # Running from source without updated binaries + pass + return entity_regex().sub(partial(entity_to_unicode, encoding=encoding), raw) + + +def xml_replace_entities(raw, encoding=None): + if encoding is None: + try: + from calibre_extensions.fast_html_entities import replace_all_entities + replace_all_entities(raw, True) + except ImportError: # Running from source without updated binaries + pass + return entity_regex().sub(partial(xml_entity_to_unicode, encoding=encoding), raw) def prepare_string_for_xml(raw, attribute=False): - raw = _ent_pat.sub(entity_to_unicode, raw) + raw = replace_entities(raw) raw = raw.replace('&', '&').replace('<', '<').replace('>', '>') if attribute: raw = raw.replace('"', '"').replace("'", ''') diff --git a/src/calibre/ebooks/BeautifulSoup.py b/src/calibre/ebooks/BeautifulSoup.py index 8f9d951f73..98c4297575 100644 --- a/src/calibre/ebooks/BeautifulSoup.py +++ b/src/calibre/ebooks/BeautifulSoup.py @@ -7,11 +7,12 @@ from bs4 import CData, Comment, Declaration, NavigableString, ProcessingInstruct def parse_html(markup): - from calibre.ebooks.chardet import strip_encoding_declarations, substitute_entites, xml_to_unicode + from calibre import xml_replace_entities + from calibre.ebooks.chardet import strip_encoding_declarations, xml_to_unicode from calibre.utils.cleantext import clean_xml_chars if isinstance(markup, str): markup = strip_encoding_declarations(markup) - markup = substitute_entites(markup) + markup = xml_replace_entities(markup) else: markup = xml_to_unicode(markup, strip_encoding_pats=True, resolve_entities=True)[0] markup = clean_xml_chars(markup) diff --git a/src/calibre/ebooks/chardet.py b/src/calibre/ebooks/chardet.py index 7d016c8bda..682f32f39f 100644 --- a/src/calibre/ebooks/chardet.py +++ b/src/calibre/ebooks/chardet.py @@ -9,6 +9,8 @@ import codecs import re import sys +from calibre import xml_replace_entities + _encoding_pats = ( # XML declaration r'<\?[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>', @@ -38,7 +40,6 @@ class LazyEncodingPats: lazy_encoding_pats = LazyEncodingPats() -ENTITY_PATTERN = re.compile(r'&(\S+?);') def strip_encoding_declarations(raw, limit=50*1024, preserve_newlines=False): @@ -98,11 +99,6 @@ def find_declared_encoding(raw, limit=50*1024): return ans -def substitute_entites(raw): - from calibre import xml_entity_to_unicode - return ENTITY_PATTERN.sub(xml_entity_to_unicode, raw) - - _CHARSET_ALIASES = {"macintosh" : "mac-roman", "x-sjis" : "shift-jis"} @@ -191,6 +187,6 @@ def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False, if strip_encoding_pats: raw = strip_encoding_declarations(raw) if resolve_entities: - raw = substitute_entites(raw) + raw = xml_replace_entities(raw) return raw, encoding diff --git a/src/calibre/ebooks/conversion/plugins/txt_input.py b/src/calibre/ebooks/conversion/plugins/txt_input.py index 692b8c7e67..048a02c316 100644 --- a/src/calibre/ebooks/conversion/plugins/txt_input.py +++ b/src/calibre/ebooks/conversion/plugins/txt_input.py @@ -4,7 +4,7 @@ __docformat__ = 'restructuredtext en' import os -from calibre import _ent_pat, walk, xml_entity_to_unicode +from calibre import walk, xml_replace_entities from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation MD_EXTENSIONS = { @@ -228,7 +228,7 @@ class TXTInput(InputFormatPlugin): txt = txt.decode(ienc, 'replace') # Replace entities - txt = _ent_pat.sub(xml_entity_to_unicode, txt) + txt = xml_replace_entities(txt) # Normalize line endings txt = normalize_line_endings(txt) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 9002b4aee3..83a8a9dadd 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -9,7 +9,7 @@ import json import re from math import ceil -from calibre import as_unicode +from calibre import as_unicode, entity_regex, xml_replace_entities from calibre import xml_entity_to_unicode as convert_entities XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>') @@ -62,7 +62,6 @@ def wrap_lines(match): def smarten_punctuation(html, log=None): - from calibre.ebooks.chardet import substitute_entites from calibre.ebooks.conversion.utils import HeuristicProcessor from calibre.utils.smartypants import smartyPants preprocessor = HeuristicProcessor(log=log) @@ -75,7 +74,7 @@ def smarten_punctuation(html, log=None): html = smartyPants(html) html = html.replace(start, '') - return substitute_entites(html) + return xml_replace_entities(html) class DocAnalysis: @@ -375,7 +374,7 @@ def html_preprocess_rules(): # Put all sorts of crap into
. This messes up lxml (re.compile(r']*>(.*?)', re.IGNORECASE|re.DOTALL), sanitize_head), # Convert all entities, since lxml doesn't handle them well - (re.compile(r'&(\S+?);'), convert_entities), + (entity_regex(), convert_entities), # Remove the ', re.IGNORECASE), ''), ] diff --git a/src/calibre/ebooks/html_entities.py b/src/calibre/ebooks/html_entities.py index 239f09e2f5..6817ac03c8 100644 --- a/src/calibre/ebooks/html_entities.py +++ b/src/calibre/ebooks/html_entities.py @@ -1,6 +1,7 @@ #!/usr/bin/env python # License: GPLv3 Copyright: 2017, Kovid Goyal