diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py index 5eaf6f69b4..41c650fd88 100644 --- a/src/calibre/__init__.py +++ b/src/calibre/__init__.py @@ -8,7 +8,7 @@ import re import sys import time import warnings -from functools import partial +from functools import lru_cache, partial from math import floor from polyglot.builtins import codepoint_to_chr, hasenv, native_string_type @@ -457,12 +457,11 @@ XML_ENTITIES = { '&' : '&' } -def entity_to_unicode(match, exceptions=(), encoding='cp1252', - result_exceptions={}): +def entity_to_unicode(match, exceptions=(), encoding=None, result_exceptions={}): ''' :param match: A match object such that '&'+match.group(1)';' is the entity. - :param exceptions: A list of entities to not convert (Each entry is the name of the entity, e.g. 'apos' or '#1234' + :param exceptions: A list of entities to not convert (Each entry is the name of the entity, e.g. 'apos' or '#1234)' :param encoding: The encoding to use to decode numeric entities between 128 and 256. If None, the Unicode UCS encoding is used. A common encoding is cp1252. @@ -472,56 +471,46 @@ def entity_to_unicode(match, exceptions=(), encoding='cp1252', Convenient way to specify exception for things like < or > that can be specified by various actual entities. ''' - def check(ch): - return result_exceptions.get(ch, ch) - - ent = match.group(1) - if ent in exceptions: - return '&'+ent+';' - if ent in {'apos', 'squot'}: # squot is generated by some broken CMS software - return check("'") - if ent == 'hellips': - ent = 'hellip' - if ent.startswith('#'): - try: - if ent[1] in ('x', 'X'): - num = int(ent[2:], 16) - else: - num = int(ent[1:]) - except: - return '&'+ent+';' - if encoding is None or num > 255: - return check(my_unichr(num)) - try: - return check(bytes(bytearray((num,))).decode(encoding)) - except UnicodeDecodeError: - return check(my_unichr(num)) - from calibre.ebooks.html_entities import html5_entities + from calibre.ebooks.html_entities import entity_to_unicode_in_python try: - return check(html5_entities[ent]) - except KeyError: - pass - from polyglot.html_entities import name2codepoint - try: - return check(my_unichr(name2codepoint[ent])) - except KeyError: - return '&'+ent+';' + from calibre_extensions.fast_html_entities import replace_all_entities + except ImportError: # Running from source without updated binaries + return entity_to_unicode_in_python(match, exceptions, encoding, result_exceptions) + if not encoding and not exceptions and (not result_exceptions or result_exceptions is XML_ENTITIES): + return replace_all_entities(match.group(), result_exceptions is XML_ENTITIES) + return entity_to_unicode_in_python(match, exceptions, encoding, result_exceptions) -_ent_pat = re.compile(r'&(\S+?);') xml_entity_to_unicode = partial(entity_to_unicode, result_exceptions=XML_ENTITIES) -def replace_entities(raw, encoding='cp1252'): - return _ent_pat.sub(partial(entity_to_unicode, encoding=encoding), raw) +@lru_cache(2) +def entity_regex(): + return re.compile(r'&(\S+?);') -def xml_replace_entities(raw, encoding='cp1252'): - return _ent_pat.sub(partial(xml_entity_to_unicode, encoding=encoding), raw) +def replace_entities(raw, encoding=None): + if encoding is None: + try: + from calibre_extensions.fast_html_entities import replace_all_entities + replace_all_entities(raw) + except ImportError: # Running from source without updated binaries + pass + return entity_regex().sub(partial(entity_to_unicode, encoding=encoding), raw) + + +def xml_replace_entities(raw, encoding=None): + if encoding is None: + try: + from calibre_extensions.fast_html_entities import replace_all_entities + replace_all_entities(raw, True) + except ImportError: # Running from source without updated binaries + pass + return entity_regex().sub(partial(xml_entity_to_unicode, encoding=encoding), raw) def prepare_string_for_xml(raw, attribute=False): - raw = _ent_pat.sub(entity_to_unicode, raw) + raw = replace_entities(raw) raw = raw.replace('&', '&').replace('<', '<').replace('>', '>') if attribute: raw = raw.replace('"', '"').replace("'", ''') diff --git a/src/calibre/ebooks/BeautifulSoup.py b/src/calibre/ebooks/BeautifulSoup.py index 8f9d951f73..98c4297575 100644 --- a/src/calibre/ebooks/BeautifulSoup.py +++ b/src/calibre/ebooks/BeautifulSoup.py @@ -7,11 +7,12 @@ from bs4 import CData, Comment, Declaration, NavigableString, ProcessingInstruct def parse_html(markup): - from calibre.ebooks.chardet import strip_encoding_declarations, substitute_entites, xml_to_unicode + from calibre import xml_replace_entities + from calibre.ebooks.chardet import strip_encoding_declarations, xml_to_unicode from calibre.utils.cleantext import clean_xml_chars if isinstance(markup, str): markup = strip_encoding_declarations(markup) - markup = substitute_entites(markup) + markup = xml_replace_entities(markup) else: markup = xml_to_unicode(markup, strip_encoding_pats=True, resolve_entities=True)[0] markup = clean_xml_chars(markup) diff --git a/src/calibre/ebooks/chardet.py b/src/calibre/ebooks/chardet.py index 7d016c8bda..682f32f39f 100644 --- a/src/calibre/ebooks/chardet.py +++ b/src/calibre/ebooks/chardet.py @@ -9,6 +9,8 @@ import codecs import re import sys +from calibre import xml_replace_entities + _encoding_pats = ( # XML declaration r'<\?[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>', @@ -38,7 +40,6 @@ class LazyEncodingPats: lazy_encoding_pats = LazyEncodingPats() -ENTITY_PATTERN = re.compile(r'&(\S+?);') def strip_encoding_declarations(raw, limit=50*1024, preserve_newlines=False): @@ -98,11 +99,6 @@ def find_declared_encoding(raw, limit=50*1024): return ans -def substitute_entites(raw): - from calibre import xml_entity_to_unicode - return ENTITY_PATTERN.sub(xml_entity_to_unicode, raw) - - _CHARSET_ALIASES = {"macintosh" : "mac-roman", "x-sjis" : "shift-jis"} @@ -191,6 +187,6 @@ def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False, if strip_encoding_pats: raw = strip_encoding_declarations(raw) if resolve_entities: - raw = substitute_entites(raw) + raw = xml_replace_entities(raw) return raw, encoding diff --git a/src/calibre/ebooks/conversion/plugins/txt_input.py b/src/calibre/ebooks/conversion/plugins/txt_input.py index 692b8c7e67..048a02c316 100644 --- a/src/calibre/ebooks/conversion/plugins/txt_input.py +++ b/src/calibre/ebooks/conversion/plugins/txt_input.py @@ -4,7 +4,7 @@ __docformat__ = 'restructuredtext en' import os -from calibre import _ent_pat, walk, xml_entity_to_unicode +from calibre import walk, xml_replace_entities from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation MD_EXTENSIONS = { @@ -228,7 +228,7 @@ class TXTInput(InputFormatPlugin): txt = txt.decode(ienc, 'replace') # Replace entities - txt = _ent_pat.sub(xml_entity_to_unicode, txt) + txt = xml_replace_entities(txt) # Normalize line endings txt = normalize_line_endings(txt) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 9002b4aee3..83a8a9dadd 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -9,7 +9,7 @@ import json import re from math import ceil -from calibre import as_unicode +from calibre import as_unicode, entity_regex, xml_replace_entities from calibre import xml_entity_to_unicode as convert_entities XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>') @@ -62,7 +62,6 @@ def wrap_lines(match): def smarten_punctuation(html, log=None): - from calibre.ebooks.chardet import substitute_entites from calibre.ebooks.conversion.utils import HeuristicProcessor from calibre.utils.smartypants import smartyPants preprocessor = HeuristicProcessor(log=log) @@ -75,7 +74,7 @@ def smarten_punctuation(html, log=None): html = smartyPants(html) html = html.replace(start, '') - return substitute_entites(html) + return xml_replace_entities(html) class DocAnalysis: @@ -375,7 +374,7 @@ def html_preprocess_rules(): # Put all sorts of crap into . This messes up lxml (re.compile(r']*>(.*?)', re.IGNORECASE|re.DOTALL), sanitize_head), # Convert all entities, since lxml doesn't handle them well - (re.compile(r'&(\S+?);'), convert_entities), + (entity_regex(), convert_entities), # Remove the ', re.IGNORECASE), ''), ] diff --git a/src/calibre/ebooks/html_entities.py b/src/calibre/ebooks/html_entities.py index 239f09e2f5..6817ac03c8 100644 --- a/src/calibre/ebooks/html_entities.py +++ b/src/calibre/ebooks/html_entities.py @@ -1,6 +1,7 @@ #!/usr/bin/env python # License: GPLv3 Copyright: 2017, Kovid Goyal +from calibre import my_unichr html5_entities = { # ENTITY_DATA {{{ @@ -2135,6 +2136,43 @@ html5_entities = { } +def entity_to_unicode_in_python(match, exceptions=(), encoding='cp1252', result_exceptions={}): + def check(ch): + return result_exceptions.get(ch, ch) + + ent = match.group(1) + if ent in exceptions: + return '&'+ent+';' + if ent in {'apos', 'squot'}: # squot is generated by some broken CMS software + return check("'") + if ent == 'hellips': + ent = 'hellip' + if ent.startswith('#'): + try: + if ent[1] in ('x', 'X'): + num = int(ent[2:], 16) + else: + num = int(ent[1:]) + except: + return '&'+ent+';' + if encoding is None or num > 255: + return check(my_unichr(num)) + try: + return check(bytes(bytearray((num,))).decode(encoding)) + except UnicodeDecodeError: + return check(my_unichr(num)) + from calibre.ebooks.html_entities import html5_entities + try: + return check(html5_entities[ent]) + except KeyError: + pass + from polyglot.html_entities import name2codepoint + try: + return check(my_unichr(name2codepoint[ent])) + except KeyError: + return '&'+ent+';' + + def find_tests(): import unittest class TestHTMLEntityReplacement(unittest.TestCase): diff --git a/src/calibre/ebooks/lrf/html/convert_from.py b/src/calibre/ebooks/lrf/html/convert_from.py index b0c6aa7351..c4169d9a32 100644 --- a/src/calibre/ebooks/lrf/html/convert_from.py +++ b/src/calibre/ebooks/lrf/html/convert_from.py @@ -8,11 +8,10 @@ import re import sys import tempfile from collections import deque -from functools import partial from itertools import chain from math import ceil, floor -from calibre import __appname__, entity_to_unicode, fit_image, force_unicode, preferred_encoding +from calibre import __appname__, entity_regex, entity_to_unicode, fit_image, force_unicode, preferred_encoding from calibre.constants import filesystem_encoding from calibre.devices.interface import DevicePlugin as Device from calibre.ebooks import ConversionError @@ -123,8 +122,7 @@ class HTMLConverter: re.IGNORECASE), lambda m: '
'), # Replace entities - (re.compile(r'&(\S+?);'), partial(entity_to_unicode, - exceptions=['lt', 'gt', 'amp', 'quot'])), + (entity_regex(), entity_to_unicode), # Remove comments from within style tags as they can mess up BeatifulSoup (re.compile(r'()', re.IGNORECASE|re.DOTALL), strip_style_comments), diff --git a/src/calibre/ebooks/lrf/objects.py b/src/calibre/ebooks/lrf/objects.py index 63f02b9d7f..9f9f0d92c6 100644 --- a/src/calibre/ebooks/lrf/objects.py +++ b/src/calibre/ebooks/lrf/objects.py @@ -7,7 +7,8 @@ import re import struct import zlib -from calibre import entity_to_unicode, prepare_string_for_xml +from calibre import prepare_string_for_xml +from calibre.ebooks.html_entities import entity_to_unicode_in_python from calibre.ebooks.lrf import PRS500_PROFILE, LRFParseError from calibre.ebooks.lrf.tags import Tag @@ -711,7 +712,7 @@ class Text(LRFStream): s = str(text, "utf-16-le") if s: s = s.translate(self.text_map) - self.content.append(self.entity_pattern.sub(entity_to_unicode, s)) + self.content.append(self.entity_pattern.sub(entity_to_unicode_in_python, s)) def end_container(self, tag, stream): self.content.append(None) diff --git a/src/calibre/ebooks/lrf/pylrs/pylrs.py b/src/calibre/ebooks/lrf/pylrs/pylrs.py index d110f90af1..e6956e14c7 100644 --- a/src/calibre/ebooks/lrf/pylrs/pylrs.py +++ b/src/calibre/ebooks/lrf/pylrs/pylrs.py @@ -65,7 +65,7 @@ from .pylrf import ( DEFAULT_SOURCE_ENCODING = "cp1252" # default is us-windows character set DEFAULT_GENREADING = "fs" # default is yes to both lrf and lrs -from calibre import __appname__, __version__, entity_to_unicode +from calibre import __appname__, __version__, replace_entities from polyglot.builtins import iteritems, native_string_type, string_or_bytes @@ -752,7 +752,7 @@ class TableOfContents: class TocLabel: def __init__(self, label, textBlock): - self.label = escape(re.sub(r'&(\S+?);', entity_to_unicode, label)) + self.label = escape(replace_entities(label)) self.textBlock = textBlock def toElement(self, se): diff --git a/src/calibre/ebooks/mobi/reader/mobi6.py b/src/calibre/ebooks/mobi/reader/mobi6.py index 12fc7cd6bf..b6051136d8 100644 --- a/src/calibre/ebooks/mobi/reader/mobi6.py +++ b/src/calibre/ebooks/mobi/reader/mobi6.py @@ -14,7 +14,7 @@ import textwrap from lxml import etree, html -from calibre import entity_to_unicode, guess_type, xml_entity_to_unicode +from calibre import guess_type, replace_entities, xml_replace_entities from calibre.ebooks import DRMError, unit_convert from calibre.ebooks.chardet import strip_encoding_declarations from calibre.ebooks.compression.palmdoc import decompress_doc @@ -181,8 +181,7 @@ class MobiReader: self.processed_html) self.processed_html = strip_encoding_declarations(self.processed_html) - self.processed_html = re.sub(r'&(\S+?);', xml_entity_to_unicode, - self.processed_html) + self.processed_html = xml_replace_entities(self.processed_html) image_name_map = self.extract_images(processed_records, output_dir) self.replace_page_breaks() self.cleanup_html() @@ -707,7 +706,6 @@ class MobiReader: ncx_manifest_entry = 'toc.ncx' elems = root.xpath('//*[@id="%s"]' % toc.partition('#')[-1]) tocobj = None - ent_pat = re.compile(r'&(\S+?);') if elems: tocobj = TOC() found = False @@ -724,7 +722,7 @@ class MobiReader: x.xpath('descendant::text()')]) except: text = '' - text = ent_pat.sub(entity_to_unicode, text) + text = replace_entities(text) item = tocobj.add_item(toc.partition('#')[0], href[1:], text) item.left_space = int(self.get_left_whitespace(x)) diff --git a/src/calibre/library/catalogs/epub_mobi_builder.py b/src/calibre/library/catalogs/epub_mobi_builder.py index 246e7d835e..d1ef6c5974 100644 --- a/src/calibre/library/catalogs/epub_mobi_builder.py +++ b/src/calibre/library/catalogs/epub_mobi_builder.py @@ -19,7 +19,6 @@ from calibre.constants import cache_dir, ismacos from calibre.customize.conversion import DummyReporter from calibre.customize.ui import output_profiles from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, prettify -from calibre.ebooks.chardet import substitute_entites from calibre.ebooks.metadata import author_to_author_sort from calibre.ebooks.oeb.polish.pretty import pretty_opf, pretty_xml_tree from calibre.library.catalogs import AuthorSortMismatchException, EmptyCatalogException, InvalidGenresSourceFieldException @@ -2670,7 +2669,7 @@ class CatalogBuilder: args[k] = v.decode('utf-8') generated_html = P('catalog/template.xhtml', data=True).decode('utf-8').format(**args) - generated_html = substitute_entites(generated_html) + generated_html = xml_replace_entities(generated_html) return BeautifulSoup(generated_html) # Generate the template arguments diff --git a/src/calibre/web/feeds/__init__.py b/src/calibre/web/feeds/__init__.py index adb7d69c3c..5d2cad1846 100644 --- a/src/calibre/web/feeds/__init__.py +++ b/src/calibre/web/feeds/__init__.py @@ -12,7 +12,7 @@ import time import traceback from builtins import _ -from calibre import entity_to_unicode, force_unicode, strftime +from calibre import force_unicode, replace_entities, strftime from calibre.utils.cleantext import clean_ascii_chars, clean_xml_chars from calibre.utils.date import dt_factory, local_tz, utcnow from calibre.utils.logging import default_log @@ -30,9 +30,8 @@ class Article: title = force_unicode(title, 'utf-8') self._title = clean_xml_chars(title).strip() try: - self._title = re.sub(r'&(\S+?);', - entity_to_unicode, self._title) - except: + self._title = replace_entities(self._title) + except Exception: pass self._title = clean_ascii_chars(self._title) self.url = url