From ad8d8cfcb5448a89d13a6956dd72dfdc9bbf31b9 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 2 Apr 2019 07:08:44 +0530 Subject: [PATCH] py3: Make strip_encoding_declarations() work with both bytes and unicode --- src/calibre/ebooks/chardet.py | 66 +++++++++++++++++++----- src/calibre/ebooks/mobi/reader/mobi6.py | 5 +- src/calibre/ebooks/oeb/polish/parsing.py | 15 +----- 3 files changed, 58 insertions(+), 28 deletions(-) diff --git a/src/calibre/ebooks/chardet.py b/src/calibre/ebooks/chardet.py index 6d08cb61fd..74908de22b 100644 --- a/src/calibre/ebooks/chardet.py +++ b/src/calibre/ebooks/chardet.py @@ -10,22 +10,52 @@ __docformat__ = 'restructuredtext en' import re, codecs from polyglot.builtins import unicode_type -ENCODING_PATS = [ +_encoding_pats = ( # XML declaration - re.compile(r'<\?[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>', re.IGNORECASE), + r'<\?[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>', # HTML 5 charset - re.compile(r''']*>(?:\s*){0,1}''', re.IGNORECASE), + r''']*>(?:\s*){0,1}''', # HTML 4 Pragma directive - re.compile(r''']*?content\s*=\s*['"][^'"]*?charset=([-_a-z0-9]+)[^'"]*?['"][^<>]*>(?:\s*){0,1}''', re.IGNORECASE), -] + r''']*?content\s*=\s*['"][^'"]*?charset=([-_a-z0-9]+)[^'"]*?['"][^<>]*>(?:\s*){0,1}''', +) + + +def compile_pats(binary): + for raw in _encoding_pats: + if binary: + raw = raw.encode('ascii') + yield re.compile(raw, flags=re.IGNORECASE) + + +class LazyEncodingPats(object): + + def __call__(self, binary=False): + attr = 'binary_pats' if binary else 'unicode_pats' + pats = getattr(self, attr, None) + if pats is None: + pats = tuple(compile_pats(binary)) + setattr(self, attr, pats) + for pat in pats: + yield pat + + +lazy_encoding_pats = LazyEncodingPats() ENTITY_PATTERN = re.compile(r'&(\S+?);') -def strip_encoding_declarations(raw, limit=50*1024): +def strip_encoding_declarations(raw, limit=50*1024, preserve_newlines=False): prefix = raw[:limit] suffix = raw[limit:] - for pat in ENCODING_PATS: - prefix = pat.sub('', prefix) + is_binary = isinstance(raw, bytes) + if preserve_newlines: + if is_binary: + sub = lambda m: b'\n' * m.group().count(b'\n') + else: + sub = lambda m: '\n' * m.group().count('\n') + else: + sub = b'' if is_binary else u'' + for pat in lazy_encoding_pats(is_binary): + prefix = pat.sub(sub, prefix) raw = prefix + suffix return raw @@ -34,6 +64,13 @@ def replace_encoding_declarations(raw, enc='utf-8', limit=50*1024): prefix = raw[:limit] suffix = raw[limit:] changed = [False] + is_binary = isinstance(raw, bytes) + if is_binary: + if not isinstance(enc, bytes): + enc = enc.encode('ascii') + else: + if isinstance(enc, bytes): + enc = enc.decode('ascii') def sub(m): ans = m.group() @@ -43,7 +80,7 @@ def replace_encoding_declarations(raw, enc='utf-8', limit=50*1024): ans = ans[:start] + enc + ans[end:] return ans - for pat in ENCODING_PATS: + for pat in lazy_encoding_pats(is_binary): prefix = pat.sub(sub, prefix) raw = prefix + suffix return raw, changed[0] @@ -51,10 +88,14 @@ def replace_encoding_declarations(raw, enc='utf-8', limit=50*1024): def find_declared_encoding(raw, limit=50*1024): prefix = raw[:limit] - for pat in ENCODING_PATS: + is_binary = isinstance(raw, bytes) + for pat in lazy_encoding_pats(is_binary): m = pat.search(prefix) if m is not None: - return m.group(1) + ans = m.group(1) + if is_binary: + ans = ans.decode('ascii', 'replace') + return ans def substitute_entites(raw): @@ -102,10 +143,11 @@ def detect_xml_encoding(raw, verbose=False, assume_utf8=False): if raw.startswith(bom): return raw[len(bom):], x encoding = None - for pat in ENCODING_PATS: + for pat in lazy_encoding_pats(True): match = pat.search(raw) if match: encoding = match.group(1) + encoding = encoding.decode('ascii', 'replace') break if encoding is None: encoding = force_encoding(raw, verbose, assume_utf8=assume_utf8) diff --git a/src/calibre/ebooks/mobi/reader/mobi6.py b/src/calibre/ebooks/mobi/reader/mobi6.py index fea1cc78ed..d6bd61e0df 100644 --- a/src/calibre/ebooks/mobi/reader/mobi6.py +++ b/src/calibre/ebooks/mobi/reader/mobi6.py @@ -13,7 +13,7 @@ from lxml import html, etree from calibre import (xml_entity_to_unicode, entity_to_unicode) from calibre.utils.cleantext import clean_ascii_chars, clean_xml_chars from calibre.ebooks import DRMError, unit_convert -from calibre.ebooks.chardet import ENCODING_PATS +from calibre.ebooks.chardet import strip_encoding_declarations from calibre.ebooks.mobi import MobiError from calibre.ebooks.mobi.huffcdic import HuffReader from calibre.ebooks.compression.palmdoc import decompress_doc @@ -175,8 +175,7 @@ class MobiReader(object): self.processed_html = re.sub(r']*>', '', self.processed_html) - for pat in ENCODING_PATS: - self.processed_html = pat.sub('', self.processed_html) + self.processed_html = strip_encoding_declarations(self.processed_html) self.processed_html = re.sub(r'&(\S+?);', xml_entity_to_unicode, self.processed_html) self.extract_images(processed_records, output_dir) diff --git a/src/calibre/ebooks/oeb/polish/parsing.py b/src/calibre/ebooks/oeb/polish/parsing.py index e89275393c..d92ebeb382 100644 --- a/src/calibre/ebooks/oeb/polish/parsing.py +++ b/src/calibre/ebooks/oeb/polish/parsing.py @@ -12,7 +12,7 @@ from lxml.etree import XMLParser, fromstring, Element as LxmlElement import html5_parser from calibre import xml_replace_entities -from calibre.ebooks.chardet import xml_to_unicode, ENCODING_PATS +from calibre.ebooks.chardet import xml_to_unicode, strip_encoding_declarations from calibre.utils.cleantext import clean_xml_chars XHTML_NS = 'http://www.w3.org/1999/xhtml' @@ -33,17 +33,6 @@ def parse_html5(raw, decoder=None, log=None, discard_namespaces=False, line_numb return root -def strip_encoding_declarations(raw): - # A custom encoding stripper that preserves line numbers - limit = 10*1024 - for pat in ENCODING_PATS: - prefix = raw[:limit] - suffix = raw[limit:] - prefix = pat.sub(lambda m: '\n' * m.group().count('\n'), prefix) - raw = prefix + suffix - return raw - - def handle_private_entities(data): # Process private entities pre = '' @@ -84,7 +73,7 @@ def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=N raw = ('\n' * newlines) + raw[match.start():] break - raw = strip_encoding_declarations(raw) + raw = strip_encoding_declarations(raw, limit=10*1024, preserve_newlines=True) if force_html5_parse: return parse_html5(raw, log=log, line_numbers=line_numbers, linenumber_attribute=linenumber_attribute, replace_entities=False, fix_newlines=False) try: