py3: Make strip_encoding_declarations() work with both bytes and unicode

This commit is contained in:
Kovid Goyal 2019-04-02 07:08:44 +05:30
parent 36b63758a2
commit ad8d8cfcb5
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
3 changed files with 58 additions and 28 deletions

View File

@ -10,22 +10,52 @@ __docformat__ = 'restructuredtext en'
import re, codecs import re, codecs
from polyglot.builtins import unicode_type from polyglot.builtins import unicode_type
ENCODING_PATS = [ _encoding_pats = (
# XML declaration # XML declaration
re.compile(r'<\?[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>', re.IGNORECASE), r'<\?[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>',
# HTML 5 charset # HTML 5 charset
re.compile(r'''<meta\s+charset=['"]([-_a-z0-9]+)['"][^<>]*>(?:\s*</meta>){0,1}''', re.IGNORECASE), r'''<meta\s+charset=['"]([-_a-z0-9]+)['"][^<>]*>(?:\s*</meta>){0,1}''',
# HTML 4 Pragma directive # HTML 4 Pragma directive
re.compile(r'''<meta\s+?[^<>]*?content\s*=\s*['"][^'"]*?charset=([-_a-z0-9]+)[^'"]*?['"][^<>]*>(?:\s*</meta>){0,1}''', re.IGNORECASE), r'''<meta\s+?[^<>]*?content\s*=\s*['"][^'"]*?charset=([-_a-z0-9]+)[^'"]*?['"][^<>]*>(?:\s*</meta>){0,1}''',
] )
def compile_pats(binary):
    """Yield the encoding-declaration regexes, compiled case-insensitively.

    Each pattern source in ``_encoding_pats`` is compiled with
    ``re.IGNORECASE``. When ``binary`` is true the source is first encoded
    to ASCII bytes so that the resulting pattern can be applied to bytes
    input; otherwise the pattern source is compiled unchanged.
    """
    for source in _encoding_pats:
        pattern = source.encode('ascii') if binary else source
        yield re.compile(pattern, flags=re.IGNORECASE)
class LazyEncodingPats(object):
    """Callable that compiles the encoding patterns on first use.

    The bytes and text variants are compiled independently, the first time
    each one is requested, and then cached as attributes on the instance.
    """

    def __call__(self, binary=False):
        """Yield the compiled patterns for bytes (``binary`` true) or text."""
        cache_name = 'binary_pats' if binary else 'unicode_pats'
        compiled = getattr(self, cache_name, None)
        if compiled is None:
            # First request for this variant: compile once and remember it.
            compiled = tuple(compile_pats(binary))
            setattr(self, cache_name, compiled)
        for pattern in compiled:
            yield pattern
lazy_encoding_pats = LazyEncodingPats()
ENTITY_PATTERN = re.compile(r'&(\S+?);') ENTITY_PATTERN = re.compile(r'&(\S+?);')
def strip_encoding_declarations(raw, limit=50*1024): def strip_encoding_declarations(raw, limit=50*1024, preserve_newlines=False):
prefix = raw[:limit] prefix = raw[:limit]
suffix = raw[limit:] suffix = raw[limit:]
for pat in ENCODING_PATS: is_binary = isinstance(raw, bytes)
prefix = pat.sub('', prefix) if preserve_newlines:
if is_binary:
sub = lambda m: b'\n' * m.group().count(b'\n')
else:
sub = lambda m: '\n' * m.group().count('\n')
else:
sub = b'' if is_binary else u''
for pat in lazy_encoding_pats(is_binary):
prefix = pat.sub(sub, prefix)
raw = prefix + suffix raw = prefix + suffix
return raw return raw
@ -34,6 +64,13 @@ def replace_encoding_declarations(raw, enc='utf-8', limit=50*1024):
prefix = raw[:limit] prefix = raw[:limit]
suffix = raw[limit:] suffix = raw[limit:]
changed = [False] changed = [False]
is_binary = isinstance(raw, bytes)
if is_binary:
if not isinstance(enc, bytes):
enc = enc.encode('ascii')
else:
if isinstance(enc, bytes):
enc = enc.decode('ascii')
def sub(m): def sub(m):
ans = m.group() ans = m.group()
@ -43,7 +80,7 @@ def replace_encoding_declarations(raw, enc='utf-8', limit=50*1024):
ans = ans[:start] + enc + ans[end:] ans = ans[:start] + enc + ans[end:]
return ans return ans
for pat in ENCODING_PATS: for pat in lazy_encoding_pats(is_binary):
prefix = pat.sub(sub, prefix) prefix = pat.sub(sub, prefix)
raw = prefix + suffix raw = prefix + suffix
return raw, changed[0] return raw, changed[0]
@ -51,10 +88,14 @@ def replace_encoding_declarations(raw, enc='utf-8', limit=50*1024):
def find_declared_encoding(raw, limit=50*1024): def find_declared_encoding(raw, limit=50*1024):
prefix = raw[:limit] prefix = raw[:limit]
for pat in ENCODING_PATS: is_binary = isinstance(raw, bytes)
for pat in lazy_encoding_pats(is_binary):
m = pat.search(prefix) m = pat.search(prefix)
if m is not None: if m is not None:
return m.group(1) ans = m.group(1)
if is_binary:
ans = ans.decode('ascii', 'replace')
return ans
def substitute_entites(raw): def substitute_entites(raw):
@ -102,10 +143,11 @@ def detect_xml_encoding(raw, verbose=False, assume_utf8=False):
if raw.startswith(bom): if raw.startswith(bom):
return raw[len(bom):], x return raw[len(bom):], x
encoding = None encoding = None
for pat in ENCODING_PATS: for pat in lazy_encoding_pats(True):
match = pat.search(raw) match = pat.search(raw)
if match: if match:
encoding = match.group(1) encoding = match.group(1)
encoding = encoding.decode('ascii', 'replace')
break break
if encoding is None: if encoding is None:
encoding = force_encoding(raw, verbose, assume_utf8=assume_utf8) encoding = force_encoding(raw, verbose, assume_utf8=assume_utf8)

View File

@ -13,7 +13,7 @@ from lxml import html, etree
from calibre import (xml_entity_to_unicode, entity_to_unicode) from calibre import (xml_entity_to_unicode, entity_to_unicode)
from calibre.utils.cleantext import clean_ascii_chars, clean_xml_chars from calibre.utils.cleantext import clean_ascii_chars, clean_xml_chars
from calibre.ebooks import DRMError, unit_convert from calibre.ebooks import DRMError, unit_convert
from calibre.ebooks.chardet import ENCODING_PATS from calibre.ebooks.chardet import strip_encoding_declarations
from calibre.ebooks.mobi import MobiError from calibre.ebooks.mobi import MobiError
from calibre.ebooks.mobi.huffcdic import HuffReader from calibre.ebooks.mobi.huffcdic import HuffReader
from calibre.ebooks.compression.palmdoc import decompress_doc from calibre.ebooks.compression.palmdoc import decompress_doc
@ -175,8 +175,7 @@ class MobiReader(object):
self.processed_html = re.sub(r'</{0,1}[a-zA-Z]+:\s+[^>]*>', '', self.processed_html = re.sub(r'</{0,1}[a-zA-Z]+:\s+[^>]*>', '',
self.processed_html) self.processed_html)
for pat in ENCODING_PATS: self.processed_html = strip_encoding_declarations(self.processed_html)
self.processed_html = pat.sub('', self.processed_html)
self.processed_html = re.sub(r'&(\S+?);', xml_entity_to_unicode, self.processed_html = re.sub(r'&(\S+?);', xml_entity_to_unicode,
self.processed_html) self.processed_html)
self.extract_images(processed_records, output_dir) self.extract_images(processed_records, output_dir)

View File

@ -12,7 +12,7 @@ from lxml.etree import XMLParser, fromstring, Element as LxmlElement
import html5_parser import html5_parser
from calibre import xml_replace_entities from calibre import xml_replace_entities
from calibre.ebooks.chardet import xml_to_unicode, ENCODING_PATS from calibre.ebooks.chardet import xml_to_unicode, strip_encoding_declarations
from calibre.utils.cleantext import clean_xml_chars from calibre.utils.cleantext import clean_xml_chars
XHTML_NS = 'http://www.w3.org/1999/xhtml' XHTML_NS = 'http://www.w3.org/1999/xhtml'
@ -33,17 +33,6 @@ def parse_html5(raw, decoder=None, log=None, discard_namespaces=False, line_numb
return root return root
def strip_encoding_declarations(raw):
    # A custom encoding stripper that preserves line numbers
    limit = 10*1024

    def keep_newlines(match):
        # Replace the matched declaration with just the newlines it contained
        return '\n' * match.group().count('\n')

    for pat in ENCODING_PATS:
        # Re-slice on every pass: a substitution can change the length of raw
        head, tail = raw[:limit], raw[limit:]
        raw = pat.sub(keep_newlines, head) + tail
    return raw
def handle_private_entities(data): def handle_private_entities(data):
# Process private entities # Process private entities
pre = '' pre = ''
@ -84,7 +73,7 @@ def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=N
raw = ('\n' * newlines) + raw[match.start():] raw = ('\n' * newlines) + raw[match.start():]
break break
raw = strip_encoding_declarations(raw) raw = strip_encoding_declarations(raw, limit=10*1024, preserve_newlines=True)
if force_html5_parse: if force_html5_parse:
return parse_html5(raw, log=log, line_numbers=line_numbers, linenumber_attribute=linenumber_attribute, replace_entities=False, fix_newlines=False) return parse_html5(raw, log=log, line_numbers=line_numbers, linenumber_attribute=linenumber_attribute, replace_entities=False, fix_newlines=False)
try: try: