From ad8d8cfcb5448a89d13a6956dd72dfdc9bbf31b9 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 2 Apr 2019 07:08:44 +0530
Subject: [PATCH] py3: Make strip_encoding_declarations() work with both bytes
 and unicode

---
 src/calibre/ebooks/chardet.py            | 66 +++++++++++++++++++-----
 src/calibre/ebooks/mobi/reader/mobi6.py  |  5 +-
 src/calibre/ebooks/oeb/polish/parsing.py | 15 +-----
 3 files changed, 58 insertions(+), 28 deletions(-)
diff --git a/src/calibre/ebooks/chardet.py b/src/calibre/ebooks/chardet.py
index 6d08cb61fd..74908de22b 100644
--- a/src/calibre/ebooks/chardet.py
+++ b/src/calibre/ebooks/chardet.py
@@ -10,22 +10,52 @@ __docformat__ = 'restructuredtext en'
 import re, codecs
 from polyglot.builtins import unicode_type
 
-ENCODING_PATS = [
+_encoding_pats = (  # raw regex sources; compile_pats() compiles them as str or bytes patterns
     # XML declaration
-    re.compile(r'<\?[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>', re.IGNORECASE),
+    r'<\?[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>',
     # HTML 5 charset
-    re.compile(r'''<meta\s+charset=['"]([-_a-z0-9]+)['"][^<>]*>(?:\s*</meta>){0,1}''', re.IGNORECASE),
+    r'''<meta\s+charset=['"]([-_a-z0-9]+)['"][^<>]*>(?:\s*</meta>){0,1}''',
     # HTML 4 Pragma directive
-    re.compile(r'''<meta\s+?[^<>]*?content\s*=\s*['"][^'"]*?charset=([-_a-z0-9]+)[^'"]*?['"][^<>]*>(?:\s*</meta>){0,1}''', re.IGNORECASE),
-]
+    r'''<meta\s+?[^<>]*?content\s*=\s*['"][^'"]*?charset=([-_a-z0-9]+)[^'"]*?['"][^<>]*>(?:\s*</meta>){0,1}''',
+)
+
+
+def compile_pats(binary):  # yield each entry of _encoding_pats compiled case-insensitively
+    for raw in _encoding_pats:
+        if binary:
+            raw = raw.encode('ascii')  # bytes pattern, for searching bytes input
+        yield re.compile(raw, flags=re.IGNORECASE)
+
+
+class LazyEncodingPats(object):  # callable that compiles the encoding patterns on first use
+
+    def __call__(self, binary=False):  # generator: bytes patterns if binary, else unicode ones
+        attr = 'binary_pats' if binary else 'unicode_pats'
+        pats = getattr(self, attr, None)  # None until this variant has been compiled
+        if pats is None:
+            pats = tuple(compile_pats(binary))
+            setattr(self, attr, pats)  # cache on the instance for later calls
+        for pat in pats:
+            yield pat
+
+
+lazy_encoding_pats = LazyEncodingPats()
 ENTITY_PATTERN = re.compile(r'&(\S+?);')
 
 
-def strip_encoding_declarations(raw, limit=50*1024):
+def strip_encoding_declarations(raw, limit=50*1024, preserve_newlines=False):  # only raw[:limit] is searched
     prefix = raw[:limit]
     suffix = raw[limit:]
-    for pat in ENCODING_PATS:
-        prefix = pat.sub('', prefix)
+    is_binary = isinstance(raw, bytes)  # patterns and replacement must match raw's type
+    if preserve_newlines:
+        if is_binary:
+            sub = lambda m: b'\n' * m.group().count(b'\n')  # keep the match's newlines so line numbers are unchanged
+        else:
+            sub = lambda m: '\n' * m.group().count('\n')
+    else:
+        sub = b'' if is_binary else u''
+    for pat in lazy_encoding_pats(is_binary):
+        prefix = pat.sub(sub, prefix)
     raw = prefix + suffix
     return raw
 
@@ -34,6 +64,13 @@ def replace_encoding_declarations(raw, enc='utf-8', limit=50*1024):
     prefix = raw[:limit]
     suffix = raw[limit:]
     changed = [False]
+    is_binary = isinstance(raw, bytes)
+    if is_binary:
+        if not isinstance(enc, bytes):
+            enc = enc.encode('ascii')
+    else:
+        if isinstance(enc, bytes):
+            enc = enc.decode('ascii')
 
     def sub(m):
         ans = m.group()
@@ -43,7 +80,7 @@ def replace_encoding_declarations(raw, enc='utf-8', limit=50*1024):
             ans = ans[:start] + enc + ans[end:]
         return ans
 
-    for pat in ENCODING_PATS:
+    for pat in lazy_encoding_pats(is_binary):
         prefix = pat.sub(sub, prefix)
     raw = prefix + suffix
     return raw, changed[0]
@@ -51,10 +88,14 @@ def replace_encoding_declarations(raw, enc='utf-8', limit=50*1024):
 
 def find_declared_encoding(raw, limit=50*1024):
     prefix = raw[:limit]
-    for pat in ENCODING_PATS:
+    is_binary = isinstance(raw, bytes)  # pick the pattern set matching raw's type
+    for pat in lazy_encoding_pats(is_binary):
         m = pat.search(prefix)
         if m is not None:
-            return m.group(1)
+            ans = m.group(1)
+            if is_binary:
+                ans = ans.decode('ascii', 'replace')  # bytes match -> text
+            return ans  # outside the is_binary check so unicode input also returns its match
 
 
 def substitute_entites(raw):
@@ -102,10 +143,11 @@ def detect_xml_encoding(raw, verbose=False, assume_utf8=False):
         if raw.startswith(bom):
             return raw[len(bom):], x
     encoding = None
-    for pat in ENCODING_PATS:
+    for pat in lazy_encoding_pats(True):
         match = pat.search(raw)
         if match:
             encoding = match.group(1)
+            encoding = encoding.decode('ascii', 'replace')
             break
     if encoding is None:
         encoding = force_encoding(raw, verbose, assume_utf8=assume_utf8)
diff --git a/src/calibre/ebooks/mobi/reader/mobi6.py b/src/calibre/ebooks/mobi/reader/mobi6.py
index fea1cc78ed..d6bd61e0df 100644
--- a/src/calibre/ebooks/mobi/reader/mobi6.py
+++ b/src/calibre/ebooks/mobi/reader/mobi6.py
@@ -13,7 +13,7 @@ from lxml import html, etree
 from calibre import (xml_entity_to_unicode, entity_to_unicode)
 from calibre.utils.cleantext import clean_ascii_chars, clean_xml_chars
 from calibre.ebooks import DRMError, unit_convert
-from calibre.ebooks.chardet import ENCODING_PATS
+from calibre.ebooks.chardet import strip_encoding_declarations
 from calibre.ebooks.mobi import MobiError
 from calibre.ebooks.mobi.huffcdic import HuffReader
 from calibre.ebooks.compression.palmdoc import decompress_doc
@@ -175,8 +175,7 @@ class MobiReader(object):
         self.processed_html = re.sub(r'</{0,1}[a-zA-Z]+:\s+[^>]*>', '',
                 self.processed_html)
 
-        for pat in ENCODING_PATS:
-            self.processed_html = pat.sub('', self.processed_html)
+        self.processed_html = strip_encoding_declarations(self.processed_html)
         self.processed_html = re.sub(r'&(\S+?);', xml_entity_to_unicode,
             self.processed_html)
         self.extract_images(processed_records, output_dir)
diff --git a/src/calibre/ebooks/oeb/polish/parsing.py b/src/calibre/ebooks/oeb/polish/parsing.py
index e89275393c..d92ebeb382 100644
--- a/src/calibre/ebooks/oeb/polish/parsing.py
+++ b/src/calibre/ebooks/oeb/polish/parsing.py
@@ -12,7 +12,7 @@ from lxml.etree import XMLParser, fromstring, Element as LxmlElement
 import html5_parser
 
 from calibre import xml_replace_entities
-from calibre.ebooks.chardet import xml_to_unicode, ENCODING_PATS
+from calibre.ebooks.chardet import xml_to_unicode, strip_encoding_declarations
 from calibre.utils.cleantext import clean_xml_chars
 
 XHTML_NS     = 'http://www.w3.org/1999/xhtml'
@@ -33,17 +33,6 @@ def parse_html5(raw, decoder=None, log=None, discard_namespaces=False, line_numb
     return root
 
 
-def strip_encoding_declarations(raw):
-    # A custom encoding stripper that preserves line numbers
-    limit = 10*1024
-    for pat in ENCODING_PATS:
-        prefix = raw[:limit]
-        suffix = raw[limit:]
-        prefix = pat.sub(lambda m: '\n' * m.group().count('\n'), prefix)
-        raw = prefix + suffix
-    return raw
-
-
 def handle_private_entities(data):
     # Process private entities
     pre = ''
@@ -84,7 +73,7 @@ def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=N
         raw = ('\n' * newlines) + raw[match.start():]
         break
 
-    raw = strip_encoding_declarations(raw)
+    raw = strip_encoding_declarations(raw, limit=10*1024, preserve_newlines=True)
     if force_html5_parse:
         return parse_html5(raw, log=log, line_numbers=line_numbers, linenumber_attribute=linenumber_attribute, replace_entities=False, fix_newlines=False)
     try: