From a0e845be912281dc9cf6cb6d349a3f20243a6d8b Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 19 May 2018 18:02:46 +0530
Subject: [PATCH] Fix handling of private entities

Conversion: Fix private entities that use the same name as an HTML entity not being handled correctly

Viewer: Fix HTML files with private entities displaying an artifact at
the top

Editor: Check Book: Show an error for HTML files with private entities

Editor: Fix HTML: Automatically resolve private entities

Fixes #1772157 [Private Named Entities problems](https://bugs.launchpad.net/calibre/+bug/1772157)
---
 src/calibre/ebooks/oeb/display/webview.py     |  5 +++-
 src/calibre/ebooks/oeb/parse_utils.py         | 14 +++++------
 .../ebooks/oeb/polish/check/parsing.py        | 16 ++++++++++++-
 src/calibre/ebooks/oeb/polish/container.py    |  4 ++--
 src/calibre/ebooks/oeb/polish/parsing.py      | 23 +++++++++++++++++++
 5 files changed, 51 insertions(+), 11 deletions(-)
diff --git a/src/calibre/ebooks/oeb/display/webview.py b/src/calibre/ebooks/oeb/display/webview.py
index 9dfc43c9c4..c4403e7b41 100644
--- a/src/calibre/ebooks/oeb/display/webview.py
+++ b/src/calibre/ebooks/oeb/display/webview.py
@@ -40,8 +40,11 @@ def cleanup_html(html):
     return html
 
 
+xml_detect_pat = re.compile(r'<!(?:\[CDATA\[|ENTITY)')  # matches the start of a CDATA section or of an ENTITY declaration
+
+
 def load_as_html(html):
-    return re.search(r'<[a-zA-Z0-9-]+:svg', html) is None and '<![CDATA[' not in html
+    return re.search(r'<[a-zA-Z0-9-]+:svg', html) is None and xml_detect_pat.search(html) is None  # False if a prefixed svg tag, CDATA or ENTITY is present
 
 
 def load_html(path, view, codec='utf-8', mime_type=None,
diff --git a/src/calibre/ebooks/oeb/parse_utils.py b/src/calibre/ebooks/oeb/parse_utils.py
index 3fc37580bd..8d97b88a7b 100644
--- a/src/calibre/ebooks/oeb/parse_utils.py
+++ b/src/calibre/ebooks/oeb/parse_utils.py
@@ -180,12 +180,6 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
             data = xml_to_unicode(data)[0]
 
     data = strip_encoding_declarations(data)
-    if preprocessor is not None:
-        data = preprocessor(data)
-
-    # There could be null bytes in data if it had &#0; entities in it
-    data = data.replace('\0', '')
-
     # Remove DOCTYPE declaration as it messes up parsing
     # In particular, it causes tostring to insert xmlns
     # declarations, which messes up the coercing logic
@@ -198,10 +192,11 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
         pre = data[:idx]
         data = data[idx:]
         if '<!DOCTYPE' in pre:  # Handle user defined entities
-            has_html4_doctype = re.search(r'<!DOCTYPE\s+[^>]+HTML\s+4.0[^.]+>', pre) is not None
             # kindlegen produces invalid xhtml with uppercase attribute names
             # if fed HTML 4 with uppercase attribute names, so try to detect
             # and compensate for that.
+            has_html4_doctype = re.search(r'<!DOCTYPE\s+[^>]+HTML\s+4.0[^.]+>', pre) is not None
+            # Process private entities
             user_entities = {}
             for match in re.finditer(r'<!ENTITY\s+(\S+)\s+([^>]+)', pre):
                 val = match.group(2)
@@ -212,6 +207,11 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
                 pat = re.compile(r'&(%s);'%('|'.join(user_entities.keys())))
                 data = pat.sub(lambda m:user_entities[m.group(1)], data)
 
+    if preprocessor is not None:
+        data = preprocessor(data)
+
+    # There could be null bytes in data if it had &#0; entities in it
+    data = data.replace('\0', '')
     data = raw = clean_word_doc(data, log)
 
     # Setting huge_tree=True causes crashes in windows with large files
diff --git a/src/calibre/ebooks/oeb/polish/check/parsing.py b/src/calibre/ebooks/oeb/polish/check/parsing.py
index f80776faf6..3cc48e94f7 100644
--- a/src/calibre/ebooks/oeb/polish/check/parsing.py
+++ b/src/calibre/ebooks/oeb/polish/check/parsing.py
@@ -24,7 +24,7 @@ XML_ENTITIES = {'lt', 'gt', 'amp', 'apos', 'quot'}
 ALL_ENTITIES = HTML_ENTITTIES | XML_ENTITIES
 
 replace_pat = re.compile('&(%s);' % '|'.join(re.escape(x) for x in sorted((HTML_ENTITTIES - XML_ENTITIES))))
-mismatch_pat = re.compile('tag mismatch:.+?line (\d+).+?line \d+')
+mismatch_pat = re.compile(r'tag mismatch:.+?line (\d+).+?line \d+')
 
 
 class EmptyFile(BaseError):
@@ -80,6 +80,13 @@ class HTMLParseError(XMLParseError):
              ' however, automatic fixing can sometimes "do the wrong thing".')
 
 
+class PrivateEntities(XMLParseError):  # returned by check_xml_parsing() when check_for_private_entities() reports a match
+
+    HELP = _('This HTML file uses private entities.'
+    ' These are not supported. You can try running "Fix HTML" from the Tools menu,'
+    ' which will try to automatically resolve the private entities.')
+
+
 class NamedEntities(BaseError):
 
     level = WARN
@@ -255,9 +262,16 @@ def check_encoding_declarations(name, container):
     return errors
 
 
+def check_for_private_entities(name, raw):
+    ''' Return True if the bytes in raw contain a DOCTYPE followed by an ENTITY declaration and a closing ]>, False otherwise (name is unused) '''
+    return re.search(br'<!DOCTYPE\s+.+?<!ENTITY\s+.+?]>', raw, flags=re.DOTALL) is not None
+
+
 def check_xml_parsing(name, mt, raw):
     if not raw:
         return [EmptyFile(name)]
+    if check_for_private_entities(name, raw):
+        return [PrivateEntities(_('Private entities found'), name)]
     raw = raw.replace(b'\r\n', b'\n').replace(b'\r', b'\n')
     # Get rid of entities as named entities trip up the XML parser
     eproc = EntitityProcessor(mt)
diff --git a/src/calibre/ebooks/oeb/polish/container.py b/src/calibre/ebooks/oeb/polish/container.py
index 194aff4750..4c0bda3fd6 100644
--- a/src/calibre/ebooks/oeb/polish/container.py
+++ b/src/calibre/ebooks/oeb/polish/container.py
@@ -953,8 +953,8 @@ class Container(ContainerBase):  # {{{
             for child in mdata:
                 child.tail = '\n    '
                 try:
-                    if (child.get('name', '').startswith('calibre:') and
-                        child.get('content', '').strip() in {'{}', ''}):
+                    if (child.get('name', '').startswith('calibre:'
+                        ) and child.get('content', '').strip() in {'{}', ''}):
                         remove.add(child)
                 except AttributeError:
                     continue  # Happens for XML comments
diff --git a/src/calibre/ebooks/oeb/polish/parsing.py b/src/calibre/ebooks/oeb/polish/parsing.py
index efab19a0e5..20f4065839 100644
--- a/src/calibre/ebooks/oeb/polish/parsing.py
+++ b/src/calibre/ebooks/oeb/polish/parsing.py
@@ -44,9 +44,32 @@ def strip_encoding_declarations(raw):
     return raw
 
 
+def handle_private_entities(data):
+    # Replace references to entities declared in the DOCTYPE that precedes the <html tag; data is returned unchanged if none are declared
+    idx = data.find('<html')
+    if idx == -1:
+        idx = data.find('<HTML')
+    if idx > -1:
+        pre = data[:idx]
+        if '<!DOCTYPE' in pre:  # Handle user defined entities
+            user_entities = {}
+            for match in re.finditer(r'<!ENTITY\s+(\S+)\s+([^>]+)', pre):
+                val = match.group(2)
+                if val.startswith('"') and val.endswith('"'):
+                    val = val[1:-1]
+                user_entities[match.group(1)] = val
+            if user_entities:
+                # Drop the prolog only when it declared entities, padding with its newlines so line numbers are preserved
+                data = ('\n' * pre.count('\n')) + data[idx:]
+                pat = re.compile(r'&(%s);' % '|'.join(map(re.escape, user_entities)))  # names may contain regex metacharacters such as .
+                data = pat.sub(lambda m: user_entities[m.group(1)], data)
+    return data
+
+
 def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=None, replace_entities=True, force_html5_parse=False):
     if isinstance(raw, bytes):
         raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
+    raw = handle_private_entities(raw)
     if replace_entities:
         raw = xml_replace_entities(raw).replace('\0', '')  # Handle &#0;
     raw = raw.replace('\r\n', '\n').replace('\r', '\n')