From 33086f206129bc03ab041c8003a59de3760c0ab0 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 10 Dec 2013 11:31:14 +0530
Subject: [PATCH] Special case entities when checking for XML well-formedness

---
 src/calibre/ebooks/oeb/polish/check/main.py   |  5 +-
 .../ebooks/oeb/polish/check/parsing.py        | 95 ++++++++++++++++++-
 src/calibre/ebooks/oeb/polish/container.py    | 13 +--
 src/calibre/ebooks/oeb/polish/utils.py        | 24 +++++
 4 files changed, 121 insertions(+), 16 deletions(-)
 create mode 100644 src/calibre/ebooks/oeb/polish/utils.py
diff --git a/src/calibre/ebooks/oeb/polish/check/main.py b/src/calibre/ebooks/oeb/polish/check/main.py
index f6db7d2e7c..ad2c33d55d 100644
--- a/src/calibre/ebooks/oeb/polish/check/main.py
+++ b/src/calibre/ebooks/oeb/polish/check/main.py
@@ -52,7 +52,10 @@ def fix_errors(container, errors):
 
     for err in errors:
         if err.INDIVIDUAL_FIX:
-            if err(container):
+            if err(container) is not False:
+                # Assume changed unless fixer explicitly says no change (this
+                # is because sometimes I forget to return True, and it is
+                # better to have a false positive than a false negative)
                 changed = True
     return changed
 
diff --git a/src/calibre/ebooks/oeb/polish/check/parsing.py b/src/calibre/ebooks/oeb/polish/check/parsing.py
index c34609f1ab..bd351b5e86 100644
--- a/src/calibre/ebooks/oeb/polish/check/parsing.py
+++ b/src/calibre/ebooks/oeb/polish/check/parsing.py
@@ -6,11 +6,21 @@ from __future__ import (unicode_literals, division, absolute_import,
 __license__ = 'GPL v3'
 __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
 
+import re
+
 from lxml.etree import XMLParser, fromstring, XMLSyntaxError
 
-from calibre.ebooks.oeb.polish.check.base import BaseError
+from calibre.ebooks.html_entities import html5_entities
+from calibre.ebooks.oeb.polish.utils import PositionFinder
+from calibre.ebooks.oeb.polish.check.base import BaseError, WARN
 from calibre.ebooks.oeb.base import OEB_DOCS
 
+HTML_ENTITTIES = frozenset(html5_entities)  # every name in the html5_entities table
+XML_ENTITIES = {'lt', 'gt', 'amp', 'apos', 'quot'}  # the five entities predefined by XML
+ALL_ENTITIES = HTML_ENTITTIES | XML_ENTITIES
+# Matches &name; for HTML named entities only; XML's predefined entities are excluded
+replace_pat = re.compile('&(%s);' % '|'.join(re.escape(x) for x in sorted((HTML_ENTITTIES - XML_ENTITIES))))
+
 class XMLParseError(BaseError):
 
     is_parsing_error = True
@@ -30,19 +40,94 @@ class HTMLParseError(XMLParseError):
              ' incorrect display of content. These errors can usually be fixed automatically,'
              ' however, automatic fixing can sometimes "do the wrong thing".')
 
+class NamedEntities(BaseError):
+    ''' Warning that a file uses named entities; calling it rewrites the file without them '''
+    level = WARN
+    INDIVIDUAL_FIX = _('Replace all named entities with their character equivalents in this file')
+    HELP = _('Named entities are often only incompletely supported by various book reading software.'
+             ' Therefore, it is best to not use them, replacing them with the actual characters they'
+             ' represent. This can be done automatically.')
+
+    def __init__(self, name):
+        BaseError.__init__(self, _('Named entities present'), name)
+
+    def __call__(self, container):
+        raw = container.raw_data(self.name)
+        nraw = replace_pat.sub(lambda m:html5_entities[m.group(1)], raw)  # swap each &name; for its table value
+        with container.open(self.name, 'wb') as f:
+            f.write(nraw.encode('utf-8'))  # the file is always written back as UTF-8
+        return True  # unconditionally reports the file as changed
+
+class BadEntity(BaseError):
+    ''' Report for a single unrecognized entity reference, located by line and column '''
+    HELP = _('This is an invalid (unrecognized) entity. Replace it with whatever'
+             ' text it is supposed to have represented.')
+
+    def __init__(self, ent, name, lnum, col):
+        BaseError.__init__(self, _('Invalid entity: %s') % ent, name, lnum, col)
+
+
+class EntitityProcessor(object):
+    ''' Substitution callback for entity_pat: records named/invalid entities and blanks them out '''
+    def __init__(self, mt):
+        self.entities = ALL_ENTITIES if mt in OEB_DOCS else XML_ENTITIES  # HTML names only count in OEB_DOCS types
+        self.ok_named_entities = []  # match offsets of recognized named entities
+        self.bad_entities = []  # (match offset, full matched text) pairs
+
+    def __call__(self, m):
+        val = m.group(1).decode('ascii')  # entity_pat only admits ASCII characters here
+        if val in XML_ENTITIES:
+            # Leave XML entities alone
+            return m.group()
+
+        if val.startswith('#'):
+            nval = val[1:]
+            try:
+                if nval.startswith('x'):
+                    int(nval[1:], 16)
+                else:
+                    int(nval, 10)
+            except ValueError:
+                # Invalid numerical entity (the digits parse as neither hex nor decimal)
+                self.bad_entities.append((m.start(), m.group()))
+                return b' ' * len(m.group())  # same-length padding keeps later offsets unchanged
+            return m.group()  # well-formed numeric reference, kept as is
+
+        if val in self.entities:
+            # Known named entity, report it
+            self.ok_named_entities.append(m.start())
+        else:
+            self.bad_entities.append((m.start(), m.group()))
+        return b' ' * len(m.group())  # known or not, the named entity is blanked at equal length
+
+entity_pat = re.compile(br'&(#{0,1}[a-zA-Z0-9]{1,8});')  # NOTE(review): names longer than 8 characters never match
+
 def check_xml_parsing(name, mt, raw):
+    raw = raw.replace(b'\r\n', b'\n').replace(b'\r', b'\n')  # PositionFinder only counts \n
+    # Blank out entities first, as named entities trip up the XML parser (padding is same-length)
+    eproc = EntitityProcessor(mt)
+    eraw = entity_pat.sub(eproc, raw)  # eproc collects its findings as a side effect
     parser = XMLParser(recover=False)
     errcls = HTMLParseError if mt in OEB_DOCS else XMLParseError
+    errors = []
+    if eproc.ok_named_entities:
+        errors.append(NamedEntities(name))  # a single warning per file, however many were found
+    if eproc.bad_entities:
+        position = PositionFinder(raw)  # translates match offsets into line/column pairs
+        for offset, ent in eproc.bad_entities:
+            lnum, col = position(offset)
+            errors.append(BadEntity(ent, name, lnum, col))
 
     try:
-        fromstring(raw, parser=parser)
+        fromstring(eraw, parser=parser)
     except XMLSyntaxError as err:
         try:
             line, col = err.position
         except:
             line = col = None
-        return [errcls(err.message, name, line, col)]
+        return errors + [errcls(err.message, name, line, col)]
     except Exception as err:
-        return [errcls(err.message, name)]
-    return []
+        return errors + [errcls(err.message, name)]
+
+    return errors
 
diff --git a/src/calibre/ebooks/oeb/polish/container.py b/src/calibre/ebooks/oeb/polish/container.py
index c6adc01c0f..32cbca2a91 100644
--- a/src/calibre/ebooks/oeb/polish/container.py
+++ b/src/calibre/ebooks/oeb/polish/container.py
@@ -9,7 +9,6 @@ __docformat__ = 'restructuredtext en'
 
 import os, logging, sys, hashlib, uuid, re, shutil
 from collections import defaultdict
-from bisect import bisect
 from io import BytesIO
 from urlparse import urlparse
 from future_builtins import zip
@@ -32,6 +31,7 @@ from calibre.ebooks.oeb.base import (
     rewrite_links, iterlinks, itercsslinks, urlquote, urlunquote)
 from calibre.ebooks.oeb.polish.errors import InvalidBook, DRMError
 from calibre.ebooks.oeb.polish.parsing import parse as parse_html_tweak
+from calibre.ebooks.oeb.polish.utils import PositionFinder
 from calibre.ebooks.oeb.parse_utils import NotHTML, parse_html, RECOVER_PARSER
 from calibre.ptempfile import PersistentTemporaryDirectory, PersistentTemporaryFile
 from calibre.utils.filenames import nlinks_file, hardlink_file
@@ -293,15 +293,8 @@ class Container(object):  # {{{
         elif media_type.lower() in OEB_STYLES:
             if get_line_numbers:
                 with self.open(name) as f:
-                    raw = self.decode(f.read())
-                    new_lines = tuple(m.start() + 1 for m in re.finditer(r'\n', raw))
-                    def position(pos):
-                        lnum = bisect(new_lines, pos)
-                        try:
-                            offset = abs(pos - new_lines[lnum - 1])
-                        except IndexError:
-                            offset = pos
-                        return (lnum + 1, offset)
+                    raw = self.decode(f.read()).replace('\r\n', '\n').replace('\r', '\n')
+                    position = PositionFinder(raw)
                     for link, offset in itercsslinks(raw):
                         lnum, col = position(offset)
                         yield link, lnum, col
diff --git a/src/calibre/ebooks/oeb/polish/utils.py b/src/calibre/ebooks/oeb/polish/utils.py
new file mode 100644
index 0000000000..3712e522d8
--- /dev/null
+++ b/src/calibre/ebooks/oeb/polish/utils.py
@@ -0,0 +1,24 @@
+#!/usr/bin/env python
+# vim:fileencoding=utf-8
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__ = 'GPL v3'
+__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
+
+import re
+from bisect import bisect
+
+class PositionFinder(object):
+    ''' Translate offsets into raw to (1-based line number, 0-based column) pairs '''
+
+    def __init__(self, raw):
+        pat = br'\n' if isinstance(raw, bytes) else r'\n'
+        self.new_lines = tuple(m.start() + 1 for m in re.finditer(pat, raw))
+
+    def __call__(self, pos):
+        lnum = bisect(self.new_lines, pos)
+        # On the first line (lnum == 0) the column is pos itself. Indexing
+        # new_lines[-1] there would silently measure from the last line start.
+        offset = (pos - self.new_lines[lnum - 1]) if lnum else pos
+        return (lnum + 1, offset)