From 44799e05efc6a4696f98a8fcf4f7350876427bb2 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sun, 29 Mar 2009 21:09:04 -0700
Subject: [PATCH] Conversion pipeline: Don't choke on HTML/CSS files that fail
 to parse correctly. Instead remove them from the manifest. Preprocessing code
 migrated from epub layer to OEBBook.

---
 session.vim                                 |   2 +-
 src/calibre/customize/conversion.py         |  37 +-----
 src/calibre/customize/profiles.py           |  28 ++---
 src/calibre/ebooks/conversion/plumber.py    |  10 +-
 src/calibre/ebooks/conversion/preprocess.py | 123 ++++++++++++++++++++
 src/calibre/ebooks/mobi/input.py            |  21 ++--
 src/calibre/ebooks/oeb/base.py              |  42 ++++---
 src/calibre/ebooks/oeb/reader.py            |  86 +++++++++-----
 8 files changed, 242 insertions(+), 107 deletions(-)
 create mode 100644 src/calibre/ebooks/conversion/preprocess.py

diff --git a/session.vim b/session.vim
index 9d326c5822..454b468ae0 100644
--- a/session.vim
+++ b/session.vim
@@ -1,5 +1,5 @@
 " Project wide builtins
-let g:pyflakes_builtins += ["dynamic_property"]
+let g:pyflakes_builtins += ["dynamic_property", '__']
 
 python << EOFPY
 import os
diff --git a/src/calibre/customize/conversion.py b/src/calibre/customize/conversion.py
index 5cf497d904..3ebabc4d52 100644
--- a/src/calibre/customize/conversion.py
+++ b/src/calibre/customize/conversion.py
@@ -4,8 +4,6 @@ Defines the plugin system for conversions.
 '''
 import re, os, shutil
 
-from lxml import html
-
 from calibre import CurrentDir
 from calibre.customize import Plugin
 
@@ -121,7 +119,7 @@ class InputFormatPlugin(Plugin):
     #: (option_name, recommended_value, recommendation_level)
     recommendations = set([])
 
-    def convert(self, stream, options, file_ext, parse_cache, log, accelerators):
+    def convert(self, stream, options, file_ext, log, accelerators):
         '''
         This method must be implemented in sub-classes. It must return
         the path to the created OPF file. All output should be contained in
@@ -144,17 +142,6 @@ class InputFormatPlugin(Plugin):
                          is guaranteed to be one of the `file_types` supported
                          by this plugin.
 
-        :param parse_cache:    A dictionary that maps absolute file paths to
-                               parsed representations of their contents. For
-                               HTML the representation is an lxml element of
-                               the root of the tree. For CSS it is a cssutils
-                               stylesheet. If this plugin parses any of the
-                               output files, it should add them to the cache
-                               so that later stages of the conversion wont
-                               have to re-parse them. If a parsed representation
-                               is in the cache, there is no need to actually
-                               write the file to disk.
-
         :param log: A :class:`calibre.utils.logging.Log` object. All output
                     should use this object.
 
@@ -165,7 +152,7 @@ class InputFormatPlugin(Plugin):
         '''
         raise NotImplementedError
 
-    def __call__(self, stream, options, file_ext, parse_cache, log,
+    def __call__(self, stream, options, file_ext, log,
                  accelerators, output_dir):
         log('InputFormatPlugin: %s running'%self.name, end=' ')
         if hasattr(stream, 'name'):
@@ -176,33 +163,15 @@ class InputFormatPlugin(Plugin):
                 shutil.rmtree(x) if os.path.isdir(x) else os.remove(x)
 
 
-            ret = self.convert(stream, options, file_ext, parse_cache,
+            ret = self.convert(stream, options, file_ext,
                                log, accelerators)
-            for key in list(parse_cache.keys()):
-                if os.path.abspath(key) != key:
-                    log.warn(('InputFormatPlugin: %s returned a '
-                             'relative path: %s')%(self.name, key)
-                             )
-                    parse_cache[os.path.abspath(key)] = parse_cache.pop(key)
-
         if options.debug_input is not None:
             options.debug_input = os.path.abspath(options.debug_input)
             if not os.path.exists(options.debug_input):
                 os.makedirs(options.debug_input)
             shutil.rmtree(options.debug_input)
-            for f, obj in parse_cache.items():
-                if hasattr(obj, 'cssText'):
-                    raw = obj.cssText
-                else:
-                    raw = html.tostring(obj, encoding='utf-8', method='xml',
-                         include_meta_content_type=True, pretty_print=True)
-                if isinstance(raw, unicode):
-                    raw = raw.encode('utf-8')
-                open(f, 'wb').write(raw)
             shutil.copytree('.', options.debug_input)
 
-
-
         return ret
 
 
diff --git a/src/calibre/customize/profiles.py b/src/calibre/customize/profiles.py
index a3a7e22298..bd11a89bed 100644
--- a/src/calibre/customize/profiles.py
+++ b/src/calibre/customize/profiles.py
@@ -7,7 +7,7 @@ import sys, re
 from calibre.customize import Plugin
 
 class InputProfile(Plugin):
-    
+
     author = 'Kovid Goyal'
     supported_platforms = set(['windows', 'osx', 'linux'])
     can_be_disabled = False
@@ -20,40 +20,40 @@ class InputProfile(Plugin):
     short_name  = 'default' # Used in the CLI so dont use spaces etc. in it
     description = _('This profile tries to provide sane defaults and is useful '
                     'if you know nothing about the input document.')
-                  
+
 input_profiles = [InputProfile]
-    
+
 
 class OutputProfile(Plugin):
-    
+
     author = 'Kovid Goyal'
     supported_platforms = set(['windows', 'osx', 'linux'])
     can_be_disabled = False
     type = _('Output profile')
-    
+
     name        = 'Default Output Profile'
     short_name  = 'default' # Used in the CLI so dont use spaces etc. in it
     description = _('This profile tries to provide sane defaults and is useful '
                     'if you want to produce a document intended to be read at a '
                     'computer or on a range of devices.')
-    
+
     epub_flow_size            = sys.maxint
     screen_size               = None
-    remove_special_chars      = False
+    remove_special_chars      = None
     remove_object_tags        = False
-    
+
 class SonyReader(OutputProfile):
-    
+
     name        = 'Sony Reader'
     short_name  = 'sony'
     description = _('This profile is intended for the SONY PRS line. '
                     'The 500/505/700 etc.')
-     
+
     epub_flow_size            = 270000
     screen_size               = (590, 765)
     remove_special_chars      = re.compile(u'[\u200b\u00ad]')
     remove_object_tags        = True
-    
-    
-    
-output_profiles = [OutputProfile, SonyReader]
\ No newline at end of file
+
+
+
+output_profiles = [OutputProfile, SonyReader]
diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py
index 44e2fda0c3..0e2f98fde4 100644
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -8,6 +8,7 @@ import os
 from calibre.customize.conversion import OptionRecommendation
 from calibre.customize.ui import input_profiles, output_profiles, \
         plugin_for_input_format, plugin_for_output_format
+from calibre.ebooks.conversion.preprocess import HTMLPreProcessor
 
 class OptionValues(object):
     pass
@@ -258,16 +259,17 @@ OptionRecommendation(name='language',
         # heavy lifting.
         from calibre.ebooks.oeb.reader import OEBReader
         from calibre.ebooks.oeb.base import OEBBook
-        parse_cache, accelerators = {}, {}
+        accelerators = {}
 
         opfpath = self.input_plugin(open(self.input, 'rb'), self.opts,
-                                    self.input_fmt, parse_cache, self.log,
+                                    self.input_fmt, self.log,
                                     accelerators)
-
+        html_preprocessor = HTMLPreProcessor()
         self.reader = OEBReader()
-        self.oeb = OEBBook(self.log, parse_cache=parse_cache)
+        self.oeb = OEBBook(self.log, html_preprocessor=html_preprocessor)
         # Read OEB Book into OEBBook
         self.reader(self.oeb, opfpath)
 
 
 
+
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
new file mode 100644
index 0000000000..f544a331d8
--- /dev/null
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -0,0 +1,123 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import with_statement
+
+__license__   = 'GPL v3'
+__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import re, functools
+
+from calibre import entity_to_unicode
+
+XMLDECL_RE    = re.compile(r'^\s*<[?]xml.*?[?]>')
+SVG_NS       = 'http://www.w3.org/2000/svg'
+XLINK_NS     = 'http://www.w3.org/1999/xlink'
+
+convert_entities = functools.partial(entity_to_unicode, exceptions=['quot', 'apos', 'lt', 'gt', 'amp'])
+_span_pat = re.compile('<span.*?</span>', re.DOTALL|re.IGNORECASE)
+
+
+def sanitize_head(match):
+    # Rebuild <head> from its captured contents with every <span> removed
+    cleaned = _span_pat.sub('', match.group(1))
+    return '<head>\n' + cleaned + '\n</head>'
+
+
+class CSSPreProcessor(object):
+    '''Callable that removes every @page rule from a CSS string.'''
+
+    PAGE_PAT   = re.compile(r'@page[^{]*?{[^}]*?}')
+
+    def __call__(self, data):
+        return self.PAGE_PAT.sub('', data)
+
+class HTMLPreProcessor(object):
+    '''Callable that applies regex clean-up rules to raw HTML markup.'''
+    PREPROCESS = [
+                  # Some idiotic HTML generators (Frontpage I'm looking at you)
+                  # Put all sorts of crap into <head>. This messes up lxml
+                  (re.compile(r'<head[^>]*>(.*?)</head>', re.IGNORECASE|re.DOTALL),
+                   sanitize_head),
+                  # Convert all entities, since lxml doesn't handle them well
+                  (re.compile(r'&(\S+?);'), convert_entities),
+                  # Remove the <![if/endif tags inserted by everybody's darling, MS Word
+                  (re.compile(r'</{0,1}!\[(end){0,1}if\]{0,1}>', re.IGNORECASE),
+                   lambda match: ''),
+                  ]
+
+    # Fix pdftohtml markup
+    PDFTOHTML  = [
+                  # Replace <hr> tags with <br />
+                  (re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br />'),
+                  # Remove page numbers
+                  (re.compile(r'\d+<br>', re.IGNORECASE), lambda match: ''),
+                  # Replace <br><br> with <p>, then drop a <br> ending a long (>= 40 chars) text line
+                  (re.compile(r'<br.*?>\s*<br.*?>', re.IGNORECASE), lambda match: '<p>'),
+                  (re.compile(r'(.*)<br.*?>', re.IGNORECASE),
+                   lambda match: match.group() if \
+                           re.match('<', match.group(1).lstrip()) or \
+                           len(match.group(1)) < 40  else match.group(1)),
+                  # Remove hyphenation
+                  (re.compile(r'-\n\r?'), lambda match: ''),
+
+                  # Remove gray background
+                  (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),
+
+                  # Remove non breaking spaces
+                  (re.compile(ur'\u00a0'), lambda match : ' '),
+
+                  ]
+
+    # Fix Book Designer markup
+    BOOK_DESIGNER = [
+                     # Turn <hr> into a page-break span
+                     (re.compile('<hr>', re.IGNORECASE),
+                      lambda match : '<span style="page-break-after:always"> </span>'),
+                     # Create header tags (group 3 captures the heading text)
+                     (re.compile('<h2[^><]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>([^><]*?)</h2>', re.IGNORECASE),
+                      lambda match : '<h1 id="BookTitle" align="%s">%s</h1>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
+                     (re.compile('<h2[^><]*?id=BookAuthor[^><]*?(align=)*(?(1)(\w+))*[^><]*?>([^><]*?)</h2>', re.IGNORECASE),
+                      lambda match : '<h2 id="BookAuthor" align="%s">%s</h2>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
+                     (re.compile('<span[^><]*?id=title[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
+                      lambda match : '<h2 class="title">%s</h2>'%(match.group(1),)),
+                     (re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
+                      lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
+                     ]
+
+    def is_baen(self, src):
+        return re.compile(r'<meta\s+name="Publisher"\s+content=".*?Baen.*?"',
+                          re.IGNORECASE).search(src) is not None
+
+    def is_book_designer(self, raw):
+        return re.search('<H2[^><]*id=BookTitle', raw) is not None
+
+    def is_pdftohtml(self, src):
+        return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
+
+    def __call__(self, html, remove_special_chars=None):
+        if remove_special_chars is not None:
+            html = remove_special_chars.sub('', html)
+        if self.is_baen(html):
+            rules = []
+        elif self.is_book_designer(html):
+            rules = self.BOOK_DESIGNER
+        elif self.is_pdftohtml(html):
+            rules = self.PDFTOHTML
+        else:
+            rules = []
+        for rule in self.PREPROCESS + rules:
+            html = rule[0].sub(rule[1], html)
+
+        # Handle broken XHTML w/ SVG (ugh)
+        if 'svg:' in html and SVG_NS not in html:
+            html = html.replace(
+                '<html', '<html xmlns:svg="%s"' % SVG_NS, 1)
+        if 'xlink:' in html and XLINK_NS not in html:
+            html = html.replace(
+                '<html', '<html xmlns:xlink="%s"' % XLINK_NS, 1)
+
+        html = XMLDECL_RE.sub('', html)
+
+        return html
+
diff --git a/src/calibre/ebooks/mobi/input.py b/src/calibre/ebooks/mobi/input.py
index b3400c54e1..8f2e24a831 100644
--- a/src/calibre/ebooks/mobi/input.py
+++ b/src/calibre/ebooks/mobi/input.py
@@ -6,25 +6,28 @@ __docformat__ = 'restructuredtext en'
 from calibre.customize.conversion import InputFormatPlugin
 
 class MOBIInput(InputFormatPlugin):
-    
+
     name        = 'MOBI Input'
     author      = 'Kovid Goyal'
     description = 'Convert MOBI files (.mobi, .prc, .azw) to HTML'
     file_types  = set(['mobi', 'prc', 'azw'])
-    
-    def convert(self, stream, options, file_ext, parse_cache, log, 
+
+    def convert(self, stream, options, file_ext, log,
                 accelerators):
         from calibre.ebooks.mobi.reader import MobiReader
-        mr = MobiReader(stream, log, options.input_encoding, 
+        from lxml import html
+        mr = MobiReader(stream, log, options.input_encoding,
                         options.debug_input)
+        parse_cache = {}
         mr.extract_content('.', parse_cache)
-        raw = parse_cache.get('calibre_raw_mobi_markup', False)
+        raw = parse_cache.pop('calibre_raw_mobi_markup', False)
         if raw:
             if isinstance(raw, unicode):
                 raw = raw.encode('utf-8')
             open('debug-raw.html', 'wb').write(raw)
         for f, root in parse_cache.items():
-            if '.' in f:
-                accelerators[f] = {'pagebreaks':root.xpath(
-                                            '//div[@class="mbp_pagebreak"]')}
-        return mr.created_opf_path
\ No newline at end of file
+            with open(f, 'wb') as q:
+                q.write(html.tostring(root, encoding='utf-8', method='xml',
+                    include_meta_content_type=False))
+            accelerators.setdefault('pagebreaks', {})[f] = '//div[@class="mbp_pagebreak"]'
+        return mr.created_opf_path
diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py
index c1e3549b10..4ce984b9a8 100644
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@@ -20,6 +20,8 @@ from cssutils import CSSParser
 from calibre.translations.dynamic import translate
 from calibre.ebooks.chardet import xml_to_unicode
 from calibre.ebooks.oeb.entitydefs import ENTITYDEFS
+from calibre.ebooks.conversion.preprocess import HTMLPreProcessor, \
+        CSSPreProcessor
 
 XML_NS       = 'http://www.w3.org/XML/1998/namespace'
 XHTML_NS     = 'http://www.w3.org/1999/xhtml'
@@ -207,6 +209,10 @@ class OEBError(Exception):
     """Generic OEB-processing error."""
     pass
 
+class NotHTML(OEBError):
+    '''Raised when a file that should be HTML (as per manifest) is not'''
+    pass
+
 class NullContainer(object):
     """An empty container.
 
@@ -575,14 +581,7 @@ class Manifest(object):
         def _parse_xhtml(self, data):
             # Convert to Unicode and normalize line endings
             data = self.oeb.decode(data)
-            data = XMLDECL_RE.sub('', data)
-            # Handle broken XHTML w/ SVG (ugh)
-            if 'svg:' in data and SVG_NS not in data:
-                data = data.replace(
-                    '<html', '<html xmlns:svg="%s"' % SVG_NS, 1)
-            if 'xlink:' in data and XLINK_NS not in data:
-                data = data.replace(
-                    '<html', '<html xmlns:xlink="%s"' % XLINK_NS, 1)
+            data = self.oeb.html_preprocessor(data)
             # Try with more & more drastic measures to parse
             try:
                 data = etree.fromstring(data)
@@ -606,7 +605,7 @@ class Manifest(object):
                         data = etree.fromstring(data, parser=RECOVER_PARSER)
             # Force into the XHTML namespace
             if barename(data.tag) != 'html':
-                raise OEBError(
+                raise NotHTML(
                     'File %r does not appear to be (X)HTML' % self.href)
             elif not namespace(data.tag):
                 data.attrib['xmlns'] = XHTML_NS
@@ -659,6 +658,7 @@ class Manifest(object):
 
         def _parse_css(self, data):
             data = self.oeb.decode(data)
+            data = self.oeb.css_preprocessor(data)
             data = XHTML_CSS_NAMESPACE + data
             parser = CSSParser(log=self.oeb.logger, loglevel=logging.WARNING,
                                fetcher=self._fetch_css)
@@ -793,7 +793,7 @@ class Manifest(object):
         MIME type which is not one of the OPS core media types.  Either the
         item's data itself may be provided with :param:`data`, or a loader
         function for the data may be provided with :param:`loader`, or the
-        item's data may latter be set manually via the :attr:`data` attribute.
+        item's data may later be set manually via the :attr:`data` attribute.
         """
         item = self.Item(
             self.oeb, id, href, media_type, fallback, loader, data)
@@ -840,6 +840,9 @@ class Manifest(object):
         for item in self.items:
             yield item
 
+    def __len__(self):
+        return len(self.items)
+
     def values(self):
         return list(self.items)
 
@@ -1255,17 +1258,22 @@ class OEBBook(object):
     COVER_SVG_XP    = XPath('h:body//svg:svg[position() = 1]')
     COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]')
 
-    def __init__(self, logger, parse_cache={}, encoding='utf-8',
-                 pretty_print=False):
-        """Create empty book.  Optional arguments:
+    def __init__(self, logger,
+            html_preprocessor=HTMLPreProcessor(),
+            css_preprocessor=CSSPreProcessor(),
+            encoding='utf-8', pretty_print=False):
+        """Create empty book.  Arguments:
 
-        :param parse_cache: A cache of parsed XHTML/CSS. Keys are absolute
-            paths to the cached files and values are lxml root objects and
-            cssutils stylesheets.
         :param:`encoding`: Default encoding for textual content read
             from an external container.
         :param:`pretty_print`: Whether or not the canonical string form
             of XML markup is pretty-printed.
+        :param html_preprocessor: A callable that takes a unicode object
+            and returns a unicode object. Will be called on all html files
+            before they are parsed.
+        :param css_preprocessor: A callable that takes a unicode object
+            and returns a unicode object. Will be called on all CSS files
+            before they are parsed.
         :param:`logger`: A Log object to use for logging all messages
             related to the processing of this book.  It is accessible
             via the instance data members :attr:`logger,log`.
@@ -1286,6 +1294,8 @@ class OEBBook(object):
         """
 
         self.encoding = encoding
+        self.html_preprocessor = html_preprocessor
+        self.css_preprocessor = css_preprocessor
         self.pretty_print = pretty_print
         self.logger = self.log = logger
         self.version = '2.0'
diff --git a/src/calibre/ebooks/oeb/reader.py b/src/calibre/ebooks/oeb/reader.py
index c62540e15a..60c2cf23bf 100644
--- a/src/calibre/ebooks/oeb/reader.py
+++ b/src/calibre/ebooks/oeb/reader.py
@@ -32,13 +32,13 @@ __all__ = ['OEBReader']
 
 class OEBReader(object):
     """Read an OEBPS 1.x or OPF/OPS 2.0 file collection."""
-    
+
     COVER_SVG_XP    = XPath('h:body//svg:svg[position() = 1]')
     COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]')
 
     Container = DirContainer
     """Container type used to access book files.  Override in sub-classes."""
-    
+
     DEFAULT_PROFILE = 'PRS505'
     """Default renderer profile for content read with this Reader."""
 
@@ -67,7 +67,7 @@ class OEBReader(object):
         opf = self._read_opf()
         self._all_from_opf(opf)
         return oeb
-    
+
     def _clean_opf(self, opf):
         nsmap = {}
         for elem in opf.iter(tag=etree.Element):
@@ -94,7 +94,7 @@ class OEBReader(object):
             for element in xpath(opf, tag):
                 nroot.append(element)
         return nroot
-    
+
     def _read_opf(self):
         data = self.oeb.container.read(None)
         data = self.oeb.decode(data)
@@ -111,7 +111,7 @@ class OEBReader(object):
             raise OEBError('Invalid namespace %r for OPF document' % ns)
         opf = self._clean_opf(opf)
         return opf
-    
+
     def _metadata_from_opf(self, opf):
         uid = opf.get('unique-identifier', None)
         self.oeb.uid = None
@@ -161,10 +161,30 @@ class OEBReader(object):
             self.logger.warn('Title not specified')
             metadata.add('title', self.oeb.translate(__('Unknown')))
 
-    def _manifest_add_missing(self):
+    def _manifest_prune_invalid(self):
+        '''
+        Drop manifest items whose document/stylesheet content fails to load,
+        so that a few corrupt files cannot abort the whole conversion. Returns
+        the list of items that were removed from the manifest.
+        '''
+        bad = []
+        check = OEB_DOCS+OEB_STYLES
+        for item in list(self.oeb.manifest.values()):
+            if item.media_type in check:
+                try:
+                    item.data  # accessed only for its side effect; presumably forces the parse
+                except Exception:
+                    self.logger.exception('Failed to parse content in %s'%
+                            item.href)
+                    bad.append(item)
+                    self.oeb.manifest.remove(item)
+        return bad
+
+    def _manifest_add_missing(self, invalid):
         manifest = self.oeb.manifest
         known = set(manifest.hrefs)
         unchecked = set(manifest.values())
+        bad = []
         while unchecked:
             new = set()
             for item in unchecked:
@@ -190,6 +210,13 @@ class OEBReader(object):
             unchecked.clear()
             for href in new:
                 known.add(href)
+                is_invalid = False
+                for item in invalid:
+                    if href == item.abshref(urlnormalize(href)):
+                        is_invalid = True
+                        break
+                if is_invalid:
+                    continue
                 if not self.oeb.container.exists(href):
                     self.logger.warn('Referenced file %r not found' % href)
                     continue
@@ -199,7 +226,7 @@ class OEBReader(object):
                 media_type = guessed or BINARY_MIME
                 added = manifest.add(id, href, media_type)
                 unchecked.add(added)
-    
+
     def _manifest_from_opf(self, opf):
         manifest = self.oeb.manifest
         for elem in xpath(opf, '/o2:package/o2:manifest/o2:item'):
@@ -222,8 +249,9 @@ class OEBReader(object):
                 self.logger.warn(u'Duplicate manifest id %r' % id)
                 id, href = manifest.generate(id, href)
             manifest.add(id, href, media_type, fallback)
-        self._manifest_add_missing()
-    
+        invalid = self._manifest_prune_invalid()
+        self._manifest_add_missing(invalid)
+
     def _spine_add_extra(self):
         manifest = self.oeb.manifest
         spine = self.oeb.spine
@@ -256,7 +284,7 @@ class OEBReader(object):
                 self.logger.warn(
                     'Spine-referenced file %r not in spine' % item.href)
             spine.add(item, linear=False)
-    
+
     def _spine_from_opf(self, opf):
         spine = self.oeb.spine
         manifest = self.oeb.manifest
@@ -270,7 +298,7 @@ class OEBReader(object):
         if len(spine) == 0:
             raise OEBError("Spine is empty")
         self._spine_add_extra()
-    
+
     def _guide_from_opf(self, opf):
         guide = self.oeb.guide
         manifest = self.oeb.manifest
@@ -281,7 +309,7 @@ class OEBReader(object):
                 self.logger.warn(u'Guide reference %r not found' % href)
                 continue
             guide.add(elem.get('type'), elem.get('title'), href)
-    
+
     def _find_ncx(self, opf):
         result = xpath(opf, '/o2:package/o2:spine/@toc')
         if result:
@@ -294,9 +322,9 @@ class OEBReader(object):
         for item in self.oeb.manifest.values():
             if item.media_type == NCX_MIME:
                 self.oeb.manifest.remove(item)
-                return item                
+                return item
         return None
-    
+
     def _toc_from_navpoint(self, item, toc, navpoint):
         children = xpath(navpoint, 'ncx:navPoint')
         for child in children:
@@ -314,7 +342,7 @@ class OEBReader(object):
             klass = child.get('class')
             node = toc.add(title, href, id=id, klass=klass)
             self._toc_from_navpoint(item, node, child)
-    
+
     def _toc_from_ncx(self, item):
         if item is None:
             return False
@@ -328,7 +356,7 @@ class OEBReader(object):
         for navmap in navmaps:
             self._toc_from_navpoint(item, toc, navmap)
         return True
-    
+
     def _toc_from_tour(self, opf):
         result = xpath(opf, 'o2:tours/o2:tour')
         if not result:
@@ -345,11 +373,11 @@ class OEBReader(object):
             path, _ = urldefrag(urlnormalize(href))
             if path not in self.oeb.manifest.hrefs:
                 self.logger.warn('TOC reference %r not found' % href)
-                continue            
+                continue
             id = site.get('id')
             toc.add(title, href, id=id)
         return True
-    
+
     def _toc_from_html(self, opf):
         if 'toc' not in self.oeb.guide:
             return False
@@ -381,7 +409,7 @@ class OEBReader(object):
         for href in order:
             toc.add(' '.join(titles[href]), href)
         return True
-    
+
     def _toc_from_spine(self, opf):
         toc = self.oeb.toc
         titles = []
@@ -408,14 +436,14 @@ class OEBReader(object):
             if not item.linear: continue
             toc.add(title, item.href)
         return True
-    
+
     def _toc_from_opf(self, opf, item):
         if self._toc_from_ncx(item): return
         if self._toc_from_tour(opf): return
         self.logger.warn('No metadata table of contents found')
         if self._toc_from_html(opf): return
         self._toc_from_spine(opf)
-    
+
     def _pages_from_ncx(self, opf, item):
         if item is None:
             return False
@@ -436,7 +464,7 @@ class OEBReader(object):
             klass = ptarget.get('class')
             pages.add(name, href, type=type, id=id, klass=klass)
         return True
-    
+
     def _find_page_map(self, opf):
         result = xpath(opf, '/o2:package/o2:spine/@page-map')
         if result:
@@ -451,7 +479,7 @@ class OEBReader(object):
                 self.oeb.manifest.remove(item)
                 return item
         return None
-    
+
     def _pages_from_page_map(self, opf):
         item = self._find_page_map(opf)
         if item is None:
@@ -472,12 +500,12 @@ class OEBReader(object):
                 type = 'front'
             pages.add(name, href, type=type)
         return True
-    
+
     def _pages_from_opf(self, opf, item):
         if self._pages_from_ncx(opf, item): return
         if self._pages_from_page_map(opf): return
         return
-    
+
     def _cover_from_html(self, hcover):
         with TemporaryDirectory('_html_cover') as tdir:
             writer = OEBWriter()
@@ -488,7 +516,7 @@ class OEBReader(object):
         id, href = self.oeb.manifest.generate('cover', 'cover.jpeg')
         item = self.oeb.manifest.add(id, href, JPEG_MIME, data=data)
         return item
-        
+
     def _locate_cover_image(self):
         if self.oeb.metadata.cover:
             id = str(self.oeb.metadata.cover[0])
@@ -525,14 +553,14 @@ class OEBReader(object):
             if item is not None and item.media_type in OEB_IMAGES:
                 return item
         return self._cover_from_html(hcover)
-        
+
     def _ensure_cover_image(self):
         cover = self._locate_cover_image()
         if self.oeb.metadata.cover:
             self.oeb.metadata.cover[0].value = cover.id
             return
         self.oeb.metadata.add('cover', cover.id)
-    
+
     def _all_from_opf(self, opf):
         self.oeb.version = opf.get('version', '1.2')
         self._metadata_from_opf(opf)
@@ -543,7 +571,7 @@ class OEBReader(object):
         self._toc_from_opf(opf, item)
         self._pages_from_opf(opf, item)
         self._ensure_cover_image()
-    
+
 
 def main(argv=sys.argv):
     reader = OEBReader()