From 91bb71ed8467cf9a5608b27c4d505141caa87a21 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 25 Apr 2009 08:26:58 -0700
Subject: [PATCH] Misc. minor fixes

---
 src/calibre/ebooks/epub/output.py          | 23 +++++++++++-----------
 src/calibre/ebooks/mobi/reader.py          |  5 +++++
 src/calibre/ebooks/oeb/transforms/guide.py | 14 -------------
 3 files changed, 17 insertions(+), 25 deletions(-)
diff --git a/src/calibre/ebooks/epub/output.py b/src/calibre/ebooks/epub/output.py
index a43ca4e5e3..1b37f054b0 100644
--- a/src/calibre/ebooks/epub/output.py
+++ b/src/calibre/ebooks/epub/output.py
@@ -91,7 +91,7 @@ class EPUBOutput(OutputFormatPlugin):
             self.condense_ncx([os.path.join(tdir, x) for x in os.listdir(tdir)\
                     if x.endswith('.ncx')][0])
 
-            from calibre.epub import initialize_container
+            from calibre.ebooks.epub import initialize_container
             epub = initialize_container(output_path, os.path.basename(opf))
             epub.add_dir(tdir)
             epub.close()
@@ -136,7 +136,7 @@ class EPUBOutput(OutputFormatPlugin):
             if 'cover' in g:
                 tp = self.TITLEPAGE_COVER%unquote(g['cover'].href)
                 id, href = m.generate('titlepage', 'titlepage.xhtml')
-                item = m.add(id, href, guess_type('t.xhtml'),
+                item = m.add(id, href, guess_type('t.xhtml')[0],
                         data=etree.fromstring(tp))
             else:
                 item = self.default_cover()
@@ -146,7 +146,8 @@ class EPUBOutput(OutputFormatPlugin):
         if item is not None:
             self.oeb.spine.insert(0, item, True)
             self.oeb.guide.refs['cover'].href = item.href
-            self.oeb.guide.refs['titlepage'].href = item.href
+            if 'titlepage' in self.oeb.guide.refs:
+                self.oeb.guide.refs['titlepage'].href = item.href
 
 
 
@@ -180,7 +181,7 @@ class EPUBOutput(OutputFormatPlugin):
                 body = body[0]
             # Replace <br> that are children of <body> as ADE doesn't handle them
             if hasattr(body, 'xpath'):
-                for br in body.xpath('./h:br'):
+                for br in XPath('./h:br')(body):
                     if br.getparent() is None:
                         continue
                     try:
@@ -204,29 +205,29 @@ class EPUBOutput(OutputFormatPlugin):
 
 
             if self.opts.output_profile.remove_object_tags:
-                for tag in root.xpath('//h:embed'):
+                for tag in XPath('//h:embed')(root):
                     tag.getparent().remove(tag)
-                for tag in root.xpath('//h:object'):
+                for tag in XPath('//h:object')(root):
                     if tag.get('type', '').lower().strip() in ('image/svg+xml',):
                         continue
                     tag.getparent().remove(tag)
 
-            for tag in root.xpath('//h:title|//h:style'):
+            for tag in XPath('//h:title|//h:style')(root):
                 if not tag.text:
                     tag.getparent().remove(tag)
-            for tag in root.xpath('//h:script'):
+            for tag in XPath('//h:script')(root):
                 if not tag.text and not tag.get('src', False):
                     tag.getparent().remove(tag)
 
-            for tag in root.xpath('//h:form'):
+            for tag in XPath('//h:form')(root):
                 tag.getparent().remove(tag)
 
-            for tag in root.xpath('//h:center'):
+            for tag in XPath('//h:center')(root):
                 tag.tag = XHTML('div')
                 tag.set('style', 'text-align:center')
 
             # ADE can't handle &amp; in an img url
-            for tag in self.root.xpath('//h:img[@src]'):
+            for tag in XPath('//h:img[@src]')(root):
                 tag.set('src', tag.get('src', '').replace('&', ''))
 
             stylesheet = self.oeb.manifest.hrefs['stylesheet.css']
diff --git a/src/calibre/ebooks/mobi/reader.py b/src/calibre/ebooks/mobi/reader.py
index 38de3476d1..25b4114cc2 100644
--- a/src/calibre/ebooks/mobi/reader.py
+++ b/src/calibre/ebooks/mobi/reader.py
@@ -298,6 +298,11 @@ class MobiReader(object):
 
         self.log.debug('Parsing HTML...')
         root = html.fromstring(self.processed_html)
+        if root.xpath('descendant::p/descendant::p'):
+            from lxml.html import soupparser
+            self.log.warning('Markup contains unclosed <p> tags, parsing using',
+                'BeatifulSoup')
+            root = soupparser.fromstring(self.processed_html)
         self.upshift_markup(root)
         guides = root.xpath('//guide')
         guide = guides[0] if guides else None
diff --git a/src/calibre/ebooks/oeb/transforms/guide.py b/src/calibre/ebooks/oeb/transforms/guide.py
index dc7123446b..aaeba67d80 100644
--- a/src/calibre/ebooks/oeb/transforms/guide.py
+++ b/src/calibre/ebooks/oeb/transforms/guide.py
@@ -14,10 +14,6 @@ class Clean(object):
         from calibre.ebooks.oeb.base import urldefrag
         self.oeb, self.log, self.opts = oeb, oeb.log, opts
 
-        protected_hrefs = set([])
-        if 'titlepage' in self.oeb.guide:
-            protected_hrefs.add(urldefrag(
-                self.oeb.guide['titlepage'].href)[0])
         if 'cover' not in self.oeb.guide:
             covers = []
             for x in ('other.ms-coverimage-standard',
@@ -35,20 +31,10 @@ class Clean(object):
                     self.log('Choosing %s:%s as the cover'%(ref.type, ref.href))
                 ref.type = 'cover'
                 self.oeb.guide.refs['cover'] = ref
-                protected_hrefs.add(urldefrag(ref.href)[0])
-        else:
-            protected_hrefs.add(urldefrag(self.oeb.guide.refs['cover'].href)[0])
 
         for x in list(self.oeb.guide):
             href = urldefrag(self.oeb.guide[x].href)[0]
             if x.lower() not in ('cover', 'titlepage'):
-                try:
-                    if href not in protected_hrefs:
-                        item = self.oeb.manifest.hrefs[href]
-                        if item not in self.oeb.spine:
-                            self.oeb.manifest.remove(self.oeb.manifest.hrefs[href])
-                except KeyError:
-                    pass
                 self.oeb.guide.remove(x)