From b839cc76e127e9b139b60315e65073e389eb8778 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 9 May 2009 01:12:23 -0700
Subject: [PATCH] MOBI Input: Fix regression affecting detection of table of
 contents and metadata embedded in content when converting MOBI files. Also
 fix #2407 (error converting to lrf in 0.5.10)

---
 src/calibre/ebooks/mobi/reader.py | 49 ++++++++++++++++++++++---------
 1 file changed, 35 insertions(+), 14 deletions(-)
diff --git a/src/calibre/ebooks/mobi/reader.py b/src/calibre/ebooks/mobi/reader.py
index 44d3fbd9f7..de396f4966 100644
--- a/src/calibre/ebooks/mobi/reader.py
+++ b/src/calibre/ebooks/mobi/reader.py
@@ -276,19 +276,6 @@ class MobiReader(object):
         self.replace_page_breaks()
         self.cleanup_html()
 
-        if self.processed_html.startswith('<body'):
-            self.processed_html = '<html><head></head>'+self.processed_html+'</html>'
-        self.processed_html = \
-            re.compile('<head>', re.IGNORECASE).sub(
-                '\n<head>\n'
-                '<style type="text/css">\n'
-                'blockquote { margin: 0em 0em 0em 1.25em; text-align: justify; }\n'
-                'p { margin: 0em; text-align: justify; }\n'
-                '.bold { font-weight: bold; }\n'
-                '.italic { font-style: italic; }\n'
-                '</style>\n',
-                self.processed_html)
-
         if self.verbose:
             print 'Parsing HTML...'
         root = html.fromstring(self.processed_html)
@@ -296,7 +283,7 @@ class MobiReader(object):
             from lxml.html import soupparser
             print 'Markup contains unclosed <p> tags, parsing using BeatifulSoup'
             root = soupparser.fromstring(self.processed_html)
-        if root[0].tag != 'html':
+        if root.tag != 'html':
             nroot = html.fromstring('<html><head></head><body></body></html>')
             bod = nroot.find('body')
             for child in list(root):
@@ -304,6 +291,40 @@ class MobiReader(object):
                 bod.append(child)
             root = nroot
 
+        htmls = list(root.xpath('//html'))
+        if len(htmls) > 1:
+            print 'Markup contains multiple <html> tags'
+            # Keep only the largest <head> and <body>, where size is the number of nodes in the subtree (see sz)
+            bodies, heads = root.xpath('//body'), root.xpath('//head')
+            def sz(x): return len(list(x.iter()))
+            def scmp(x, y): return cmp(sz(x), sz(y))
+            body = list(sorted(bodies, cmp=scmp))
+            head = list(sorted(heads, cmp=scmp))
+            for x in root: root.remove(x)
+            if head:
+                root.append(head[-1])
+            if body:
+                root.append(body[-1])
+        for x in root.xpath('//script'):
+            x.getparent().remove(x)
+
+        head = root.xpath('//head')
+        if head:
+            head = head[0]
+        else:
+            head = root.makeelement('head', {})
+            root.insert(0, head)
+        head.text = '\n\t'
+        style = head.makeelement('style', {'type':'text/css'})
+        head.insert(0, style)
+        style.tail = '\n\t'
+        style.text = '''
+                blockquote { margin: 0em 0em 0em 1.25em; text-align: justify }
+                p { margin: 0em; text-align: justify }
+                .bold { font-weight: bold }
+                .italic { font-style: italic }
+        '''
+
         self.upshift_markup(root)
         guides = root.xpath('//guide')
         guide = guides[0] if guides else None