MOBI Input: Fix regression affecting detection of table of contents and metadata embedded in content when converting MOBI files. Also fix #2407 (error converting to lrf in 0.5.10)

2025-07-09 03:04:10 -04:00 · 2009-05-09 01:12:23 -07:00 · 2009-05-09 01:12:23 -07:00 · b839cc76e1
commit b839cc76e1
parent 1c0f55e30c
1 changed files with 35 additions and 14 deletions
--- a/src/calibre/ebooks/mobi/reader.py
+++ b/src/calibre/ebooks/mobi/reader.py
@@ -276,19 +276,6 @@ class MobiReader(object):
        self.replace_page_breaks()
        self.cleanup_html()

-        if self.processed_html.startswith('<body'):
-            self.processed_html = '<html><head></head>'+self.processed_html+'</html>'
-        self.processed_html = \
-            re.compile('<head>', re.IGNORECASE).sub(
-                '\n<head>\n'
-                '<style type="text/css">\n'
-                'blockquote { margin: 0em 0em 0em 1.25em; text-align: justify; }\n'
-                'p { margin: 0em; text-align: justify; }\n'
-                '.bold { font-weight: bold; }\n'
-                '.italic { font-style: italic; }\n'
-                '</style>\n',
-                self.processed_html)
-
        if self.verbose:
            print 'Parsing HTML...'
        root = html.fromstring(self.processed_html)
@@ -296,7 +283,7 @@ class MobiReader(object):
            from lxml.html import soupparser
            print 'Markup contains unclosed <p> tags, parsing using BeatifulSoup'
            root = soupparser.fromstring(self.processed_html)
-        if root[0].tag != 'html':
+        if root.tag != 'html':
            nroot = html.fromstring('<html><head></head><body></body></html>')
            bod = nroot.find('body')
            for child in list(root):
@@ -304,6 +291,40 @@ class MobiReader(object):
                bod.append(child)
            root = nroot

+        htmls = list(root.xpath('//html'))
+        if len(htmls) > 1:
+            print 'Markup contains multiple <html> tags'
+            # Keep only the largest head and body
+            bodies, heads = root.xpath('//body'), root.xpath('//head')
+            def sz(x): return len(list(x.iter()))
+            def scmp(x, y): return cmp(sz(x), sz(y))
+            body = list(sorted(bodies, cmp=scmp))
+            head = list(sorted(heads, cmp=scmp))
+            for x in root: root.remove(x)
+            if head:
+                root.append(head[-1])
+            if body:
+                root.append(body[-1])
+        for x in root.xpath('//script'):
+            x.getparent().remove(x)
+
+        head = root.xpath('//head')
+        if head:
+            head = head[0]
+        else:
+            head = root.makeelement('head', {})
+            root.insert(0, head)
+        head.text = '\n\t'
+        style = head.makeelement('style', {'type':'text/css'})
+        head.insert(0, style)
+        style.tail = '\n\t'
+        style.text = '''
+                blockquote { margin: 0em 0em 0em 1.25em; text-align: justify }
+                p { margin: 0em; text-align: justify }
+                .bold { font-weight: bold }
+                .italic { font-style: italic }
+        '''
+
        self.upshift_markup(root)
        guides = root.xpath('//guide')
        guide = guides[0] if guides else None