MOBI Input: Fix regression affecting detection of table of contents and metadata embedded in content when converting MOBI files. Also fix #2407 (error converting to lrf in 0.5.10)

2025-07-09 03:04:10 -04:00 · 2009-05-09 01:12:23 -07:00 · 2009-05-09 01:12:23 -07:00 · b839cc76e1
commit b839cc76e1
parent 1c0f55e30c
1 changed files with 35 additions and 14 deletions
--- a/src/calibre/ebooks/mobi/reader.py
+++ b/src/calibre/ebooks/mobi/reader.py
@@ -276,19 +276,6 @@ class MobiReader(object):
        self.replace_page_breaks()
        self.cleanup_html()
        if self.processed_html.startswith('<body'):
            self.processed_html = '<html><head></head>'+self.processed_html+'</html>'
        self.processed_html = \
            re.compile('<head>', re.IGNORECASE).sub(
                '\n<head>\n'
                '<style type="text/css">\n'
                'blockquote { margin: 0em 0em 0em 1.25em; text-align: justify; }\n'
                'p { margin: 0em; text-align: justify; }\n'
                '.bold { font-weight: bold; }\n'
                '.italic { font-style: italic; }\n'
                '</style>\n',
                self.processed_html)
        if self.verbose:
            print 'Parsing HTML...'
        root = html.fromstring(self.processed_html)
@@ -296,7 +283,7 @@ class MobiReader(object):
            from lxml.html import soupparser
            print 'Markup contains unclosed <p> tags, parsing using BeatifulSoup'
            root = soupparser.fromstring(self.processed_html)
-        if root[0].tag != 'html':
+        if root.tag != 'html':
            nroot = html.fromstring('<html><head></head><body></body></html>')
            bod = nroot.find('body')
            for child in list(root):
@@ -304,6 +291,40 @@ class MobiReader(object):
                bod.append(child)
            root = nroot
        htmls = list(root.xpath('//html'))
        if len(htmls) > 1:
            print 'Markup contains multiple <html> tags'
            # Keep only the largest head and body
            bodies, heads = root.xpath('//body'), root.xpath('//head')
            def sz(x): return len(list(x.iter()))
            def scmp(x, y): return cmp(sz(x), sz(y))
            body = list(sorted(bodies, cmp=scmp))
            head = list(sorted(heads, cmp=scmp))
            for x in root: root.remove(x)
            if head:
                root.append(head[-1])
            if body:
                root.append(body[-1])
        for x in root.xpath('//script'):
            x.getparent().remove(x)
        head = root.xpath('//head')
        if head:
            head = head[0]
        else:
            head = root.makeelement('head', {})
            root.insert(0, head)
        head.text = '\n\t'
        style = head.makeelement('style', {'type':'text/css'})
        head.insert(0, style)
        style.tail = '\n\t'
        style.text = '''
                blockquote { margin: 0em 0em 0em 1.25em; text-align: justify }
                p { margin: 0em; text-align: justify }
                .bold { font-weight: bold }
                .italic { font-style: italic }
        '''
        self.upshift_markup(root)
        guides = root.xpath('//guide')
        guide = guides[0] if guides else None