From b839cc76e127e9b139b60315e65073e389eb8778 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 9 May 2009 01:12:23 -0700 Subject: [PATCH] MOBI Input: Fix regression affecting detection of table of contents and metadata embedded in content when converting MOBI files. Also fix #2407 (error converting to lrf in 0.5.10) --- src/calibre/ebooks/mobi/reader.py | 49 ++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 14 deletions(-) diff --git a/src/calibre/ebooks/mobi/reader.py b/src/calibre/ebooks/mobi/reader.py index 44d3fbd9f7..de396f4966 100644 --- a/src/calibre/ebooks/mobi/reader.py +++ b/src/calibre/ebooks/mobi/reader.py @@ -276,19 +276,6 @@ class MobiReader(object): self.replace_page_breaks() self.cleanup_html() - if self.processed_html.startswith('' - self.processed_html = \ - re.compile('', re.IGNORECASE).sub( - '\n\n' - '\n', - self.processed_html) - if self.verbose: print 'Parsing HTML...' root = html.fromstring(self.processed_html) @@ -296,7 +283,7 @@ class MobiReader(object): from lxml.html import soupparser print 'Markup contains unclosed

tags, parsing using BeatifulSoup' root = soupparser.fromstring(self.processed_html) - if root[0].tag != 'html': + if root.tag != 'html': nroot = html.fromstring('') bod = nroot.find('body') for child in list(root): @@ -304,6 +291,40 @@ class MobiReader(object): bod.append(child) root = nroot + htmls = list(root.xpath('//html')) + if len(htmls) > 1: + print 'Markup contains multiple tags' + # Keep only the largest head and body + bodies, heads = root.xpath('//body'), root.xpath('//head') + def sz(x): return len(list(x.iter())) + def scmp(x, y): return cmp(sz(x), sz(y)) + body = list(sorted(bodies, cmp=scmp)) + head = list(sorted(heads, cmp=scmp)) + for x in root: root.remove(x) + if head: + root.append(head[-1]) + if body: + root.append(body[-1]) + for x in root.xpath('//script'): + x.getparent().remove(x) + + head = root.xpath('//head') + if head: + head = head[0] + else: + head = root.makeelement('head', {}) + root.insert(0, head) + head.text = '\n\t' + style = head.makeelement('style', {'type':'text/css'}) + head.insert(0, style) + style.tail = '\n\t' + style.text = ''' + blockquote { margin: 0em 0em 0em 1.25em; text-align: justify } + p { margin: 0em; text-align: justify } + .bold { font-weight: bold } + .italic { font-style: italic } + ''' + self.upshift_markup(root) guides = root.xpath('//guide') guide = guides[0] if guides else None