MOBI Input: Merge multiple html sections instead of just using the largest one

This commit is contained in:
Kovid Goyal 2009-06-11 16:04:54 -07:00
parent 233c3bcb1b
commit 8d1bcee8dd
3 changed files with 18 additions and 11 deletions

View File

@@ -233,7 +233,7 @@ class HTMLInput(InputFormatPlugin):
name = 'HTML Input' name = 'HTML Input'
author = 'Kovid Goyal' author = 'Kovid Goyal'
description = 'Convert HTML and OPF files to an OEB' description = 'Convert HTML and OPF files to an OEB'
file_types = set(['opf', 'html', 'htm', 'xhtml', 'xhtm']) file_types = set(['opf', 'html', 'htm', 'xhtml', 'xhtm', 'shtm', 'shtml'])
options = set([ options = set([
OptionRecommendation(name='breadth_first', OptionRecommendation(name='breadth_first',

View File

@@ -315,18 +315,24 @@ class MobiReader(object):
htmls = list(root.xpath('//html')) htmls = list(root.xpath('//html'))
if len(htmls) > 1: if len(htmls) > 1:
self.log.warn('Markup contains multiple <html> tags') self.log.warn('Markup contains multiple <html> tags, merging.')
# Keep only the largest head and body # Merge all <head> and <body> sections
for h in htmls:
p = h.getparent()
if hasattr(p, 'remove'):
p.remove(h)
bodies, heads = root.xpath('//body'), root.xpath('//head') bodies, heads = root.xpath('//body'), root.xpath('//head')
def sz(x): return len(list(x.iter()))
def scmp(x, y): return cmp(sz(x), sz(y))
body = list(sorted(bodies, cmp=scmp))
head = list(sorted(heads, cmp=scmp))
for x in root: root.remove(x) for x in root: root.remove(x)
if head: head, body = map(root.makeelement, ('head', 'body'))
root.append(head[-1]) for h in heads:
if body: for x in h:
root.append(body[-1]) h.remove(x)
head.append(x)
for b in bodies:
for x in b:
b.remove(x)
body.append(x)
root.append(head), root.append(body)
for x in root.xpath('//script'): for x in root.xpath('//script'):
x.getparent().remove(x) x.getparent().remove(x)

View File

@@ -823,6 +823,7 @@ class Manifest(object):
for key in list(body.attrib.keys()): for key in list(body.attrib.keys()):
if key == 'lang' or key.endswith('}lang'): if key == 'lang' or key.endswith('}lang'):
body.attrib.pop(key) body.attrib.pop(key)
return data return data
def _parse_css(self, data): def _parse_css(self, data):