Fix #1649 (2). Yet more handling for broken (X)HTML.

This commit is contained in:
Marshall T. Vandegrift 2009-01-26 08:47:58 -05:00
parent 3b5b9f1849
commit 0350cd79e3
2 changed files with 20 additions and 1 deletion

View File

@@ -350,6 +350,24 @@ class Manifest(object):
data = etree.fromstring(data) data = etree.fromstring(data)
for meta in self.META_XP(data): for meta in self.META_XP(data):
meta.getparent().remove(meta) meta.getparent().remove(meta)
head = xpath(data, '/h:html/h:head')
head = head[0] if head else None
if head is None:
self.oeb.logger.warn(
'File %r missing <head/> element' % self.href)
head = etree.Element(XHTML('head'))
data.insert(0, head)
title = etree.SubElement(head, XHTML('title'))
title.text = self.oeb.translate(__('Unknown'))
elif not xpath(data, '/h:html/h:head/h:title'):
self.oeb.logger.warn(
'File %r missing <title/> element' % self.href)
title = etree.SubElement(head, XHTML('title'))
title.text = self.oeb.translate(__('Unknown'))
if not xpath(data, '/h:html/h:body'):
self.oeb.logger.warn(
'File %r missing <body/> element' % self.href)
etree.SubElement(data, XHTML('body'))
return data return data
def data(): def data():

View File

@@ -13,6 +13,7 @@ from urlparse import urldefrag
from lxml import etree from lxml import etree
import cssutils import cssutils
from calibre.ebooks.oeb.base import XPNSMAP, CSS_MIME, OEB_DOCS from calibre.ebooks.oeb.base import XPNSMAP, CSS_MIME, OEB_DOCS
from calibre.ebooks.oeb.base import urlnormalize
LINK_SELECTORS = [] LINK_SELECTORS = []
for expr in ('//h:link/@href', '//h:img/@src', '//h:object/@data', for expr in ('//h:link/@href', '//h:img/@src', '//h:object/@data',
@@ -46,7 +47,7 @@ class ManifestTrimmer(object):
item.data is not None: item.data is not None:
hrefs = [sel(item.data) for sel in LINK_SELECTORS] hrefs = [sel(item.data) for sel in LINK_SELECTORS]
for href in chain(*hrefs): for href in chain(*hrefs):
href = item.abshref(href) href = item.abshref(urlnormalize(href))
if href in oeb.manifest.hrefs: if href in oeb.manifest.hrefs:
found = oeb.manifest.hrefs[href] found = oeb.manifest.hrefs[href]
if found not in used: if found not in used: