Various stability fixes for any2lit and any2mobi

This commit is contained in:
Kovid Goyal 2009-01-26 09:37:57 -08:00
commit 2823867347
4 changed files with 38 additions and 4 deletions

View File

@ -11,6 +11,7 @@ import sys, struct, cStringIO, os
import functools
import re
from urlparse import urldefrag
from urllib import unquote as urlunquote
from lxml import etree
from calibre.ebooks.lit import LitError
from calibre.ebooks.lit.maps import OPF_MAP, HTML_MAP
@ -611,6 +612,8 @@ class LitReader(object):
offset, raw = u32(raw), raw[4:]
internal, raw = consume_sized_utf8_string(raw)
original, raw = consume_sized_utf8_string(raw)
# The path should be stored unquoted, but some files store it URL-quoted, so unquote it defensively
original = urlunquote(original)
# Is this last one UTF-8 or ASCIIZ?
mime_type, raw = consume_sized_utf8_string(raw, zpad=True)
self.manifest[internal] = ManifestItem(

View File

@ -331,6 +331,13 @@ class Manifest(object):
def _force_xhtml(self, data):
if self.oeb.encoding is not None:
data = data.decode(self.oeb.encoding, 'replace')
# Handle broken XHTML that uses svg:/xlink: prefixes without declaring their namespaces
if 'svg:' in data and SVG_NS not in data:
data = data.replace(
'<html', '<html xmlns:svg="%s"' % SVG_NS, 1)
if 'xlink:' in data and XLINK_NS not in data:
data = data.replace(
'<html', '<html xmlns:xlink="%s"' % XLINK_NS, 1)
try:
data = etree.fromstring(data)
except etree.XMLSyntaxError:
@ -343,6 +350,24 @@ class Manifest(object):
data = etree.fromstring(data)
for meta in self.META_XP(data):
meta.getparent().remove(meta)
head = xpath(data, '/h:html/h:head')
head = head[0] if head else None
if head is None:
self.oeb.logger.warn(
'File %r missing <head/> element' % self.href)
head = etree.Element(XHTML('head'))
data.insert(0, head)
title = etree.SubElement(head, XHTML('title'))
title.text = self.oeb.translate(__('Unknown'))
elif not xpath(data, '/h:html/h:head/h:title'):
self.oeb.logger.warn(
'File %r missing <title/> element' % self.href)
title = etree.SubElement(head, XHTML('title'))
title.text = self.oeb.translate(__('Unknown'))
if not xpath(data, '/h:html/h:body'):
self.oeb.logger.warn(
'File %r missing <body/> element' % self.href)
etree.SubElement(data, XHTML('body'))
return data
def data():

View File

@ -110,7 +110,8 @@ class Stylizer(object):
def __init__(self, tree, path, oeb, profile=PROFILES['PRS505']):
self.profile = profile
base = os.path.dirname(path)
self.logger = oeb.logger
item = oeb.manifest.hrefs[path]
basename = os.path.basename(path)
cssname = os.path.splitext(basename)[0] + '.css'
stylesheets = [HTML_CSS_STYLESHEET]
@ -128,8 +129,12 @@ class Stylizer(object):
and elem.get('rel', 'stylesheet') == 'stylesheet' \
and elem.get('type', CSS_MIME) in OEB_STYLES:
href = urlnormalize(elem.attrib['href'])
path = os.path.join(base, href)
path = os.path.normpath(path).replace('\\', '/')
path = item.abshref(href)
if path not in oeb.manifest.hrefs:
self.logger.warn(
'Stylesheet %r referenced by file %r not in manifest' %
(path, item.href))
continue
if path in self.STYLESHEETS:
stylesheet = self.STYLESHEETS[path]
else:

View File

@ -13,6 +13,7 @@ from urlparse import urldefrag
from lxml import etree
import cssutils
from calibre.ebooks.oeb.base import XPNSMAP, CSS_MIME, OEB_DOCS
from calibre.ebooks.oeb.base import urlnormalize
LINK_SELECTORS = []
for expr in ('//h:link/@href', '//h:img/@src', '//h:object/@data',
@ -46,7 +47,7 @@ class ManifestTrimmer(object):
item.data is not None:
hrefs = [sel(item.data) for sel in LINK_SELECTORS]
for href in chain(*hrefs):
href = item.abshref(href)
href = item.abshref(urlnormalize(href))
if href in oeb.manifest.hrefs:
found = oeb.manifest.hrefs[href]
if found not in used: