From 8348264198096d30eed1b2218b98a4812e04d28e Mon Sep 17 00:00:00 2001 From: "Marshall T. Vandegrift" Date: Sun, 4 Jan 2009 23:30:03 -0500 Subject: [PATCH] Various stability improvements. --- src/calibre/ebooks/oeb/base.py | 92 ++++++++++++++++++++++++++-------- 1 file changed, 70 insertions(+), 22 deletions(-) diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index 5326e37a47..c736c4ad98 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -15,10 +15,12 @@ from urlparse import urldefrag, urlparse, urlunparse from urllib import unquote as urlunquote import logging import re +from htmlentitydefs import entitydefs +import uuid from lxml import etree from calibre import LoggingInterface -XML_PARSER = etree.XMLParser(recover=True, resolve_entities=False) +XML_PARSER = etree.XMLParser(recover=True) XML_NS = 'http://www.w3.org/XML/1998/namespace' XHTML_NS = 'http://www.w3.org/1999/xhtml' OPF1_NS = 'http://openebook.org/namespaces/oeb-package/1.0/' @@ -29,15 +31,18 @@ DC11_NS = 'http://purl.org/dc/elements/1.1/' XSI_NS = 'http://www.w3.org/2001/XMLSchema-instance' DCTERMS_NS = 'http://purl.org/dc/terms/' NCX_NS = 'http://www.daisy.org/z3986/2005/ncx/' +SVG_NS = 'http://www.w3.org/2000/svg' XPNSMAP = {'h': XHTML_NS, 'o1': OPF1_NS, 'o2': OPF2_NS, 'd09': DC09_NS, 'd10': DC10_NS, 'd11': DC11_NS, - 'xsi': XSI_NS, 'dt': DCTERMS_NS, 'ncx': NCX_NS} + 'xsi': XSI_NS, 'dt': DCTERMS_NS, 'ncx': NCX_NS, + 'svg': SVG_NS} def XML(name): return '{%s}%s' % (XML_NS, name) def XHTML(name): return '{%s}%s' % (XHTML_NS, name) def OPF(name): return '{%s}%s' % (OPF2_NS, name) def DC(name): return '{%s}%s' % (DC11_NS, name) def NCX(name): return '{%s}%s' % (NCX_NS, name) +def SVG(name): return '{%s}%s' % (SVG_NS, name) EPUB_MIME = 'application/epub+zip' XHTML_MIME = 'application/xhtml+xml' @@ -47,9 +52,15 @@ OPF_MIME = 'application/oebps-package+xml' OEB_DOC_MIME = 'text/x-oeb1-document' OEB_CSS_MIME = 'text/x-oeb1-css' OPENTYPE_MIME = 'font/opentype' +GIF_MIME = 'image/gif' +JPEG_MIME = 'image/jpeg' +PNG_MIME = 'image/png' +SVG_MIME = 'image/svg+xml' OEB_STYLES = set([CSS_MIME, OEB_CSS_MIME, 'text/x-oeb-css']) OEB_DOCS = set([XHTML_MIME, 'text/html', OEB_DOC_MIME, 'text/x-oeb-document']) +OEB_RASTER_IMAGES = set([GIF_MIME, JPEG_MIME, PNG_MIME]) +OEB_IMAGES = set([GIF_MIME, JPEG_MIME, PNG_MIME, SVG_MIME]) MS_COVER_TYPE = 'other.ms-coverimage-standard' @@ -102,6 +113,9 @@ def urlnormalize(href): return urlunparse(parts) +class OEBError(Exception): + pass + class FauxLogger(object): def __getattr__(self, name): return self @@ -162,8 +176,9 @@ class Metadata(object): 'xsi': XSI_NS} class Item(object): - def __init__(self, term, value, fq_attrib={}): - self.fq_attrib = dict(fq_attrib) + def __init__(self, term, value, fq_attrib={}, **kwargs): + self.fq_attrib = fq_attrib = dict(fq_attrib) + fq_attrib.update(kwargs) if term == OPF('meta') and not value: term = self.fq_attrib.pop('name') value = self.fq_attrib.pop('content') @@ -225,8 +240,8 @@ class Metadata(object): self.oeb = oeb self.items = defaultdict(list) - def add(self, term, value, attrib={}): - item = self.Item(term, value, attrib) + def add(self, term, value, attrib={}, **kwargs): + item = self.Item(term, value, attrib, **kwargs) items = self.items[barename(item.term)] items.append(item) return item @@ -267,6 +282,7 @@ class Metadata(object): class Manifest(object): class Item(object): + ENTITY_RE = re.compile(r'&([a-zA-Z_:][a-zA-Z0-9.-_:]+);') NUM_RE = re.compile('^(.*)([0-9][0-9.]*)(?=[.]|$)') def __init__(self, id, href, media_type, @@ -284,19 +300,25 @@ class Manifest(object): return 'Item(id=%r, href=%r, media_type=%r)' \ % (self.id, self.href, self.media_type) + def _force_xhtml(self, data): + repl = lambda m: entitydefs.get(m.group(1), m.group(0)) + data = self.ENTITY_RE.sub(repl, data) + data = etree.fromstring(data, parser=XML_PARSER) + if namespace(data.tag) != XHTML_NS: + data.attrib['xmlns'] = XHTML_NS + data = etree.tostring(data) + data = etree.fromstring(data, parser=XML_PARSER) + return data + def data(): def fget(self): if self._data is not None: return self._data data = self._loader(self.href) if self.media_type in OEB_DOCS: - data = etree.fromstring(data, parser=XML_PARSER) - if namespace(data.tag) != XHTML_NS: - data.attrib['xmlns'] = XHTML_NS - data = etree.tostring(data) - data = etree.fromstring(data, parser=XML_PARSER) - elif self.media_type.startswith('application/') \ - and self.media_type.endswith('+xml'): + data = self._force_xhtml(data) + elif self.media_type[-4:] in ('+xml', '/xml') \ + and self.media_type != SVG_MIME: data = etree.fromstring(data, parser=XML_PARSER) self._data = data return data @@ -636,13 +658,22 @@ class OEBBook(object): self._all_from_opf(opf) def _convert_opf1(self, opf): + # Seriously, seriously wrong + if namespace(opf.tag) == OPF1_NS: + opf.tag = barename(opf.tag) + for elem in opf.iterdescendants(): + if isinstance(elem.tag, basestring) \ + and namespace(elem.tag) == OPF1_NS: + elem.tag = barename(elem.tag) + attrib = dict(opf.attrib) + attrib['version'] = '2.0' nroot = etree.Element(OPF('package'), - nsmap={None: OPF2_NS}, version="2.0", **dict(opf.attrib)) + nsmap={None: OPF2_NS}, attrib=attrib) metadata = etree.SubElement(nroot, OPF('metadata'), nsmap={'opf': OPF2_NS, 'dc': DC11_NS, 'xsi': XSI_NS, 'dcterms': DCTERMS_NS}) for prefix in ('d11', 'd10', 'd09'): - elements = xpath(opf, 'metadata/dc-metadata/%s:*' % prefix) + elements = xpath(opf, 'metadata//%s:*' % prefix) if elements: break for element in elements: if not element.text: continue @@ -654,7 +685,7 @@ class OEBBook(object): element.attrib[nsname] = element.attrib[name] del element.attrib[name] metadata.append(element) - for element in opf.xpath('metadata/x-metadata/meta'): + for element in opf.xpath('metadata//meta'): metadata.append(element) for item in opf.xpath('manifest/item'): media_type = item.attrib['media-type'].lower() @@ -671,23 +702,40 @@ class OEBBook(object): def _read_opf(self, opfpath): opf = self.container.read_xml(opfpath) version = float(opf.get('version', 1.0)) - if version < 2.0: + ns = namespace(opf.tag) + if ns not in ('', OPF1_NS, OPF2_NS): + raise OEBError('Invalid namespace %r for OPF document' % ns) + if ns != OPF2_NS or version < 2.0: opf = self._convert_opf1(opf) return opf def _metadata_from_opf(self, opf): - uid = opf.attrib['unique-identifier'] - self.metadata = metadata = Metadata(self) - for elem in xpath(opf, '/o2:package/o2:metadata/*'): - if elem.text or elem.attrib: + uid = opf.get('unique-identifier', 'calibre-uuid') + self.uid = None + self.metadata = metadata = Metadata(self) + ignored = (OPF('dc-metadata'), OPF('x-metadata')) + for elem in xpath(opf, '/o2:package/o2:metadata//*'): + if elem.tag not in ignored and (elem.text or elem.attrib): metadata.add(elem.tag, elem.text, elem.attrib) + haveuuid = haveid = False + for ident in metadata.identifier: + if unicode(ident).startswith('urn:uuid:'): + haveuuid = True + if 'id' in ident.attrib: + haveid = True + if not haveuuid and haveid: + bookid = "urn:uuid:%s" % str(uuid.uuid4()) + metadata.add('identifier', bookid, id='calibre-uuid') for item in metadata.identifier: if item.id == uid: self.uid = item break else: self.logger.log_warn(u'Unique-identifier %r not found.' % uid) - self.uid = metadata.identifier[0] + for ident in metadata.identifier: + if 'id' in ident.attrib: + self.uid = metadata.identifier[0] + break if not metadata.language: self.logger.log_warn(u'Language not specified.') metadata.add('language', 'en')