Various stability improvements.

This commit is contained in:
Marshall T. Vandegrift 2009-01-04 23:30:03 -05:00
parent d11347331c
commit 8348264198

View File

@ -15,10 +15,12 @@ from urlparse import urldefrag, urlparse, urlunparse
from urllib import unquote as urlunquote from urllib import unquote as urlunquote
import logging import logging
import re import re
from htmlentitydefs import entitydefs
import uuid
from lxml import etree from lxml import etree
from calibre import LoggingInterface from calibre import LoggingInterface
XML_PARSER = etree.XMLParser(recover=True, resolve_entities=False) XML_PARSER = etree.XMLParser(recover=True)
XML_NS = 'http://www.w3.org/XML/1998/namespace' XML_NS = 'http://www.w3.org/XML/1998/namespace'
XHTML_NS = 'http://www.w3.org/1999/xhtml' XHTML_NS = 'http://www.w3.org/1999/xhtml'
OPF1_NS = 'http://openebook.org/namespaces/oeb-package/1.0/' OPF1_NS = 'http://openebook.org/namespaces/oeb-package/1.0/'
@ -29,15 +31,18 @@ DC11_NS = 'http://purl.org/dc/elements/1.1/'
XSI_NS = 'http://www.w3.org/2001/XMLSchema-instance' XSI_NS = 'http://www.w3.org/2001/XMLSchema-instance'
DCTERMS_NS = 'http://purl.org/dc/terms/' DCTERMS_NS = 'http://purl.org/dc/terms/'
NCX_NS = 'http://www.daisy.org/z3986/2005/ncx/' NCX_NS = 'http://www.daisy.org/z3986/2005/ncx/'
# XML namespace URI used to qualify SVG element names (see SVG()).
SVG_NS = 'http://www.w3.org/2000/svg'
XPNSMAP = {'h': XHTML_NS, 'o1': OPF1_NS, 'o2': OPF2_NS, XPNSMAP = {'h': XHTML_NS, 'o1': OPF1_NS, 'o2': OPF2_NS,
'd09': DC09_NS, 'd10': DC10_NS, 'd11': DC11_NS, 'd09': DC09_NS, 'd10': DC10_NS, 'd11': DC11_NS,
'xsi': XSI_NS, 'dt': DCTERMS_NS, 'ncx': NCX_NS} 'xsi': XSI_NS, 'dt': DCTERMS_NS, 'ncx': NCX_NS,
'svg': SVG_NS}
def XML(name): return '{%s}%s' % (XML_NS, name) def XML(name): return '{%s}%s' % (XML_NS, name)
def XHTML(name): return '{%s}%s' % (XHTML_NS, name) def XHTML(name): return '{%s}%s' % (XHTML_NS, name)
def OPF(name): return '{%s}%s' % (OPF2_NS, name) def OPF(name): return '{%s}%s' % (OPF2_NS, name)
def DC(name): return '{%s}%s' % (DC11_NS, name) def DC(name): return '{%s}%s' % (DC11_NS, name)
def NCX(name): return '{%s}%s' % (NCX_NS, name) def NCX(name): return '{%s}%s' % (NCX_NS, name)
def SVG(name):
    """Return *name* qualified with the SVG namespace as ``{uri}name``."""
    qualified = '{%s}%s' % (SVG_NS, name)
    return qualified
EPUB_MIME = 'application/epub+zip' EPUB_MIME = 'application/epub+zip'
XHTML_MIME = 'application/xhtml+xml' XHTML_MIME = 'application/xhtml+xml'
@ -47,9 +52,15 @@ OPF_MIME = 'application/oebps-package+xml'
OEB_DOC_MIME = 'text/x-oeb1-document' OEB_DOC_MIME = 'text/x-oeb1-document'
OEB_CSS_MIME = 'text/x-oeb1-css' OEB_CSS_MIME = 'text/x-oeb1-css'
OPENTYPE_MIME = 'font/opentype' OPENTYPE_MIME = 'font/opentype'
# Media types of the image formats referenced by the image sets below.
GIF_MIME = 'image/gif'
JPEG_MIME = 'image/jpeg'
PNG_MIME = 'image/png'
SVG_MIME = 'image/svg+xml'
OEB_STYLES = set([CSS_MIME, OEB_CSS_MIME, 'text/x-oeb-css']) OEB_STYLES = set([CSS_MIME, OEB_CSS_MIME, 'text/x-oeb-css'])
OEB_DOCS = set([XHTML_MIME, 'text/html', OEB_DOC_MIME, 'text/x-oeb-document']) OEB_DOCS = set([XHTML_MIME, 'text/html', OEB_DOC_MIME, 'text/x-oeb-document'])
# Media types of the bitmap (non-vector) image formats.
OEB_RASTER_IMAGES = set([GIF_MIME, JPEG_MIME, PNG_MIME])
# Every image media type: the raster formats plus SVG.
OEB_IMAGES = OEB_RASTER_IMAGES | set([SVG_MIME])
MS_COVER_TYPE = 'other.ms-coverimage-standard' MS_COVER_TYPE = 'other.ms-coverimage-standard'
@ -102,6 +113,9 @@ def urlnormalize(href):
return urlunparse(parts) return urlunparse(parts)
class OEBError(Exception):
    """Error raised while reading a book, e.g. an OPF document whose root
    element is in an unexpected namespace."""
class FauxLogger(object): class FauxLogger(object):
def __getattr__(self, name): def __getattr__(self, name):
return self return self
@ -162,8 +176,9 @@ class Metadata(object):
'xsi': XSI_NS} 'xsi': XSI_NS}
class Item(object): class Item(object):
def __init__(self, term, value, fq_attrib={}): def __init__(self, term, value, fq_attrib={}, **kwargs):
self.fq_attrib = dict(fq_attrib) self.fq_attrib = fq_attrib = dict(fq_attrib)
fq_attrib.update(kwargs)
if term == OPF('meta') and not value: if term == OPF('meta') and not value:
term = self.fq_attrib.pop('name') term = self.fq_attrib.pop('name')
value = self.fq_attrib.pop('content') value = self.fq_attrib.pop('content')
@ -225,8 +240,8 @@ class Metadata(object):
self.oeb = oeb self.oeb = oeb
self.items = defaultdict(list) self.items = defaultdict(list)
def add(self, term, value, attrib={}): def add(self, term, value, attrib={}, **kwargs):
item = self.Item(term, value, attrib) item = self.Item(term, value, attrib, **kwargs)
items = self.items[barename(item.term)] items = self.items[barename(item.term)]
items.append(item) items.append(item)
return item return item
@ -267,6 +282,7 @@ class Metadata(object):
class Manifest(object): class Manifest(object):
class Item(object): class Item(object):
# Matches a named entity reference such as ``&nbsp;``; group 1 is the name.
# The hyphen is placed last in the character class so it is a literal:
# written as ``.-_`` it formed a range that admitted ';', '<', '>' and '/'
# (so one match could swallow several references) yet rejected '-' itself.
ENTITY_RE = re.compile(r'&([a-zA-Z_:][a-zA-Z0-9._:-]+);')
NUM_RE = re.compile('^(.*)([0-9][0-9.]*)(?=[.]|$)') NUM_RE = re.compile('^(.*)([0-9][0-9.]*)(?=[.]|$)')
def __init__(self, id, href, media_type, def __init__(self, id, href, media_type,
@ -284,19 +300,25 @@ class Manifest(object):
return 'Item(id=%r, href=%r, media_type=%r)' \ return 'Item(id=%r, href=%r, media_type=%r)' \
% (self.id, self.href, self.media_type) % (self.id, self.href, self.media_type)
def _force_xhtml(self, data):
    """Parse *data* as XML and return a root element in the XHTML namespace.

    Named HTML entities are first rewritten as numeric character
    references so the XML parser can resolve them without a DTD.  The five
    entities predefined by XML (amp, lt, gt, quot, apos) are left alone:
    expanding them to raw '&', '<', '>' or quotes would corrupt the markup.
    Unknown entity names are also left untouched.

    If the parsed root is not in the XHTML namespace, an ``xmlns``
    attribute is added and the tree is serialized and re-parsed so the
    namespace applies to every element.
    """
    # Local import: name2codepoint is not imported at the top of the file.
    from htmlentitydefs import name2codepoint

    xml_predefined = frozenset(('amp', 'lt', 'gt', 'quot', 'apos'))

    def to_charref(match):
        name = match.group(1)
        if name in xml_predefined or name not in name2codepoint:
            return match.group(0)
        # A numeric reference is pure ASCII, so it is safe whatever the
        # encoding of the document is (a raw Latin-1 byte would not be).
        return '&#%d;' % name2codepoint[name]

    data = self.ENTITY_RE.sub(to_charref, data)
    root = etree.fromstring(data, parser=XML_PARSER)
    if namespace(root.tag) != XHTML_NS:
        root.attrib['xmlns'] = XHTML_NS
        # Round-trip through text so the new default namespace takes effect.
        root = etree.fromstring(etree.tostring(root), parser=XML_PARSER)
    return root
def data(): def data():
def fget(self): def fget(self):
if self._data is not None: if self._data is not None:
return self._data return self._data
data = self._loader(self.href) data = self._loader(self.href)
if self.media_type in OEB_DOCS: if self.media_type in OEB_DOCS:
data = etree.fromstring(data, parser=XML_PARSER) data = self._force_xhtml(data)
if namespace(data.tag) != XHTML_NS: elif self.media_type[-4:] in ('+xml', '/xml') \
data.attrib['xmlns'] = XHTML_NS and self.media_type != SVG_MIME:
data = etree.tostring(data)
data = etree.fromstring(data, parser=XML_PARSER)
elif self.media_type.startswith('application/') \
and self.media_type.endswith('+xml'):
data = etree.fromstring(data, parser=XML_PARSER) data = etree.fromstring(data, parser=XML_PARSER)
self._data = data self._data = data
return data return data
@ -636,13 +658,22 @@ class OEBBook(object):
self._all_from_opf(opf) self._all_from_opf(opf)
def _convert_opf1(self, opf): def _convert_opf1(self, opf):
# Seriously, seriously wrong
if namespace(opf.tag) == OPF1_NS:
opf.tag = barename(opf.tag)
for elem in opf.iterdescendants():
if isinstance(elem.tag, basestring) \
and namespace(elem.tag) == OPF1_NS:
elem.tag = barename(elem.tag)
attrib = dict(opf.attrib)
attrib['version'] = '2.0'
nroot = etree.Element(OPF('package'), nroot = etree.Element(OPF('package'),
nsmap={None: OPF2_NS}, version="2.0", **dict(opf.attrib)) nsmap={None: OPF2_NS}, attrib=attrib)
metadata = etree.SubElement(nroot, OPF('metadata'), metadata = etree.SubElement(nroot, OPF('metadata'),
nsmap={'opf': OPF2_NS, 'dc': DC11_NS, nsmap={'opf': OPF2_NS, 'dc': DC11_NS,
'xsi': XSI_NS, 'dcterms': DCTERMS_NS}) 'xsi': XSI_NS, 'dcterms': DCTERMS_NS})
for prefix in ('d11', 'd10', 'd09'): for prefix in ('d11', 'd10', 'd09'):
elements = xpath(opf, 'metadata/dc-metadata/%s:*' % prefix) elements = xpath(opf, 'metadata//%s:*' % prefix)
if elements: break if elements: break
for element in elements: for element in elements:
if not element.text: continue if not element.text: continue
@ -654,7 +685,7 @@ class OEBBook(object):
element.attrib[nsname] = element.attrib[name] element.attrib[nsname] = element.attrib[name]
del element.attrib[name] del element.attrib[name]
metadata.append(element) metadata.append(element)
for element in opf.xpath('metadata/x-metadata/meta'): for element in opf.xpath('metadata//meta'):
metadata.append(element) metadata.append(element)
for item in opf.xpath('manifest/item'): for item in opf.xpath('manifest/item'):
media_type = item.attrib['media-type'].lower() media_type = item.attrib['media-type'].lower()
@ -671,23 +702,40 @@ class OEBBook(object):
def _read_opf(self, opfpath): def _read_opf(self, opfpath):
opf = self.container.read_xml(opfpath) opf = self.container.read_xml(opfpath)
version = float(opf.get('version', 1.0)) version = float(opf.get('version', 1.0))
if version < 2.0: ns = namespace(opf.tag)
if ns not in ('', OPF1_NS, OPF2_NS):
raise OEBError('Invalid namespace %r for OPF document' % ns)
if ns != OPF2_NS or version < 2.0:
opf = self._convert_opf1(opf) opf = self._convert_opf1(opf)
return opf return opf
def _metadata_from_opf(self, opf): def _metadata_from_opf(self, opf):
uid = opf.attrib['unique-identifier'] uid = opf.get('unique-identifier', 'calibre-uuid')
self.uid = None
self.metadata = metadata = Metadata(self) self.metadata = metadata = Metadata(self)
for elem in xpath(opf, '/o2:package/o2:metadata/*'): ignored = (OPF('dc-metadata'), OPF('x-metadata'))
if elem.text or elem.attrib: for elem in xpath(opf, '/o2:package/o2:metadata//*'):
if elem.tag not in ignored and (elem.text or elem.attrib):
metadata.add(elem.tag, elem.text, elem.attrib) metadata.add(elem.tag, elem.text, elem.attrib)
haveuuid = haveid = False
for ident in metadata.identifier:
if unicode(ident).startswith('urn:uuid:'):
haveuuid = True
if 'id' in ident.attrib:
haveid = True
if not haveuuid and haveid:
bookid = "urn:uuid:%s" % str(uuid.uuid4())
metadata.add('identifier', bookid, id='calibre-uuid')
for item in metadata.identifier: for item in metadata.identifier:
if item.id == uid: if item.id == uid:
self.uid = item self.uid = item
break break
else: else:
self.logger.log_warn(u'Unique-identifier %r not found.' % uid) self.logger.log_warn(u'Unique-identifier %r not found.' % uid)
self.uid = metadata.identifier[0] for ident in metadata.identifier:
if 'id' in ident.attrib:
self.uid = metadata.identifier[0]
break
if not metadata.language: if not metadata.language:
self.logger.log_warn(u'Language not specified.') self.logger.log_warn(u'Language not specified.')
metadata.add('language', 'en') metadata.add('language', 'en')