mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Various stability improvements.
This commit is contained in:
parent
d11347331c
commit
8348264198
@ -15,10 +15,12 @@ from urlparse import urldefrag, urlparse, urlunparse
|
|||||||
from urllib import unquote as urlunquote
|
from urllib import unquote as urlunquote
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
|
from htmlentitydefs import entitydefs
|
||||||
|
import uuid
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
from calibre import LoggingInterface
|
from calibre import LoggingInterface
|
||||||
|
|
||||||
XML_PARSER = etree.XMLParser(recover=True, resolve_entities=False)
|
XML_PARSER = etree.XMLParser(recover=True)
|
||||||
XML_NS = 'http://www.w3.org/XML/1998/namespace'
|
XML_NS = 'http://www.w3.org/XML/1998/namespace'
|
||||||
XHTML_NS = 'http://www.w3.org/1999/xhtml'
|
XHTML_NS = 'http://www.w3.org/1999/xhtml'
|
||||||
OPF1_NS = 'http://openebook.org/namespaces/oeb-package/1.0/'
|
OPF1_NS = 'http://openebook.org/namespaces/oeb-package/1.0/'
|
||||||
@ -29,15 +31,18 @@ DC11_NS = 'http://purl.org/dc/elements/1.1/'
|
|||||||
XSI_NS = 'http://www.w3.org/2001/XMLSchema-instance'
|
XSI_NS = 'http://www.w3.org/2001/XMLSchema-instance'
|
||||||
DCTERMS_NS = 'http://purl.org/dc/terms/'
|
DCTERMS_NS = 'http://purl.org/dc/terms/'
|
||||||
NCX_NS = 'http://www.daisy.org/z3986/2005/ncx/'
|
NCX_NS = 'http://www.daisy.org/z3986/2005/ncx/'
|
||||||
|
SVG_NS = 'http://www.w3.org/2000/svg'
|
||||||
XPNSMAP = {'h': XHTML_NS, 'o1': OPF1_NS, 'o2': OPF2_NS,
|
XPNSMAP = {'h': XHTML_NS, 'o1': OPF1_NS, 'o2': OPF2_NS,
|
||||||
'd09': DC09_NS, 'd10': DC10_NS, 'd11': DC11_NS,
|
'd09': DC09_NS, 'd10': DC10_NS, 'd11': DC11_NS,
|
||||||
'xsi': XSI_NS, 'dt': DCTERMS_NS, 'ncx': NCX_NS}
|
'xsi': XSI_NS, 'dt': DCTERMS_NS, 'ncx': NCX_NS,
|
||||||
|
'svg': SVG_NS}
|
||||||
|
|
||||||
def XML(name): return '{%s}%s' % (XML_NS, name)
|
def XML(name): return '{%s}%s' % (XML_NS, name)
|
||||||
def XHTML(name): return '{%s}%s' % (XHTML_NS, name)
|
def XHTML(name): return '{%s}%s' % (XHTML_NS, name)
|
||||||
def OPF(name): return '{%s}%s' % (OPF2_NS, name)
|
def OPF(name): return '{%s}%s' % (OPF2_NS, name)
|
||||||
def DC(name): return '{%s}%s' % (DC11_NS, name)
|
def DC(name): return '{%s}%s' % (DC11_NS, name)
|
||||||
def NCX(name): return '{%s}%s' % (NCX_NS, name)
|
def NCX(name): return '{%s}%s' % (NCX_NS, name)
|
||||||
|
def SVG(name): return '{%s}%s' % (SVG_NS, name)
|
||||||
|
|
||||||
EPUB_MIME = 'application/epub+zip'
|
EPUB_MIME = 'application/epub+zip'
|
||||||
XHTML_MIME = 'application/xhtml+xml'
|
XHTML_MIME = 'application/xhtml+xml'
|
||||||
@ -47,9 +52,15 @@ OPF_MIME = 'application/oebps-package+xml'
|
|||||||
OEB_DOC_MIME = 'text/x-oeb1-document'
|
OEB_DOC_MIME = 'text/x-oeb1-document'
|
||||||
OEB_CSS_MIME = 'text/x-oeb1-css'
|
OEB_CSS_MIME = 'text/x-oeb1-css'
|
||||||
OPENTYPE_MIME = 'font/opentype'
|
OPENTYPE_MIME = 'font/opentype'
|
||||||
|
GIF_MIME = 'image/gif'
|
||||||
|
JPEG_MIME = 'image/jpeg'
|
||||||
|
PNG_MIME = 'image/png'
|
||||||
|
SVG_MIME = 'image/svg+xml'
|
||||||
|
|
||||||
OEB_STYLES = set([CSS_MIME, OEB_CSS_MIME, 'text/x-oeb-css'])
|
OEB_STYLES = set([CSS_MIME, OEB_CSS_MIME, 'text/x-oeb-css'])
|
||||||
OEB_DOCS = set([XHTML_MIME, 'text/html', OEB_DOC_MIME, 'text/x-oeb-document'])
|
OEB_DOCS = set([XHTML_MIME, 'text/html', OEB_DOC_MIME, 'text/x-oeb-document'])
|
||||||
|
OEB_RASTER_IMAGES = set([GIF_MIME, JPEG_MIME, PNG_MIME])
|
||||||
|
OEB_IMAGES = set([GIF_MIME, JPEG_MIME, PNG_MIME, SVG_MIME])
|
||||||
|
|
||||||
MS_COVER_TYPE = 'other.ms-coverimage-standard'
|
MS_COVER_TYPE = 'other.ms-coverimage-standard'
|
||||||
|
|
||||||
@ -102,6 +113,9 @@ def urlnormalize(href):
|
|||||||
return urlunparse(parts)
|
return urlunparse(parts)
|
||||||
|
|
||||||
|
|
||||||
|
class OEBError(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
class FauxLogger(object):
|
class FauxLogger(object):
|
||||||
def __getattr__(self, name):
|
def __getattr__(self, name):
|
||||||
return self
|
return self
|
||||||
@ -162,8 +176,9 @@ class Metadata(object):
|
|||||||
'xsi': XSI_NS}
|
'xsi': XSI_NS}
|
||||||
|
|
||||||
class Item(object):
|
class Item(object):
|
||||||
def __init__(self, term, value, fq_attrib={}):
|
def __init__(self, term, value, fq_attrib={}, **kwargs):
|
||||||
self.fq_attrib = dict(fq_attrib)
|
self.fq_attrib = fq_attrib = dict(fq_attrib)
|
||||||
|
fq_attrib.update(kwargs)
|
||||||
if term == OPF('meta') and not value:
|
if term == OPF('meta') and not value:
|
||||||
term = self.fq_attrib.pop('name')
|
term = self.fq_attrib.pop('name')
|
||||||
value = self.fq_attrib.pop('content')
|
value = self.fq_attrib.pop('content')
|
||||||
@ -225,8 +240,8 @@ class Metadata(object):
|
|||||||
self.oeb = oeb
|
self.oeb = oeb
|
||||||
self.items = defaultdict(list)
|
self.items = defaultdict(list)
|
||||||
|
|
||||||
def add(self, term, value, attrib={}):
|
def add(self, term, value, attrib={}, **kwargs):
|
||||||
item = self.Item(term, value, attrib)
|
item = self.Item(term, value, attrib, **kwargs)
|
||||||
items = self.items[barename(item.term)]
|
items = self.items[barename(item.term)]
|
||||||
items.append(item)
|
items.append(item)
|
||||||
return item
|
return item
|
||||||
@ -267,6 +282,7 @@ class Metadata(object):
|
|||||||
|
|
||||||
class Manifest(object):
|
class Manifest(object):
|
||||||
class Item(object):
|
class Item(object):
|
||||||
|
ENTITY_RE = re.compile(r'&([a-zA-Z_:][a-zA-Z0-9.-_:]+);')
|
||||||
NUM_RE = re.compile('^(.*)([0-9][0-9.]*)(?=[.]|$)')
|
NUM_RE = re.compile('^(.*)([0-9][0-9.]*)(?=[.]|$)')
|
||||||
|
|
||||||
def __init__(self, id, href, media_type,
|
def __init__(self, id, href, media_type,
|
||||||
@ -284,19 +300,25 @@ class Manifest(object):
|
|||||||
return 'Item(id=%r, href=%r, media_type=%r)' \
|
return 'Item(id=%r, href=%r, media_type=%r)' \
|
||||||
% (self.id, self.href, self.media_type)
|
% (self.id, self.href, self.media_type)
|
||||||
|
|
||||||
|
def _force_xhtml(self, data):
|
||||||
|
repl = lambda m: entitydefs.get(m.group(1), m.group(0))
|
||||||
|
data = self.ENTITY_RE.sub(repl, data)
|
||||||
|
data = etree.fromstring(data, parser=XML_PARSER)
|
||||||
|
if namespace(data.tag) != XHTML_NS:
|
||||||
|
data.attrib['xmlns'] = XHTML_NS
|
||||||
|
data = etree.tostring(data)
|
||||||
|
data = etree.fromstring(data, parser=XML_PARSER)
|
||||||
|
return data
|
||||||
|
|
||||||
def data():
|
def data():
|
||||||
def fget(self):
|
def fget(self):
|
||||||
if self._data is not None:
|
if self._data is not None:
|
||||||
return self._data
|
return self._data
|
||||||
data = self._loader(self.href)
|
data = self._loader(self.href)
|
||||||
if self.media_type in OEB_DOCS:
|
if self.media_type in OEB_DOCS:
|
||||||
data = etree.fromstring(data, parser=XML_PARSER)
|
data = self._force_xhtml(data)
|
||||||
if namespace(data.tag) != XHTML_NS:
|
elif self.media_type[-4:] in ('+xml', '/xml') \
|
||||||
data.attrib['xmlns'] = XHTML_NS
|
and self.media_type != SVG_MIME:
|
||||||
data = etree.tostring(data)
|
|
||||||
data = etree.fromstring(data, parser=XML_PARSER)
|
|
||||||
elif self.media_type.startswith('application/') \
|
|
||||||
and self.media_type.endswith('+xml'):
|
|
||||||
data = etree.fromstring(data, parser=XML_PARSER)
|
data = etree.fromstring(data, parser=XML_PARSER)
|
||||||
self._data = data
|
self._data = data
|
||||||
return data
|
return data
|
||||||
@ -636,13 +658,22 @@ class OEBBook(object):
|
|||||||
self._all_from_opf(opf)
|
self._all_from_opf(opf)
|
||||||
|
|
||||||
def _convert_opf1(self, opf):
|
def _convert_opf1(self, opf):
|
||||||
|
# Seriously, seriously wrong
|
||||||
|
if namespace(opf.tag) == OPF1_NS:
|
||||||
|
opf.tag = barename(opf.tag)
|
||||||
|
for elem in opf.iterdescendants():
|
||||||
|
if isinstance(elem.tag, basestring) \
|
||||||
|
and namespace(elem.tag) == OPF1_NS:
|
||||||
|
elem.tag = barename(elem.tag)
|
||||||
|
attrib = dict(opf.attrib)
|
||||||
|
attrib['version'] = '2.0'
|
||||||
nroot = etree.Element(OPF('package'),
|
nroot = etree.Element(OPF('package'),
|
||||||
nsmap={None: OPF2_NS}, version="2.0", **dict(opf.attrib))
|
nsmap={None: OPF2_NS}, attrib=attrib)
|
||||||
metadata = etree.SubElement(nroot, OPF('metadata'),
|
metadata = etree.SubElement(nroot, OPF('metadata'),
|
||||||
nsmap={'opf': OPF2_NS, 'dc': DC11_NS,
|
nsmap={'opf': OPF2_NS, 'dc': DC11_NS,
|
||||||
'xsi': XSI_NS, 'dcterms': DCTERMS_NS})
|
'xsi': XSI_NS, 'dcterms': DCTERMS_NS})
|
||||||
for prefix in ('d11', 'd10', 'd09'):
|
for prefix in ('d11', 'd10', 'd09'):
|
||||||
elements = xpath(opf, 'metadata/dc-metadata/%s:*' % prefix)
|
elements = xpath(opf, 'metadata//%s:*' % prefix)
|
||||||
if elements: break
|
if elements: break
|
||||||
for element in elements:
|
for element in elements:
|
||||||
if not element.text: continue
|
if not element.text: continue
|
||||||
@ -654,7 +685,7 @@ class OEBBook(object):
|
|||||||
element.attrib[nsname] = element.attrib[name]
|
element.attrib[nsname] = element.attrib[name]
|
||||||
del element.attrib[name]
|
del element.attrib[name]
|
||||||
metadata.append(element)
|
metadata.append(element)
|
||||||
for element in opf.xpath('metadata/x-metadata/meta'):
|
for element in opf.xpath('metadata//meta'):
|
||||||
metadata.append(element)
|
metadata.append(element)
|
||||||
for item in opf.xpath('manifest/item'):
|
for item in opf.xpath('manifest/item'):
|
||||||
media_type = item.attrib['media-type'].lower()
|
media_type = item.attrib['media-type'].lower()
|
||||||
@ -671,23 +702,40 @@ class OEBBook(object):
|
|||||||
def _read_opf(self, opfpath):
|
def _read_opf(self, opfpath):
|
||||||
opf = self.container.read_xml(opfpath)
|
opf = self.container.read_xml(opfpath)
|
||||||
version = float(opf.get('version', 1.0))
|
version = float(opf.get('version', 1.0))
|
||||||
if version < 2.0:
|
ns = namespace(opf.tag)
|
||||||
|
if ns not in ('', OPF1_NS, OPF2_NS):
|
||||||
|
raise OEBError('Invalid namespace %r for OPF document' % ns)
|
||||||
|
if ns != OPF2_NS or version < 2.0:
|
||||||
opf = self._convert_opf1(opf)
|
opf = self._convert_opf1(opf)
|
||||||
return opf
|
return opf
|
||||||
|
|
||||||
def _metadata_from_opf(self, opf):
|
def _metadata_from_opf(self, opf):
|
||||||
uid = opf.attrib['unique-identifier']
|
uid = opf.get('unique-identifier', 'calibre-uuid')
|
||||||
self.metadata = metadata = Metadata(self)
|
self.uid = None
|
||||||
for elem in xpath(opf, '/o2:package/o2:metadata/*'):
|
self.metadata = metadata = Metadata(self)
|
||||||
if elem.text or elem.attrib:
|
ignored = (OPF('dc-metadata'), OPF('x-metadata'))
|
||||||
|
for elem in xpath(opf, '/o2:package/o2:metadata//*'):
|
||||||
|
if elem.tag not in ignored and (elem.text or elem.attrib):
|
||||||
metadata.add(elem.tag, elem.text, elem.attrib)
|
metadata.add(elem.tag, elem.text, elem.attrib)
|
||||||
|
haveuuid = haveid = False
|
||||||
|
for ident in metadata.identifier:
|
||||||
|
if unicode(ident).startswith('urn:uuid:'):
|
||||||
|
haveuuid = True
|
||||||
|
if 'id' in ident.attrib:
|
||||||
|
haveid = True
|
||||||
|
if not haveuuid and haveid:
|
||||||
|
bookid = "urn:uuid:%s" % str(uuid.uuid4())
|
||||||
|
metadata.add('identifier', bookid, id='calibre-uuid')
|
||||||
for item in metadata.identifier:
|
for item in metadata.identifier:
|
||||||
if item.id == uid:
|
if item.id == uid:
|
||||||
self.uid = item
|
self.uid = item
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
self.logger.log_warn(u'Unique-identifier %r not found.' % uid)
|
self.logger.log_warn(u'Unique-identifier %r not found.' % uid)
|
||||||
self.uid = metadata.identifier[0]
|
for ident in metadata.identifier:
|
||||||
|
if 'id' in ident.attrib:
|
||||||
|
self.uid = metadata.identifier[0]
|
||||||
|
break
|
||||||
if not metadata.language:
|
if not metadata.language:
|
||||||
self.logger.log_warn(u'Language not specified.')
|
self.logger.log_warn(u'Language not specified.')
|
||||||
metadata.add('language', 'en')
|
metadata.add('language', 'en')
|
||||||
|
Loading…
x
Reference in New Issue
Block a user