IGN:Fix a few bugs and some code cleanup in oeb.base

This commit is contained in:
Kovid Goyal 2009-02-04 18:22:37 -08:00
parent 6c93872167
commit 70630c5dbc
2 changed files with 141 additions and 112 deletions

View File

@ -24,6 +24,8 @@ mimetypes.add_type('application/x-dtbncx+xml', '.ncx')
mimetypes.add_type('application/adobe-page-template+xml', '.xpgt') mimetypes.add_type('application/adobe-page-template+xml', '.xpgt')
mimetypes.add_type('application/x-font-opentype', '.otf') mimetypes.add_type('application/x-font-opentype', '.otf')
mimetypes.add_type('application/x-font-truetype', '.ttf') mimetypes.add_type('application/x-font-truetype', '.ttf')
mimetypes.add_type('application/oebps-package+xml', '.opf')
def to_unicode(raw, encoding='utf-8', errors='strict'): def to_unicode(raw, encoding='utf-8', errors='strict'):
if isinstance(raw, unicode): if isinstance(raw, unicode):

View File

@ -6,20 +6,14 @@ from __future__ import with_statement
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>' __copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
import os import os, sys, re, uuid, copy
import sys from mimetypes import types_map, guess_type
from collections import defaultdict from collections import defaultdict
from types import StringTypes from types import StringTypes
from itertools import izip, count, chain from itertools import izip, count, chain
from urlparse import urldefrag, urlparse, urlunparse from urlparse import urldefrag, urlparse, urlunparse
from urllib import unquote as urlunquote from urllib import unquote as urlunquote
import logging from lxml import etree, html
import re
import uuid
import copy
import mimetypes
from lxml import etree
from lxml import html
import calibre import calibre
from calibre import LoggingInterface from calibre import LoggingInterface
from calibre.translations.dynamic import translate from calibre.translations.dynamic import translate
@ -45,22 +39,44 @@ NCX_NS = 'http://www.daisy.org/z3986/2005/ncx/'
SVG_NS = 'http://www.w3.org/2000/svg' SVG_NS = 'http://www.w3.org/2000/svg'
XLINK_NS = 'http://www.w3.org/1999/xlink' XLINK_NS = 'http://www.w3.org/1999/xlink'
CALIBRE_NS = 'http://calibre.kovidgoyal.net/2009/metadata' CALIBRE_NS = 'http://calibre.kovidgoyal.net/2009/metadata'
XPNSMAP = {'h': XHTML_NS, 'o1': OPF1_NS, 'o2': OPF2_NS, XPNSMAP = {
'h' : XHTML_NS, 'o1' : OPF1_NS, 'o2' : OPF2_NS,
'd09': DC09_NS, 'd10': DC10_NS, 'd11': DC11_NS, 'd09': DC09_NS, 'd10': DC10_NS, 'd11': DC11_NS,
'xsi': XSI_NS, 'dt' : DCTERMS_NS, 'ncx': NCX_NS, 'xsi': XSI_NS, 'dt' : DCTERMS_NS, 'ncx': NCX_NS,
'svg': SVG_NS, 'xl': XLINK_NS} 'svg': SVG_NS, 'xl' : XLINK_NS
}
DC_PREFIXES = ('d11', 'd10', 'd09') DC_PREFIXES = ('d11', 'd10', 'd09')
def XML(name): return '{%s}%s' % (XML_NS, name)
def XHTML(name): return '{%s}%s' % (XHTML_NS, name) def XML(name):
def OPF(name): return '{%s}%s' % (OPF2_NS, name) return '{%s}%s' % (XML_NS, name)
def DC(name): return '{%s}%s' % (DC11_NS, name)
def XSI(name): return '{%s}%s' % (XSI_NS, name) def XHTML(name):
def DCTERMS(name): return '{%s}%s' % (DCTERMS_NS, name) return '{%s}%s' % (XHTML_NS, name)
def NCX(name): return '{%s}%s' % (NCX_NS, name)
def SVG(name): return '{%s}%s' % (SVG_NS, name) def OPF(name):
def XLINK(name): return '{%s}%s' % (XLINK_NS, name) return '{%s}%s' % (OPF2_NS, name)
def CALIBRE(name): return '{%s}%s' % (CALIBRE_NS, name)
def DC(name):
return '{%s}%s' % (DC11_NS, name)
def XSI(name):
return '{%s}%s' % (XSI_NS, name)
def DCTERMS(name):
return '{%s}%s' % (DCTERMS_NS, name)
def NCX(name):
return '{%s}%s' % (NCX_NS, name)
def SVG(name):
return '{%s}%s' % (SVG_NS, name)
def XLINK(name):
return '{%s}%s' % (XLINK_NS, name)
def CALIBRE(name):
return '{%s}%s' % (CALIBRE_NS, name)
def LINK_SELECTORS(): def LINK_SELECTORS():
results = [] results = []
@ -70,21 +86,22 @@ def LINK_SELECTORS():
'o2:page/@href'): 'o2:page/@href'):
results.append(etree.XPath(expr, namespaces=XPNSMAP)) results.append(etree.XPath(expr, namespaces=XPNSMAP))
return results return results
LINK_SELECTORS = LINK_SELECTORS() LINK_SELECTORS = LINK_SELECTORS()
EPUB_MIME = 'application/epub+zip' EPUB_MIME = types_map['.epub']
XHTML_MIME = 'application/xhtml+xml' XHTML_MIME = types_map['.xhtml']
CSS_MIME = 'text/css' CSS_MIME = types_map['.css']
NCX_MIME = 'application/x-dtbncx+xml' NCX_MIME = types_map['.ncx']
OPF_MIME = 'application/oebps-package+xml' OPF_MIME = types_map['.opf']
PAGE_MAP_MIME = 'application/oebps-page-map+xml' PAGE_MAP_MIME = 'application/oebps-page-map+xml'
OEB_DOC_MIME = 'text/x-oeb1-document' OEB_DOC_MIME = 'text/x-oeb1-document'
OEB_CSS_MIME = 'text/x-oeb1-css' OEB_CSS_MIME = 'text/x-oeb1-css'
OPENTYPE_MIME = 'font/opentype' OPENTYPE_MIME = 'font/opentype' # Shouldn't this be 'application/x-font-opentype', since 'font/opentype' is not actually an IETF-registered media type?
GIF_MIME = 'image/gif' GIF_MIME = types_map['.gif']
JPEG_MIME = 'image/jpeg' JPEG_MIME = types_map['.jpeg']
PNG_MIME = 'image/png' PNG_MIME = types_map['.png']
SVG_MIME = 'image/svg+xml' SVG_MIME = types_map['.svg']
BINARY_MIME = 'application/octet-stream' BINARY_MIME = 'application/octet-stream'
OEB_STYLES = set([CSS_MIME, OEB_CSS_MIME, 'text/x-oeb-css']) OEB_STYLES = set([CSS_MIME, OEB_CSS_MIME, 'text/x-oeb-css'])
@ -159,7 +176,8 @@ URL_SAFE = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
'abcdefghijklmnopqrstuvwxyz' 'abcdefghijklmnopqrstuvwxyz'
'0123456789' '_.-/~') '0123456789' '_.-/~')
URL_UNSAFE = [ASCII_CHARS - URL_SAFE, UNIBYTE_CHARS - URL_SAFE] URL_UNSAFE = [ASCII_CHARS - URL_SAFE, UNIBYTE_CHARS - URL_SAFE]
def urlquote(href):
def urlquote(href): # Why do you have a private implementation of urlquote?
result = [] result = []
unsafe = 0 if isinstance(href, unicode) else 1 unsafe = 0 if isinstance(href, unicode) else 1
unsafe = URL_UNSAFE[unsafe] unsafe = URL_UNSAFE[unsafe]
@ -245,10 +263,12 @@ class DirWriter(object):
class Metadata(object): class Metadata(object):
DC_TERMS = set(['contributor', 'coverage', 'creator', 'date', DC_TERMS = set([
'contributor', 'coverage', 'creator', 'date',
'description', 'format', 'identifier', 'language', 'description', 'format', 'identifier', 'language',
'publisher', 'relation', 'rights', 'source', 'subject', 'publisher', 'relation', 'rights', 'source', 'subject',
'title', 'type']) 'title', 'type'
])
CALIBRE_TERMS = set(['series', 'series_index', 'rating']) CALIBRE_TERMS = set(['series', 'series_index', 'rating'])
OPF_ATTRS = {'role': OPF('role'), 'file-as': OPF('file-as'), OPF_ATTRS = {'role': OPF('role'), 'file-as': OPF('file-as'),
'scheme': OPF('scheme'), 'event': OPF('event'), 'scheme': OPF('scheme'), 'event': OPF('event'),
@ -258,11 +278,11 @@ class Metadata(object):
'xsi': XSI_NS, 'calibre': CALIBRE_NS} 'xsi': XSI_NS, 'calibre': CALIBRE_NS}
class Item(object): class Item(object):
class Attribute(object): class Attribute(object):
def __init__(self, attr, allowed=None): def __init__(self, attr, allowed=None):
if not callable(attr): self.attr = attr if callable(attr) else lambda x: attr
attr_, attr = attr, lambda term: attr_
self.attr = attr
self.allowed = allowed self.allowed = allowed
def term_attr(self, obj): def term_attr(self, obj):
@ -309,11 +329,8 @@ class Metadata(object):
if attr != nsattr: if attr != nsattr:
attrib[nsattr] = attrib.pop(attr) attrib[nsattr] = attrib.pop(attr)
def scheme(term): scheme = Attribute(lambda term : 'scheme' if term == OPF('meta') else OPF('scheme'),
if term == OPF('meta'): [DC('identifier'), OPF('meta')])
return 'scheme'
return OPF('scheme')
scheme = Attribute(scheme, [DC('identifier'), OPF('meta')])
file_as = Attribute(OPF('file-as'), [DC('creator'), DC('contributor')]) file_as = Attribute(OPF('file-as'), [DC('creator'), DC('contributor')])
role = Attribute(OPF('role'), [DC('creator'), DC('contributor')]) role = Attribute(OPF('role'), [DC('creator'), DC('contributor')])
event = Attribute(OPF('event'), [DC('date')]) event = Attribute(OPF('event'), [DC('date')])
@ -400,6 +417,7 @@ class Metadata(object):
def __getattr__(self, term): def __getattr__(self, term):
return self.items[term] return self.items[term]
@apply
def _nsmap(): def _nsmap():
def fget(self): def fget(self):
nsmap = {} nsmap = {}
@ -408,8 +426,8 @@ class Metadata(object):
nsmap.update(item.nsmap) nsmap.update(item.nsmap)
return nsmap return nsmap
return property(fget=fget) return property(fget=fget)
_nsmap = _nsmap()
@apply
def _opf1_nsmap(): def _opf1_nsmap():
def fget(self): def fget(self):
nsmap = self._nsmap nsmap = self._nsmap
@ -418,15 +436,16 @@ class Metadata(object):
del nsmap[key] del nsmap[key]
return nsmap return nsmap
return property(fget=fget) return property(fget=fget)
_opf1_nsmap = _opf1_nsmap()
@apply
def _opf2_nsmap(): def _opf2_nsmap():
def fget(self): def fget(self):
nsmap = self._nsmap nsmap = self._nsmap
nsmap.update(self.OPF2_NSMAP) nsmap.update(self.OPF2_NSMAP)
return nsmap return nsmap
return property(fget=fget) return property(fget=fget)
_opf2_nsmap = _opf2_nsmap()
def to_opf1(self, parent=None): def to_opf1(self, parent=None):
nsmap = self._opf1_nsmap nsmap = self._opf1_nsmap
@ -453,7 +472,9 @@ class Metadata(object):
class Manifest(object): class Manifest(object):
class Item(object): class Item(object):
NUM_RE = re.compile('^(.*)([0-9][0-9.]*)(?=[.]|$)') NUM_RE = re.compile('^(.*)([0-9][0-9.]*)(?=[.]|$)')
META_XP = XPath('/h:html/h:head/h:meta[@http-equiv="Content-Type"]') META_XP = XPath('/h:html/h:head/h:meta[@http-equiv="Content-Type"]')
@ -553,6 +574,7 @@ class Manifest(object):
etree.SubElement(data, XHTML('body')) etree.SubElement(data, XHTML('body'))
return data return data
@apply
def data(): def data():
def fget(self): def fget(self):
if self._data is not None: if self._data is not None:
@ -571,7 +593,6 @@ class Manifest(object):
def fdel(self): def fdel(self):
self._data = None self._data = None
return property(fget, fset, fdel) return property(fget, fset, fdel)
data = data()
def __str__(self): def __str__(self):
data = self.data data = self.data
@ -718,6 +739,7 @@ class Manifest(object):
class Spine(object): class Spine(object):
def __init__(self, oeb): def __init__(self, oeb):
self.oeb = oeb self.oeb = oeb
self.items = [] self.items = []
@ -783,7 +805,9 @@ class Spine(object):
class Guide(object): class Guide(object):
class Reference(object): class Reference(object):
_TYPES_TITLES = [('cover', __('Cover')), _TYPES_TITLES = [('cover', __('Cover')),
('title-page', __('Title Page')), ('title-page', __('Title Page')),
('toc', __('Table of Contents')), ('toc', __('Table of Contents')),
@ -822,24 +846,24 @@ class Guide(object):
return 'Reference(type=%r, title=%r, href=%r)' \ return 'Reference(type=%r, title=%r, href=%r)' \
% (self.type, self.title, self.href) % (self.type, self.title, self.href)
@apply
def _order(): def _order():
def fget(self): def fget(self):
return self.ORDER.get(self.type, self.type) return self.ORDER.get(self.type, self.type)
return property(fget=fget) return property(fget=fget)
_order = _order()
def __cmp__(self, other): def __cmp__(self, other):
if not isinstance(other, Guide.Reference): if not isinstance(other, Guide.Reference):
return NotImplemented return NotImplemented
return cmp(self._order, other._order) return cmp(self._order, other._order)
@apply
def item(): def item():
def fget(self): def fget(self):
path, frag = urldefrag(self.href) path = urldefrag(self.href)[0]
hrefs = self.oeb.manifest.hrefs hrefs = self.oeb.manifest.hrefs
return hrefs.get(path, None) return hrefs.get(path, None)
return property(fget=fget) return property(fget=fget)
item = item()
def __init__(self, oeb): def __init__(self, oeb):
self.oeb = oeb self.oeb = oeb
@ -894,6 +918,7 @@ class Guide(object):
class TOC(object): class TOC(object):
# This needs beefing up to support the interface of toc.TOC
def __init__(self, title=None, href=None, klass=None, id=None): def __init__(self, title=None, href=None, klass=None, id=None):
self.title = title self.title = title
self.href = urlnormalize(href) if href else href self.href = urlnormalize(href) if href else href
@ -956,6 +981,7 @@ class TOC(object):
class PageList(object): class PageList(object):
class Page(object): class Page(object):
def __init__(self, name, href, type='normal', klass=None, id=None): def __init__(self, name, href, type='normal', klass=None, id=None):
self.name = name self.name = name
@ -977,7 +1003,7 @@ class PageList(object):
def __iter__(self): def __iter__(self):
for page in self.pages: for page in self.pages:
yield node yield page
def __getitem__(self, index): def __getitem__(self, index):
return self.pages[index] return self.pages[index]
@ -1006,6 +1032,7 @@ class PageList(object):
class OEBBook(object): class OEBBook(object):
COVER_SVG_XP = XPath('h:body//svg:svg[position() = 1]') COVER_SVG_XP = XPath('h:body//svg:svg[position() = 1]')
COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]') COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]')
@ -1148,7 +1175,7 @@ class OEBBook(object):
continue continue
self.logger.warn('Referenced file %r not in manifest' % href) self.logger.warn('Referenced file %r not in manifest' % href)
id, _ = manifest.generate(id='added') id, _ = manifest.generate(id='added')
guessed = mimetypes.guess_type(href)[0] guessed = guess_type(href)[0]
media_type = guessed or BINARY_MIME media_type = guessed or BINARY_MIME
added = manifest.add(id, href, media_type) added = manifest.add(id, href, media_type)
unchecked.add(added) unchecked.add(added)
@ -1162,7 +1189,7 @@ class OEBBook(object):
if media_type is None: if media_type is None:
media_type = elem.get('mediatype', None) media_type = elem.get('mediatype', None)
if media_type is None or media_type == 'text/xml': if media_type is None or media_type == 'text/xml':
guessed = mimetypes.guess_type(href)[0] guessed = guess_type(href)[0]
media_type = guessed or media_type or BINARY_MIME media_type = guessed or media_type or BINARY_MIME
fallback = elem.get('fallback') fallback = elem.get('fallback')
if href in manifest.hrefs: if href in manifest.hrefs:
@ -1227,7 +1254,7 @@ class OEBBook(object):
self.guide = guide = Guide(self) self.guide = guide = Guide(self)
for elem in xpath(opf, '/o2:package/o2:guide/o2:reference'): for elem in xpath(opf, '/o2:package/o2:guide/o2:reference'):
href = elem.get('href') href = elem.get('href')
path, frag = urldefrag(href) path = urldefrag(href)[0]
if path not in self.manifest.hrefs: if path not in self.manifest.hrefs:
self.logger.warn(u'Guide reference %r not found' % href) self.logger.warn(u'Guide reference %r not found' % href)
continue continue
@ -1524,14 +1551,14 @@ class OEBBook(object):
def to_opf1(self): def to_opf1(self):
package = etree.Element('package', package = etree.Element('package',
attrib={'unique-identifier': self.uid.id}) attrib={'unique-identifier': self.uid.id})
metadata = self.metadata.to_opf1(package) self.metadata.to_opf1(package)
manifest = self.manifest.to_opf1(package) self.manifest.to_opf1(package)
spine = self.spine.to_opf1(package) self.spine.to_opf1(package)
tours = element(package, 'tours') tours = element(package, 'tours')
tour = element(tours, 'tour', tour = element(tours, 'tour',
attrib={'id': 'chaptertour', 'title': 'Chapter Tour'}) attrib={'id': 'chaptertour', 'title': 'Chapter Tour'})
self.toc.to_opf1(tour) self.toc.to_opf1(tour)
guide = self.guide.to_opf1(package) self.guide.to_opf1(package)
return {OPF_MIME: ('content.opf', package)} return {OPF_MIME: ('content.opf', package)}
def _update_playorder(self, ncx): def _update_playorder(self, ncx):
@ -1597,10 +1624,10 @@ class OEBBook(object):
package = etree.Element(OPF('package'), package = etree.Element(OPF('package'),
attrib={'version': '2.0', 'unique-identifier': self.uid.id}, attrib={'version': '2.0', 'unique-identifier': self.uid.id},
nsmap={None: OPF2_NS}) nsmap={None: OPF2_NS})
metadata = self.metadata.to_opf2(package) self.metadata.to_opf2(package)
manifest = self.manifest.to_opf2(package) manifest = self.manifest.to_opf2(package)
spine = self.spine.to_opf2(package) spine = self.spine.to_opf2(package)
guide = self.guide.to_opf2(package) self.guide.to_opf2(package)
results[OPF_MIME] = ('content.opf', package) results[OPF_MIME] = ('content.opf', package)
id, href = self.manifest.generate('ncx', 'toc.ncx') id, href = self.manifest.generate('ncx', 'toc.ncx')
etree.SubElement(manifest, OPF('item'), id=id, href=href, etree.SubElement(manifest, OPF('item'), id=id, href=href,