IGN:Fix a few bugs and some code cleanup in oeb.base

2025-07-09 03:04:10 -04:00 · 2009-02-04 18:22:37 -08:00 · 2009-02-04 18:22:37 -08:00 · 70630c5dbc
commit 70630c5dbc
parent 6c93872167
2 changed files with 141 additions and 112 deletions
--- a/src/calibre/init.py
+++ b/src/calibre/init.py
@ -17,13 +17,15 @@ from calibre.constants import iswindows, isosx, islinux, isfrozen, \
                              filesystem_encoding
 import mechanize

-mimetypes.add_type('application/epub+zip', '.epub')
-mimetypes.add_type('text/x-sony-bbeb+xml', '.lrs')
-mimetypes.add_type('application/x-sony-bbeb', '.lrf')
-mimetypes.add_type('application/x-dtbncx+xml', '.ncx')
+mimetypes.add_type('application/epub+zip',                '.epub')
+mimetypes.add_type('text/x-sony-bbeb+xml',                '.lrs')
+mimetypes.add_type('application/x-sony-bbeb',             '.lrf')
+mimetypes.add_type('application/x-dtbncx+xml',            '.ncx')
 mimetypes.add_type('application/adobe-page-template+xml', '.xpgt')
-mimetypes.add_type('application/x-font-opentype', '.otf')
-mimetypes.add_type('application/x-font-truetype', '.ttf')
+mimetypes.add_type('application/x-font-opentype',         '.otf')
+mimetypes.add_type('application/x-font-truetype',         '.ttf')
+mimetypes.add_type('application/oebps-package+xml',       '.opf')
+

 def to_unicode(raw, encoding='utf-8', errors='strict'):
    if isinstance(raw, unicode):
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@ -6,20 +6,14 @@ from __future__ import with_statement
 __license__   = 'GPL v3'
 __copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'

-import os
-import sys
+import os, sys, re, uuid, copy
+from mimetypes import types_map, guess_type
 from collections import defaultdict
 from types import StringTypes
 from itertools import izip, count, chain
 from urlparse import urldefrag, urlparse, urlunparse
 from urllib import unquote as urlunquote
-import logging
-import re
-import uuid
-import copy
-import mimetypes
-from lxml import etree
-from lxml import html
+from lxml import etree, html
 import calibre
 from calibre import LoggingInterface
 from calibre.translations.dynamic import translate
@ -29,38 +23,60 @@ from calibre.ebooks.oeb.entitydefs import ENTITYDEFS
 from calibre.ebooks.metadata.epub import CoverRenderer
 from calibre.ptempfile import TemporaryDirectory

-XML_NS = 'http://www.w3.org/XML/1998/namespace'
-XHTML_NS = 'http://www.w3.org/1999/xhtml'
-OEB_DOC_NS = 'http://openebook.org/namespaces/oeb-document/1.0/'
-OPF1_NS = 'http://openebook.org/namespaces/oeb-package/1.0/'
-OPF2_NS = 'http://www.idpf.org/2007/opf'
-OPF_NSES = set([OPF1_NS, OPF2_NS])
-DC09_NS = 'http://purl.org/metadata/dublin_core'
-DC10_NS = 'http://purl.org/dc/elements/1.0/'
-DC11_NS = 'http://purl.org/dc/elements/1.1/'
-DC_NSES = set([DC09_NS, DC10_NS, DC11_NS])
-XSI_NS = 'http://www.w3.org/2001/XMLSchema-instance'
-DCTERMS_NS = 'http://purl.org/dc/terms/'
-NCX_NS = 'http://www.daisy.org/z3986/2005/ncx/'
-SVG_NS = 'http://www.w3.org/2000/svg'
-XLINK_NS = 'http://www.w3.org/1999/xlink'
-CALIBRE_NS = 'http://calibre.kovidgoyal.net/2009/metadata'
-XPNSMAP = {'h': XHTML_NS, 'o1': OPF1_NS, 'o2': OPF2_NS,
-           'd09': DC09_NS, 'd10': DC10_NS, 'd11': DC11_NS,
-           'xsi': XSI_NS, 'dt': DCTERMS_NS, 'ncx': NCX_NS,
-           'svg': SVG_NS, 'xl': XLINK_NS}
+XML_NS       = 'http://www.w3.org/XML/1998/namespace'
+XHTML_NS     = 'http://www.w3.org/1999/xhtml'
+OEB_DOC_NS   = 'http://openebook.org/namespaces/oeb-document/1.0/'
+OPF1_NS      = 'http://openebook.org/namespaces/oeb-package/1.0/'
+OPF2_NS      = 'http://www.idpf.org/2007/opf'
+OPF_NSES     = set([OPF1_NS, OPF2_NS])
+DC09_NS      = 'http://purl.org/metadata/dublin_core'
+DC10_NS      = 'http://purl.org/dc/elements/1.0/'
+DC11_NS      = 'http://purl.org/dc/elements/1.1/'
+DC_NSES      = set([DC09_NS, DC10_NS, DC11_NS])
+XSI_NS       = 'http://www.w3.org/2001/XMLSchema-instance'
+DCTERMS_NS   = 'http://purl.org/dc/terms/'
+NCX_NS       = 'http://www.daisy.org/z3986/2005/ncx/'
+SVG_NS       = 'http://www.w3.org/2000/svg'
+XLINK_NS     = 'http://www.w3.org/1999/xlink'
+CALIBRE_NS   = 'http://calibre.kovidgoyal.net/2009/metadata'
+XPNSMAP      = {
+                   'h'  : XHTML_NS, 'o1' : OPF1_NS,    'o2' : OPF2_NS,
+                   'd09': DC09_NS,  'd10': DC10_NS,    'd11': DC11_NS,
+                   'xsi': XSI_NS,   'dt' : DCTERMS_NS, 'ncx': NCX_NS,
+                   'svg': SVG_NS,   'xl' : XLINK_NS
+               }
 DC_PREFIXES = ('d11', 'd10', 'd09')

-def XML(name): return '{%s}%s' % (XML_NS, name)
-def XHTML(name): return '{%s}%s' % (XHTML_NS, name)
-def OPF(name): return '{%s}%s' % (OPF2_NS, name)
-def DC(name): return '{%s}%s' % (DC11_NS, name)
-def XSI(name): return '{%s}%s' % (XSI_NS, name)
-def DCTERMS(name): return '{%s}%s' % (DCTERMS_NS, name)
-def NCX(name): return '{%s}%s' % (NCX_NS, name)
-def SVG(name): return '{%s}%s' % (SVG_NS, name)
-def XLINK(name): return '{%s}%s' % (XLINK_NS, name)
-def CALIBRE(name): return '{%s}%s' % (CALIBRE_NS, name)
+
+def XML(name): 
+    return '{%s}%s' % (XML_NS, name)
+
+def XHTML(name): 
+    return '{%s}%s' % (XHTML_NS, name)
+
+def OPF(name): 
+    return '{%s}%s' % (OPF2_NS, name)
+
+def DC(name): 
+    return '{%s}%s' % (DC11_NS, name)
+
+def XSI(name): 
+    return '{%s}%s' % (XSI_NS, name)
+
+def DCTERMS(name): 
+    return '{%s}%s' % (DCTERMS_NS, name)
+
+def NCX(name): 
+    return '{%s}%s' % (NCX_NS, name)
+
+def SVG(name): 
+    return '{%s}%s' % (SVG_NS, name)
+
+def XLINK(name): 
+    return '{%s}%s' % (XLINK_NS, name)
+
+def CALIBRE(name): 
+    return '{%s}%s' % (CALIBRE_NS, name)

 def LINK_SELECTORS():
    results = []
@ -70,36 +86,37 @@ def LINK_SELECTORS():
                 'o2:page/@href'):
        results.append(etree.XPath(expr, namespaces=XPNSMAP))
    return results
+
 LINK_SELECTORS = LINK_SELECTORS()

-EPUB_MIME = 'application/epub+zip'
-XHTML_MIME = 'application/xhtml+xml'
-CSS_MIME = 'text/css'
-NCX_MIME = 'application/x-dtbncx+xml'
-OPF_MIME = 'application/oebps-package+xml'
-PAGE_MAP_MIME = 'application/oebps-page-map+xml'
-OEB_DOC_MIME = 'text/x-oeb1-document'
-OEB_CSS_MIME = 'text/x-oeb1-css'
-OPENTYPE_MIME = 'font/opentype'
-GIF_MIME = 'image/gif'
-JPEG_MIME = 'image/jpeg'
-PNG_MIME = 'image/png'
-SVG_MIME = 'image/svg+xml'
-BINARY_MIME = 'application/octet-stream'
+EPUB_MIME      = types_map['.epub']
+XHTML_MIME     = types_map['.xhtml']
+CSS_MIME       = types_map['.css']
+NCX_MIME       = types_map['.ncx']
+OPF_MIME       = types_map['.opf']
+PAGE_MAP_MIME  = 'application/oebps-page-map+xml'
+OEB_DOC_MIME   = 'text/x-oeb1-document'
+OEB_CSS_MIME   = 'text/x-oeb1-css'
+OPENTYPE_MIME  = 'font/opentype' # Shouldn't this be 'application/x-font-opentype' as opentype/font doesn't actually exist in the IETF?
+GIF_MIME       = types_map['.gif']
+JPEG_MIME      = types_map['.jpeg']
+PNG_MIME       = types_map['.png']
+SVG_MIME       = types_map['.svg']
+BINARY_MIME    = 'application/octet-stream'

-OEB_STYLES = set([CSS_MIME, OEB_CSS_MIME, 'text/x-oeb-css'])
-OEB_DOCS = set([XHTML_MIME, 'text/html', OEB_DOC_MIME, 'text/x-oeb-document'])
+OEB_STYLES        = set([CSS_MIME, OEB_CSS_MIME, 'text/x-oeb-css'])
+OEB_DOCS          = set([XHTML_MIME, 'text/html', OEB_DOC_MIME, 'text/x-oeb-document'])
 OEB_RASTER_IMAGES = set([GIF_MIME, JPEG_MIME, PNG_MIME])
-OEB_IMAGES = set([GIF_MIME, JPEG_MIME, PNG_MIME, SVG_MIME])
+OEB_IMAGES        = set([GIF_MIME, JPEG_MIME, PNG_MIME, SVG_MIME])

 MS_COVER_TYPE = 'other.ms-coverimage-standard'

-ENTITY_RE = re.compile(r'&([a-zA-Z_:][a-zA-Z0-9.-_:]+);')
-COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+')
-QNAME_RE = re.compile(r'^[{][^{}]+[}][^{}]+$')
+ENTITY_RE     = re.compile(r'&([a-zA-Z_:][a-zA-Z0-9.-_:]+);')
+COLLAPSE_RE   = re.compile(r'[ \t\r\n\v]+')
+QNAME_RE      = re.compile(r'^[{][^{}]+[}][^{}]+$')
 PREFIXNAME_RE = re.compile(r'^[^:]+[:][^:]+')
-XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
-CSSURL_RE = re.compile(r'''url[(](?P<q>["']?)(?P<url>[^)]+)(?P=q)[)]''')
+XMLDECL_RE    = re.compile(r'^\s*<[?]xml.*?[?]>')
+CSSURL_RE     = re.compile(r'''url[(](?P<q>["']?)(?P<url>[^)]+)(?P=q)[)]''')

 RECOVER_PARSER = etree.XMLParser(recover=True)

@ -153,13 +170,14 @@ def xpath(elem, expr):
 def xml2str(root):
    return etree.tostring(root, encoding='utf-8', xml_declaration=True)

-ASCII_CHARS = set(chr(x) for x in xrange(128))
+ASCII_CHARS   = set(chr(x) for x in xrange(128))
 UNIBYTE_CHARS = set(chr(x) for x in xrange(256))
-URL_SAFE = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
-               'abcdefghijklmnopqrstuvwxyz'
-               '0123456789' '_.-/~')
+URL_SAFE      = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
+                    'abcdefghijklmnopqrstuvwxyz'
+                    '0123456789' '_.-/~')
 URL_UNSAFE = [ASCII_CHARS - URL_SAFE, UNIBYTE_CHARS - URL_SAFE]
-def urlquote(href):
+
+def urlquote(href): # Why do you have a private implementation of urlquote?
    result = []
    unsafe = 0 if isinstance(href, unicode) else 1
    unsafe = URL_UNSAFE[unsafe]
@ -245,24 +263,26 @@ class DirWriter(object):


 class Metadata(object):
-    DC_TERMS = set(['contributor', 'coverage', 'creator', 'date',
+    DC_TERMS      = set([
+                    'contributor', 'coverage', 'creator', 'date',
                    'description', 'format', 'identifier', 'language',
                    'publisher', 'relation', 'rights', 'source', 'subject',
-                    'title', 'type'])
+                    'title', 'type'
+                    ])
    CALIBRE_TERMS = set(['series', 'series_index', 'rating'])
-    OPF_ATTRS = {'role': OPF('role'), 'file-as': OPF('file-as'),
-                 'scheme': OPF('scheme'), 'event': OPF('event'),
-                 'type': XSI('type'), 'lang': XML('lang'), 'id': 'id'}
-    OPF1_NSMAP = {'dc': DC11_NS, 'oebpackage': OPF1_NS}
-    OPF2_NSMAP = {'opf': OPF2_NS, 'dc': DC11_NS, 'dcterms': DCTERMS_NS,
-                  'xsi': XSI_NS, 'calibre': CALIBRE_NS}
+    OPF_ATTRS     = {'role': OPF('role'), 'file-as': OPF('file-as'),
+                     'scheme': OPF('scheme'), 'event': OPF('event'),
+                     'type': XSI('type'), 'lang': XML('lang'), 'id': 'id'}
+    OPF1_NSMAP    = {'dc': DC11_NS, 'oebpackage': OPF1_NS}
+    OPF2_NSMAP    = {'opf': OPF2_NS, 'dc': DC11_NS, 'dcterms': DCTERMS_NS,
+                     'xsi': XSI_NS, 'calibre': CALIBRE_NS}
    
    class Item(object):
+        
        class Attribute(object):
+            
            def __init__(self, attr, allowed=None):
-                if not callable(attr):
-                    attr_, attr = attr, lambda term: attr_
-                self.attr = attr
+                self.attr = attr if callable(attr) else lambda x: attr
                self.allowed = allowed
            
            def term_attr(self, obj):
@ -309,17 +329,14 @@ class Metadata(object):
                if attr != nsattr:
                    attrib[nsattr] = attrib.pop(attr)
        
-        def scheme(term):
-            if term == OPF('meta'):
-                return 'scheme'
-            return OPF('scheme')
-        scheme = Attribute(scheme, [DC('identifier'), OPF('meta')])
+        scheme  = Attribute(lambda term : 'scheme' if term == OPF('meta') else OPF('scheme'), 
+                           [DC('identifier'), OPF('meta')])
        file_as = Attribute(OPF('file-as'), [DC('creator'), DC('contributor')])
-        role = Attribute(OPF('role'), [DC('creator'), DC('contributor')])
-        event = Attribute(OPF('event'), [DC('date')])
-        id = Attribute('id')
-        type = Attribute(XSI('type'), [DC('date'), DC('format'), DC('type')])
-        lang = Attribute(XML('lang'), [DC('contributor'), DC('coverage'),
+        role    = Attribute(OPF('role'), [DC('creator'), DC('contributor')])
+        event   = Attribute(OPF('event'), [DC('date')])
+        id      = Attribute('id')
+        type    = Attribute(XSI('type'), [DC('date'), DC('format'), DC('type')])
+        lang    = Attribute(XML('lang'), [DC('contributor'), DC('coverage'),
                                       DC('creator'), DC('publisher'),
                                       DC('relation'), DC('rights'),
                                       DC('source'), DC('subject'),
@ -400,6 +417,7 @@ class Metadata(object):
    def __getattr__(self, term):
        return self.items[term]

+    @apply
    def _nsmap():
        def fget(self):
            nsmap = {}
@ -408,8 +426,8 @@ class Metadata(object):
                    nsmap.update(item.nsmap)
            return nsmap
        return property(fget=fget)
-    _nsmap = _nsmap()        
    
+    @apply
    def _opf1_nsmap():
        def fget(self):
            nsmap = self._nsmap
@ -418,15 +436,16 @@ class Metadata(object):
                    del nsmap[key]
            return nsmap
        return property(fget=fget)
-    _opf1_nsmap = _opf1_nsmap()
    
+    
+    @apply
    def _opf2_nsmap():
        def fget(self):
            nsmap = self._nsmap
            nsmap.update(self.OPF2_NSMAP)
            return nsmap
        return property(fget=fget)
-    _opf2_nsmap = _opf2_nsmap()
+    
    
    def to_opf1(self, parent=None):
        nsmap = self._opf1_nsmap
@ -453,7 +472,9 @@ class Metadata(object):


 class Manifest(object):
+    
    class Item(object):
+    
        NUM_RE = re.compile('^(.*)([0-9][0-9.]*)(?=[.]|$)')
        META_XP = XPath('/h:html/h:head/h:meta[@http-equiv="Content-Type"]')
    
@ -553,6 +574,7 @@ class Manifest(object):
                etree.SubElement(data, XHTML('body'))
            return data
        
+        @apply
        def data():
            def fget(self):
                if self._data is not None:
@ -571,8 +593,7 @@ class Manifest(object):
            def fdel(self):
                self._data = None
            return property(fget, fset, fdel)
-        data = data()
-        
+                
        def __str__(self):
            data = self.data
            if isinstance(data, etree._Element):
@ -718,6 +739,7 @@ class Manifest(object):


 class Spine(object):
+    
    def __init__(self, oeb):
        self.oeb = oeb
        self.items = []
@ -783,7 +805,9 @@ class Spine(object):


 class Guide(object):
+    
    class Reference(object):
+        
        _TYPES_TITLES = [('cover', __('Cover')),
                         ('title-page', __('Title Page')),
                         ('toc', __('Table of Contents')),
@ -822,24 +846,24 @@ class Guide(object):
            return 'Reference(type=%r, title=%r, href=%r)' \
                % (self.type, self.title, self.href)
        
+        @apply
        def _order():
            def fget(self):
                return self.ORDER.get(self.type, self.type)
            return property(fget=fget)
-        _order = _order()
        
        def __cmp__(self, other):
            if not isinstance(other, Guide.Reference):
                return NotImplemented
            return cmp(self._order, other._order)
        
+        @apply
        def item():
            def fget(self):
-                path, frag = urldefrag(self.href)
+                path = urldefrag(self.href)[0]
                hrefs = self.oeb.manifest.hrefs
                return hrefs.get(path, None)
            return property(fget=fget)
-        item = item()
    
    def __init__(self, oeb):
        self.oeb = oeb
@ -894,6 +918,7 @@ class Guide(object):


 class TOC(object):
+    # This needs beefing up to support the interface of toc.TOC
    def __init__(self, title=None, href=None, klass=None, id=None):
        self.title = title
        self.href = urlnormalize(href) if href else href
@ -956,6 +981,7 @@ class TOC(object):


 class PageList(object):
+    
    class Page(object):
        def __init__(self, name, href, type='normal', klass=None, id=None):
            self.name = name
@ -977,7 +1003,7 @@ class PageList(object):
    
    def __iter__(self):
        for page in self.pages:
-            yield node
+            yield page
    
    def __getitem__(self, index):
        return self.pages[index]
@ -1006,7 +1032,8 @@ class PageList(object):


 class OEBBook(object):
-    COVER_SVG_XP = XPath('h:body//svg:svg[position() = 1]')
+    
+    COVER_SVG_XP    = XPath('h:body//svg:svg[position() = 1]')
    COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]')

    def __init__(self, opfpath=None, container=None, encoding=None,
@ -1148,7 +1175,7 @@ class OEBBook(object):
                    continue
                self.logger.warn('Referenced file %r not in manifest' % href)
                id, _ = manifest.generate(id='added')
-                guessed = mimetypes.guess_type(href)[0]
+                guessed = guess_type(href)[0]
                media_type = guessed or BINARY_MIME
                added = manifest.add(id, href, media_type)
                unchecked.add(added)
@ -1162,7 +1189,7 @@ class OEBBook(object):
            if media_type is None:
                media_type = elem.get('mediatype', None)
            if media_type is None or media_type == 'text/xml':
-                guessed = mimetypes.guess_type(href)[0]
+                guessed = guess_type(href)[0]
                media_type = guessed or media_type or BINARY_MIME
            fallback = elem.get('fallback')
            if href in manifest.hrefs:
@ -1227,7 +1254,7 @@ class OEBBook(object):
        self.guide = guide = Guide(self)
        for elem in xpath(opf, '/o2:package/o2:guide/o2:reference'):
            href = elem.get('href')
-            path, frag = urldefrag(href)
+            path = urldefrag(href)[0]
            if path not in self.manifest.hrefs:
                self.logger.warn(u'Guide reference %r not found' % href)
                continue
@ -1524,14 +1551,14 @@ class OEBBook(object):
    def to_opf1(self):
        package = etree.Element('package',
            attrib={'unique-identifier': self.uid.id})
-        metadata = self.metadata.to_opf1(package)
-        manifest = self.manifest.to_opf1(package)
-        spine = self.spine.to_opf1(package)
+        self.metadata.to_opf1(package)
+        self.manifest.to_opf1(package)
+        self.spine.to_opf1(package)
        tours = element(package, 'tours')
        tour = element(tours, 'tour',
            attrib={'id': 'chaptertour', 'title': 'Chapter Tour'})
        self.toc.to_opf1(tour)
-        guide = self.guide.to_opf1(package)
+        self.guide.to_opf1(package)
        return {OPF_MIME: ('content.opf', package)}

    def _update_playorder(self, ncx):
@ -1597,10 +1624,10 @@ class OEBBook(object):
        package = etree.Element(OPF('package'),
            attrib={'version': '2.0', 'unique-identifier': self.uid.id},
            nsmap={None: OPF2_NS})
-        metadata = self.metadata.to_opf2(package)
+        self.metadata.to_opf2(package)
        manifest = self.manifest.to_opf2(package)
        spine = self.spine.to_opf2(package)
-        guide = self.guide.to_opf2(package)
+        self.guide.to_opf2(package)
        results[OPF_MIME] = ('content.opf', package)
        id, href = self.manifest.generate('ncx', 'toc.ncx')
        etree.SubElement(manifest, OPF('item'), id=id, href=href,