Kovid Goyal c439cc94f1 ...
2013-04-12 09:03:54 +05:30

1882 lines
68 KiB
Python

'''
Basic support for manipulating OEB 1.x/2.0 content and metadata.
'''
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
__docformat__ = 'restructuredtext en'
import os, re, uuid, logging
from collections import defaultdict
from itertools import count
from urlparse import urldefrag, urlparse, urlunparse, urljoin
from urllib import unquote as urlunquote
from lxml import etree, html
from calibre.constants import filesystem_encoding, __version__
from calibre.translations.dynamic import translate
from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.conversion.preprocess import CSSPreProcessor
from calibre import (isbytestring, as_unicode, get_types_map)
from calibre.ebooks.oeb.parse_utils import (barename, XHTML_NS, RECOVER_PARSER,
namespace, XHTML, parse_html, NotHTML)
# Namespace URIs recognised while reading and writing OEB/OPF packages.
XML_NS = 'http://www.w3.org/XML/1998/namespace'
OEB_DOC_NS = 'http://openebook.org/namespaces/oeb-document/1.0/'

# OPF package namespaces (1.x and 2.0) and the set of both.
OPF1_NS = 'http://openebook.org/namespaces/oeb-package/1.0/'
OPF2_NS = 'http://www.idpf.org/2007/opf'
OPF_NSES = {OPF1_NS, OPF2_NS}

# Dublin Core namespace variants and the set of all of them.
DC09_NS = 'http://purl.org/metadata/dublin_core'
DC10_NS = 'http://purl.org/dc/elements/1.0/'
DC11_NS = 'http://purl.org/dc/elements/1.1/'
DC_NSES = {DC09_NS, DC10_NS, DC11_NS}

XSI_NS = 'http://www.w3.org/2001/XMLSchema-instance'
DCTERMS_NS = 'http://purl.org/dc/terms/'
NCX_NS = 'http://www.daisy.org/z3986/2005/ncx/'
SVG_NS = 'http://www.w3.org/2000/svg'
XLINK_NS = 'http://www.w3.org/1999/xlink'
CALIBRE_NS = 'http://calibre.kovidgoyal.net/2009/metadata'
RE_NS = 'http://exslt.org/regular-expressions'
MBP_NS = 'http://www.mobipocket.com'

# Prefix -> namespace URI map passed to XPath evaluation (see XPath()/xpath()).
XPNSMAP = {
    'h': XHTML_NS,
    'o1': OPF1_NS,
    'o2': OPF2_NS,
    'd09': DC09_NS,
    'd10': DC10_NS,
    'd11': DC11_NS,
    'xsi': XSI_NS,
    'dt': DCTERMS_NS,
    'ncx': NCX_NS,
    'svg': SVG_NS,
    'xl': XLINK_NS,
    're': RE_NS,
    'mbp': MBP_NS,
    'calibre': CALIBRE_NS,
}

# Prefix -> namespace URI maps used when emitting OPF 1.x / OPF 2.0 metadata.
OPF1_NSMAP = {
    'dc': DC11_NS,
    'oebpackage': OPF1_NS,
}
OPF2_NSMAP = {
    'opf': OPF2_NS,
    'dc': DC11_NS,
    'dcterms': DCTERMS_NS,
    'xsi': XSI_NS,
    'calibre': CALIBRE_NS,
}
def XML(name):
    """Return ``name`` qualified as ``{XML_NS}name``."""
    qualified = '{%s}%s' % (XML_NS, name)
    return qualified
def OPF(name):
    """Return ``name`` qualified as ``{OPF2_NS}name``."""
    qualified = '{%s}%s' % (OPF2_NS, name)
    return qualified
def DC(name):
    """Return ``name`` qualified as ``{DC11_NS}name``."""
    qualified = '{%s}%s' % (DC11_NS, name)
    return qualified
def XSI(name):
    """Return ``name`` qualified as ``{XSI_NS}name``."""
    qualified = '{%s}%s' % (XSI_NS, name)
    return qualified
def DCTERMS(name):
    """Return ``name`` qualified as ``{DCTERMS_NS}name``."""
    qualified = '{%s}%s' % (DCTERMS_NS, name)
    return qualified
def NCX(name):
    """Return ``name`` qualified as ``{NCX_NS}name``."""
    qualified = '{%s}%s' % (NCX_NS, name)
    return qualified
def SVG(name):
    """Return ``name`` qualified as ``{SVG_NS}name``."""
    qualified = '{%s}%s' % (SVG_NS, name)
    return qualified
def XLINK(name):
    """Return ``name`` qualified as ``{XLINK_NS}name``."""
    qualified = '{%s}%s' % (XLINK_NS, name)
    return qualified
def CALIBRE(name):
    """Return ``name`` qualified as ``{CALIBRE_NS}name``."""
    qualified = '{%s}%s' % (CALIBRE_NS, name)
    return qualified
# ``url(...)`` references in CSS text (case-insensitive). Group 1 is the
# target, without a single optional quote character on either side.
_css_url_re = re.compile(
    r'url\s*\([\'"]{0,1}(.*?)[\'"]{0,1}\)',
    re.IGNORECASE)

# Double-quoted ``@import "..."`` targets. Group 1 is the quoted text.
_css_import_re = re.compile(
    r'@import "(.*?)"')

# A run of non-space characters; used to split the space separated URL list
# in an <object archive="..."> attribute.
_archive_re = re.compile(
    r'[^ ]+')
# Tags that should not be self closed in epub output
self_closing_bad_tags = {
    'a', 'abbr', 'address', 'article', 'aside', 'audio',
    'b', 'bdo', 'blockquote', 'body', 'button',
    'cite', 'code',
    'dd', 'del', 'details', 'dfn', 'div', 'dl', 'dt',
    'em',
    'fieldset', 'figcaption', 'figure', 'footer',
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup',
    'i', 'ins',
    'kbd',
    'label', 'legend', 'li',
    'map', 'mark', 'meter',
    'nav',
    'ol', 'output',
    'p', 'pre', 'progress',
    'q',
    'rp', 'rt',
    'samp', 'section', 'select', 'small', 'span', 'strong', 'sub',
    'summary', 'sup',
    'textarea', 'time',
    'ul',
    'var', 'video',
}

# Matches <tag .../> for any of the tags above. The lookahead requires the
# tag name to be followed by whitespace or '/', so 'a' cannot match '<abbr'.
_self_closing_pat = re.compile(
    r'<(?P<tag>%s)(?=[\s/])(?P<arg>[^>]*)/>' % (
        '|'.join(self_closing_bad_tags),),
    re.IGNORECASE)


def close_self_closing_tags(raw):
    """Expand ``<tag .../>`` into ``<tag ...></tag>`` in ``raw``.

    Only tags listed in ``self_closing_bad_tags`` are rewritten; the tag name
    and its attribute text are copied through unchanged.
    """
    expanded = _self_closing_pat.sub(r'<\g<tag>\g<arg>></\g<tag>>', raw)
    return expanded
def uuid_id():
return 'u'+unicode(uuid.uuid4())
def iterlinks(root, find_links_in_css=True):
    '''
    Iterate over all links in a OEB Document.

    Yields ``(element, attribute, link, pos)`` tuples. ``attribute`` is the
    name of the attribute holding the link, or None when the link was found
    in the text of a ``<style>`` element; ``pos`` is the offset of the link
    inside that attribute value or text.

    :param root: A valid lxml.etree element.
    :param find_links_in_css: When false, ``<style>`` text and ``style``
        attributes are not searched for ``url()``/``@import`` links.
    '''
    assert etree.iselement(root)
    known_link_attrs = set(html.defs.link_attrs)
    known_link_attrs.add(XLINK('href'))
    object_tag = XHTML('object')
    style_tag = XHTML('style')

    for el in root.iter():
        attribs = el.attrib
        try:
            tag = el.tag
        except UnicodeDecodeError:
            continue

        if tag != object_tag:
            for attr in attribs:
                if attr in known_link_attrs:
                    yield (el, attr, attribs[attr], 0)
        else:
            ## <object> tags have attributes that are relative to
            ## codebase
            codebase = None
            if 'codebase' in attribs:
                codebase = el.get('codebase')
                yield (el, 'codebase', codebase, 0)
            for attrib in ('classid', 'data'):
                if attrib not in attribs:
                    continue
                value = el.get(attrib)
                if codebase is not None:
                    value = urljoin(codebase, value)
                yield (el, attrib, value, 0)
            if 'archive' in attribs:
                # archive holds a space separated list of URLs
                for match in _archive_re.finditer(el.get('archive')):
                    value = match.group(0)
                    if codebase is not None:
                        value = urljoin(codebase, value)
                    yield (el, 'archive', value, match.start())

        if find_links_in_css:
            if tag == style_tag and el.text:
                for pattern in (_css_url_re, _css_import_re):
                    for match in pattern.finditer(el.text):
                        yield (el, None, match.group(1), match.start(1))
            if 'style' in attribs:
                for match in _css_url_re.finditer(attribs['style']):
                    yield (el, 'style', match.group(1), match.start(1))
def make_links_absolute(root, base_url):
    '''
    Rewrite every link in the document as an absolute URL by joining it
    with ``base_url`` (the full URL where the document came from).
    '''
    rewrite_links(root, lambda href: urljoin(base_url, href))
def resolve_base_href(root):
    '''
    Remove every ``<base href="...">`` element (namespaced or not) from the
    document and make all links absolute relative to the href of the last
    one found. Does nothing further when no non-empty base href exists.

    :param root: The lxml element tree root to process in place.
    '''
    base_href = None
    basetags = root.xpath('//base[@href]|//h:base[@href]',
            namespaces=XPNSMAP)
    for b in basetags:
        base_href = b.get('href')
        # extract() performs the same removal as lxml.html's drop_tree(),
        # but also works for plain lxml.etree elements, which have no
        # drop_tree() method.
        extract(b)
    if not base_href:
        return
    # make_links_absolute() accepts only (root, base_url); the previous
    # resolve_base_href=False keyword raised a TypeError.
    make_links_absolute(root, base_href)
# The ``resolve_base_href`` parameter of rewrite_links() shadows the module
# level function of the same name; keep a private reference so the function
# can still be called from inside rewrite_links().
_resolve_base_href = resolve_base_href


def rewrite_links(root, link_repl_func, resolve_base_href=False):
    '''
    Rewrite all the links in the document. For each link
    ``link_repl_func(link)`` will be called, and the return value
    will replace the old link.

    Note that links may not be absolute (unless you first called
    ``make_links_absolute()``), and may be internal (e.g.,
    ``'#anchor'``). They can also be values like
    ``'mailto:email'`` or ``'javascript:expr'``.

    If the ``link_repl_func`` returns None, the attribute or
    tag text will be removed completely.

    :param root: The lxml element tree root, modified in place.
    :param link_repl_func: Callable mapping an old link to its replacement.
    :param resolve_base_href: If true, ``<base href>`` elements are applied
        and removed before links are rewritten.
    '''
    from cssutils import replaceUrls, log, CSSParser
    log.setLevel(logging.WARN)
    log.raiseExceptions = False

    if resolve_base_href:
        # The parameter is a flag here, so the module function must be
        # reached through its private alias.
        _resolve_base_href(root)

    # Pass 1: links stored directly in attributes.
    for el, attrib, link, pos in iterlinks(root, find_links_in_css=False):
        new_link = link_repl_func(link.strip())
        if new_link == link:
            continue
        if new_link is None:
            # Remove the attribute or element content
            if attrib is None:
                el.text = ''
            else:
                del el.attrib[attrib]
            continue
        if attrib is None:
            new = el.text[:pos] + new_link + el.text[pos+len(link):]
            el.text = new
        else:
            cur = el.attrib[attrib]
            if not pos and len(cur) == len(link):
                # Most common case
                el.attrib[attrib] = new_link
            else:
                new = cur[:pos] + new_link + cur[pos+len(link):]
                el.attrib[attrib] = new

    # Pass 2: links embedded in CSS, rewritten via cssutils. The fetcher
    # returns nothing so that @import targets are never loaded.
    parser = CSSParser(raiseExceptions=False, log=_css_logger,
            fetcher=lambda x:(None, None))
    for el in root.iter(etree.Element):
        try:
            tag = el.tag
        except UnicodeDecodeError:
            continue

        if tag == XHTML('style') and el.text and \
                (_css_url_re.search(el.text) is not None or '@import' in
                        el.text):
            stylesheet = parser.parseString(el.text, validate=False)
            replaceUrls(stylesheet, link_repl_func)
            repl = stylesheet.cssText
            if isbytestring(repl):
                repl = repl.decode('utf-8')
            el.text = '\n'+ repl + '\n'

        if 'style' in el.attrib:
            text = el.attrib['style']
            if _css_url_re.search(text) is not None:
                try:
                    stext = parser.parseStyle(text, validate=False)
                except Exception:
                    # Parsing errors are raised by cssutils; leave the
                    # attribute untouched. (Was a bare except, which also
                    # swallowed KeyboardInterrupt/SystemExit.)
                    continue
                replaceUrls(stext, link_repl_func)
                # Inline style attributes must stay on a single line
                repl = stext.cssText.replace('\n', ' ').replace('\r',
                        ' ')
                if isbytestring(repl):
                    repl = repl.decode('utf-8')
                el.attrib['style'] = repl
# MIME types looked up from calibre's extension -> MIME type map.
types_map = get_types_map()
EPUB_MIME = types_map['.epub']
XHTML_MIME = types_map['.xhtml']
CSS_MIME = types_map['.css']
NCX_MIME = types_map['.ncx']
OPF_MIME = types_map['.opf']
OPENTYPE_MIME = types_map['.otf']
GIF_MIME = types_map['.gif']
JPEG_MIME = types_map['.jpeg']
PNG_MIME = types_map['.png']
SVG_MIME = types_map['.svg']

# MIME types that are fixed strings rather than map lookups.
PAGE_MAP_MIME = 'application/oebps-page-map+xml'
OEB_DOC_MIME = 'text/x-oeb1-document'
OEB_CSS_MIME = 'text/x-oeb1-css'
BINARY_MIME = 'application/octet-stream'

# @namespace rule binding the default CSS namespace to XHTML.
XHTML_CSS_NAMESPACE = u'@namespace "%s";\n' % XHTML_NS

# Groups of equivalent media types: stylesheets, documents and images.
OEB_STYLES = {CSS_MIME, OEB_CSS_MIME, 'text/x-oeb-css', 'xhtml/css'}
OEB_DOCS = {XHTML_MIME, 'text/html', OEB_DOC_MIME, 'text/x-oeb-document'}
OEB_RASTER_IMAGES = {GIF_MIME, JPEG_MIME, PNG_MIME}
OEB_IMAGES = OEB_RASTER_IMAGES | {SVG_MIME}

MS_COVER_TYPE = 'other.ms-coverimage-standard'
# A named entity reference such as ``&amp;``; group 1 is the entity name.
# The '-' is placed last in the character class so that it is a literal
# hyphen: written as ``.-_`` it formed a range from '.' to '_' that also
# matched characters such as ';', '<', '>' and '=', and excluded '-' itself.
ENTITY_RE = re.compile(r'&([a-zA-Z_:][a-zA-Z0-9._:-]+);')
# One or more whitespace characters, for collapsing runs of whitespace.
COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+')
# A fully qualified name of the form ``{namespace}local``.
QNAME_RE = re.compile(r'^[{][^{}]+[}][^{}]+$')
# A prefixed name of the form ``prefix:local``.
PREFIXNAME_RE = re.compile(r'^[^:]+[:][^:]+')
# A leading XML declaration (``<?xml ... ?>``), allowing initial whitespace.
XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
# A CSS ``url(...)`` with optional matching quotes; the target is group 'url'.
CSSURL_RE = re.compile(r'''url[(](?P<q>["']?)(?P<url>[^)]+)(?P=q)[)]''')
def element(parent, *args, **kwargs):
    """Create an lxml element from ``args``/``kwargs``.

    The new element is appended to ``parent`` as a sub-element, or created
    stand-alone when ``parent`` is None.
    """
    if parent is None:
        return etree.Element(*args, **kwargs)
    return etree.SubElement(parent, *args, **kwargs)
def prefixname(name, nsrmap):
    """Convert a ``{namespace}local`` name to ``prefix:local`` form.

    ``nsrmap`` maps namespace URIs to prefixes. ``name`` is returned
    unchanged when it is not a qualified name or its namespace is not in
    ``nsrmap``; a namespace mapped to an empty prefix yields the bare local
    name.
    """
    if isqname(name):
        ns = namespace(name)
        if ns in nsrmap:
            prefix = nsrmap[ns]
            local = barename(name)
            if prefix:
                return ':'.join((prefix, local))
            return local
    return name
def isprefixname(name):
    """Tell whether ``name`` looks like ``prefix:local``.

    A falsy ``name`` (None, empty string) is returned as is; otherwise the
    result is a bool.
    """
    if not name:
        return name
    return PREFIXNAME_RE.match(name) is not None
def qname(name, nsmap):
    """Convert a ``prefix:local`` name to ``{namespace}local`` form.

    ``nsmap`` maps prefixes to namespace URIs. ``name`` is returned
    unchanged when it is not a prefixed name or its prefix is not in
    ``nsmap``.
    """
    if isprefixname(name):
        prefix, local = name.split(':', 1)
        if prefix in nsmap:
            return '{%s}%s' % (nsmap[prefix], local)
    return name
def isqname(name):
    """Tell whether ``name`` has the form ``{namespace}local``.

    A falsy ``name`` (None, empty string) is returned as is; otherwise the
    result is a bool.
    """
    if not name:
        return name
    return QNAME_RE.match(name) is not None
def XPath(expr):
    """Compile ``expr`` into an ``etree.XPath`` using the XPNSMAP prefixes."""
    compiled = etree.XPath(expr, namespaces=XPNSMAP)
    return compiled
def xpath(elem, expr):
    """Evaluate XPath ``expr`` against ``elem`` using the XPNSMAP prefixes."""
    result = elem.xpath(expr, namespaces=XPNSMAP)
    return result
def xml2str(root, pretty_print=False, strip_comments=False, with_tail=True):
    """Serialize ``root`` as UTF-8 XML including an XML declaration.

    :param pretty_print: Passed through to ``etree.tostring``.
    :param strip_comments: When true, every ``<!-- ... -->`` span is removed
        from the serialized output.
    :param with_tail: Passed through to ``etree.tostring``.
    """
    serialized = etree.tostring(
        root, encoding='utf-8', xml_declaration=True,
        pretty_print=pretty_print, with_tail=with_tail)
    if not strip_comments:
        return serialized
    return re.sub(r'<!--.*?-->', '', serialized, flags=re.DOTALL)
def xml2unicode(root, pretty_print=False):
    """Serialize ``root`` with ``etree.tostring`` using its default encoding.

    NOTE(review): no ``encoding`` is passed, so the result presumably is
    lxml's default ASCII byte string rather than a unicode object - confirm
    before relying on the return type.
    """
    serialized = etree.tostring(root, pretty_print=pretty_print)
    return serialized
def xml2text(elem):
    """Return the text content of ``elem`` as unicode, without its tail."""
    text = etree.tostring(
        elem, method='text', encoding=unicode, with_tail=False)
    return text
def serialize(data, media_type, pretty_print=False):
    """Convert parsed manifest item ``data`` to a byte string.

    - lxml elements are serialized with xml2str(); for media types in
      OEB_DOCS, self closed tags are then expanded.
    - unicode strings are encoded as UTF-8.
    - objects with a ``cssText`` attribute yield that text (UTF-8 encoded if
      unicode) followed by a newline.
    - anything else is passed to ``bytes()``.
    """
    if isinstance(data, etree._Element):
        serialized = xml2str(data, pretty_print=pretty_print)
        if media_type not in OEB_DOCS:
            return serialized
        # Convert self closing div|span|a|video|audio|iframe|etc tags
        # to normally closed ones, as they are interpreted
        # incorrectly by some browser based renderers
        return close_self_closing_tags(serialized)

    if isinstance(data, unicode):
        return data.encode('utf-8')

    if not hasattr(data, 'cssText'):
        return bytes(data)

    css = data.cssText
    if isinstance(css, unicode):
        css = css.encode('utf-8')
    return css + b'\n'
ASCII_CHARS = set(chr(x) for x in xrange(128))
UNIBYTE_CHARS = set(chr(x) for x in xrange(256))
URL_SAFE = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
'abcdefghijklmnopqrstuvwxyz'
'0123456789' '_.-/~')
URL_UNSAFE = [ASCII_CHARS - URL_SAFE, UNIBYTE_CHARS - URL_SAFE]
def urlquote(href):
    """Quote URL-unsafe characters, allowing IRI-safe characters.

    For unicode input only unsafe ASCII characters are percent-escaped, so
    other characters pass through; for byte strings every character outside
    URL_SAFE is escaped.
    """
    unsafe = URL_UNSAFE[0] if isinstance(href, unicode) else URL_UNSAFE[1]
    return ''.join([
        ("%%%02x" % ord(char)) if char in unsafe else char
        for char in href])
def urlnormalize(href):
    """Convert a URL into normalized form, with all and only URL-unsafe
    characters URL quoted.

    URLs without a scheme, or with the ``file`` scheme, are reduced to their
    path and fragment. In every component backslashes become forward
    slashes, and the component is unquoted and then re-quoted with
    urlquote().
    """
    parts = urlparse(href)
    if parts.scheme in ('', None, 'file'):
        path, frag = urldefrag(href)
        parts = ('', '', path, '', '', frag)
    normalized = []
    for part in parts:
        part = part.replace('\\', '/')
        part = urlunquote(part)
        normalized.append(urlquote(part))
    return urlunparse(normalized)
def extract(elem):
    """
    Detach ``elem`` (with its children and text) from its parent.

    Any tail text of ``elem`` is kept: it is appended to the tail of the
    preceding sibling, or to the parent's text when there is no preceding
    sibling. Elements without a parent are left alone.
    """
    parent = elem.getparent()
    if parent is None:
        return
    tail = elem.tail
    if tail:
        sibling = elem.getprevious()
        if sibling is not None:
            sibling.tail = (sibling.tail or '') + tail
        else:
            parent.text = (parent.text or '') + tail
    parent.remove(elem)
class DummyHandler(logging.Handler):
    """Logging handler that forwards records to an optional ``log`` object.

    Records are formatted as their bare message. While ``self.log`` is None
    they are dropped; otherwise records at ERROR level or above go to
    ``self.log.error`` and all others to ``self.log.warn``.
    """

    def __init__(self):
        logging.Handler.__init__(self, logging.WARNING)
        self.setFormatter(logging.Formatter('%(message)s'))
        self.log = None

    def emit(self, record):
        target = self.log
        if target is None:
            return
        text = self.format(record)
        if record.levelno >= logging.ERROR:
            target.error(text)
        else:
            target.warn(text)
# Logger passed to the cssutils parsers created in this module. Records at
# WARNING level or above reach _css_log_handler, which drops them until its
# ``log`` attribute is set to an object with ``error``/``warn`` methods.
_css_logger = logging.getLogger('calibre.css')
_css_logger.setLevel(logging.WARNING)
_css_log_handler = DummyHandler()
_css_logger.addHandler(_css_log_handler)
class OEBError(Exception):
    """Generic OEB-processing error."""
    pass


class NullContainer(object):
    """An empty container.

    For use with book formats which do not support container-like access.
    Reading or writing always raises :class:`OEBError`; no path exists and
    the name list is empty.
    """

    def __init__(self, log):
        self.log = log

    def read(self, path):
        """Always raise :class:`OEBError`."""
        raise OEBError('Attempt to read from NullContainer')

    def write(self, path, data=None):
        """Always raise :class:`OEBError`.

        ``data`` is accepted (and ignored) so that the signature matches
        ``DirContainer.write(path, data)``; without it a normal
        ``write(path, data)`` call failed with TypeError instead of the
        intended OEBError.
        """
        raise OEBError('Attempt to write to NullContainer')

    def exists(self, path):
        """Return False: nothing exists in an empty container."""
        return False

    def namelist(self):
        """Return an empty list of names."""
        return []
class DirContainer(object):
    """Filesystem directory container.

    Paths handed to :meth:`read`, :meth:`write` and :meth:`exists` are URL
    quoted and are resolved relative to ``rootdir``.
    """

    def __init__(self, path, log, ignore_opf=False):
        """
        :param path: Either a directory, or the path to an ``.opf`` file, in
            which case its directory becomes the root and its file name the
            OPF name.
        :param log: Stored as ``self.log``.
        :param ignore_opf: When true and ``path`` is a directory, the
            directory is not searched for an ``.opf`` file.
        """
        self.log = log
        if isbytestring(path):
            path = path.decode(filesystem_encoding)
        self.opfname = None
        ext = os.path.splitext(path)[1].lower()
        if ext == '.opf':
            self.opfname = os.path.basename(path)
            self.rootdir = os.path.dirname(path)
            return
        self.rootdir = path
        if not ignore_opf:
            # Use the first .opf file found anywhere below the root
            for name in self.namelist():
                ext = os.path.splitext(name)[1].lower()
                if ext == '.opf':
                    self.opfname = name
                    return

    def _unquote(self, path):
        # urlunquote must run on a bytestring and will return a bytestring
        # If it runs on a unicode object, it returns a double encoded unicode
        # string: unquote(u'%C3%A4') != unquote(b'%C3%A4').decode('utf-8')
        # and the latter is correct
        if isinstance(path, unicode):
            path = path.encode('utf-8')
        return urlunquote(path).decode('utf-8')

    def read(self, path):
        """Return the bytes of ``path``; None selects the OPF file."""
        if path is None:
            path = self.opfname
        path = os.path.join(self.rootdir, self._unquote(path))
        with open(path, 'rb') as f:
            return f.read()

    def write(self, path, data):
        """Write ``data`` to ``path``, creating parent directories."""
        path = os.path.join(self.rootdir, self._unquote(path))
        dirname = os.path.dirname(path)
        # dirname is '' when both rootdir and path have no directory
        # component; os.makedirs('') would raise in that case.
        if dirname and not os.path.isdir(dirname):
            os.makedirs(dirname)
        with open(path, 'wb') as f:
            return f.write(data)

    def exists(self, path):
        """Return True if ``path`` names an existing regular file."""
        if not path:
            return False
        try:
            path = os.path.join(self.rootdir, self._unquote(path))
        except ValueError: #Happens if path contains quoted special chars
            return False
        try:
            return os.path.isfile(path)
        except UnicodeEncodeError:
            # On linux, if LANG is unset, the os.stat call tries to encode the
            # unicode path using ASCII
            # To replicate try:
            # LANG=en_US.ASCII python -c "import os; os.stat(u'Espa\xf1a')"
            return os.path.isfile(path.encode(filesystem_encoding))

    def namelist(self):
        """Return the paths of all files below the root directory.

        Each path is unicode, includes the root directory and uses forward
        slashes. Names that cannot be decoded with the filesystem encoding
        are skipped.
        """
        names = []
        base = self.rootdir
        if isinstance(base, unicode):
            base = base.encode(filesystem_encoding)
        for root, dirs, files in os.walk(base):
            for fname in files:
                fname = os.path.join(root, fname)
                fname = fname.replace('\\', '/')
                if not isinstance(fname, unicode):
                    try:
                        fname = fname.decode(filesystem_encoding)
                    except (UnicodeError, LookupError):
                        # Undecodable file name: best effort, skip it
                        continue
                names.append(fname)
        return names
class Metadata(object):
    """A collection of OEB data model metadata.

    Provides access to the list of items associated with a particular metadata
    term via the term's local name using either Python container or attribute
    syntax. Return an empty list for any terms with no currently associated
    metadata items.
    """

    DC_TERMS = set(['contributor', 'coverage', 'creator', 'date',
                    'description', 'format', 'identifier', 'language',
                    'publisher', 'relation', 'rights', 'source',
                    'subject', 'title', 'type'])
    CALIBRE_TERMS = set(['series', 'series_index', 'rating', 'timestamp',
                         'publication_type', 'title_sort'])
    # Short attribute names accepted by Item() and their qualified forms
    OPF_ATTRS = {'role': OPF('role'), 'file-as': OPF('file-as'),
                 'scheme': OPF('scheme'), 'event': OPF('event'),
                 'type': XSI('type'), 'lang': XML('lang'), 'id': 'id'}
    OPF1_NSMAP = {'dc': DC11_NS, 'oebpackage': OPF1_NS}
    OPF2_NSMAP = {'opf': OPF2_NS, 'dc': DC11_NS, 'dcterms': DCTERMS_NS,
                  'xsi': XSI_NS, 'calibre': CALIBRE_NS}

    class Item(object):
        """An item of OEB data model metadata.

        The metadata term or name may be accessed via the :attr:`term` or
        :attr:`name` attributes. The metadata value or content may be accessed
        via the :attr:`value` or :attr:`content` attributes, or via Unicode or
        string representations of the object.

        OEB data model metadata attributes may be accessed either via their
        fully-qualified names using the Python container access syntax, or via
        their local names using Python attribute syntax. Only attributes
        allowed by the OPF 2.0 specification are supported.
        """

        class Attribute(object):
            """Smart accessor for allowed OEB metadata item attributes."""

            def __init__(self, attr, allowed=None):
                # ``attr`` is either the attribute name itself or a callable
                # mapping the item's term to the attribute name.
                if not callable(attr):
                    attr_, attr = attr, lambda term: attr_
                self.attr = attr
                self.allowed = allowed

            def term_attr(self, obj):
                """Return the attribute name to use for item ``obj``.

                Raises AttributeError if the attribute is not allowed for
                the item's term.
                """
                term = obj.term
                if namespace(term) != DC11_NS:
                    # Everything outside Dublin Core is an opf:meta element
                    term = OPF('meta')
                allowed = self.allowed
                if allowed is not None and term not in allowed:
                    raise AttributeError(
                        'attribute %r not valid for metadata term %r' \
                            % (self.attr(term), barename(obj.term)))
                return self.attr(term)

            def __get__(self, obj, cls):
                if obj is None: return None
                return obj.attrib.get(self.term_attr(obj), '')

            def __set__(self, obj, value):
                obj.attrib[self.term_attr(obj)] = value

        def __init__(self, term, value, attrib={}, nsmap={}, **kwargs):
            # The defaults are never mutated: both mappings are copied.
            self.attrib = attrib = dict(attrib)
            self.nsmap = nsmap = dict(nsmap)
            attrib.update(kwargs)
            if namespace(term) == OPF2_NS:
                term = barename(term)
            ns = namespace(term)
            local = barename(term).lower()
            if local in Metadata.DC_TERMS and (not ns or ns in DC_NSES):
                # Anything looking like Dublin Core is coerced
                term = DC(local)
            elif local in Metadata.CALIBRE_TERMS and ns in (CALIBRE_NS, ''):
                # Ditto for Calibre-specific metadata
                term = CALIBRE(local)
            self.term = term
            self.value = value
            # Iterate over a snapshot, as attrib is modified in the loop
            for attr, value in list(attrib.items()):
                if isprefixname(value):
                    attrib[attr] = qname(value, nsmap)
                nsattr = Metadata.OPF_ATTRS.get(attr, attr)
                if nsattr == OPF('scheme') and namespace(term) != DC11_NS:
                    # The opf:meta element takes @scheme, not @opf:scheme
                    nsattr = 'scheme'
                if attr != nsattr:
                    attrib[nsattr] = attrib.pop(attr)

        @dynamic_property
        def name(self):
            def fget(self):
                return self.term
            return property(fget=fget)

        @dynamic_property
        def content(self):
            def fget(self):
                return self.value
            def fset(self, value):
                self.value = value
            return property(fget=fget, fset=fset)

        scheme = Attribute(lambda term: 'scheme' if \
                               term == OPF('meta') else OPF('scheme'),
                           [DC('identifier'), OPF('meta')])
        file_as = Attribute(OPF('file-as'), [DC('creator'), DC('contributor'),
                                             DC('title')])
        role = Attribute(OPF('role'), [DC('creator'), DC('contributor')])
        event = Attribute(OPF('event'), [DC('date')])
        id = Attribute('id')
        type = Attribute(XSI('type'), [DC('date'), DC('format'),
                                       DC('type')])
        lang = Attribute(XML('lang'), [DC('contributor'), DC('coverage'),
                                       DC('creator'), DC('publisher'),
                                       DC('relation'), DC('rights'),
                                       DC('source'), DC('subject'),
                                       OPF('meta')])

        def __getitem__(self, key):
            return self.attrib[key]

        def __setitem__(self, key, value):
            self.attrib[key] = value

        def __contains__(self, key):
            return key in self.attrib

        def get(self, key, default=None):
            return self.attrib.get(key, default)

        def __repr__(self):
            return 'Item(term=%r, value=%r, attrib=%r)' \
                % (barename(self.term), self.value, self.attrib)

        def __str__(self):
            return unicode(self.value).encode('ascii', 'xmlcharrefreplace')

        def __unicode__(self):
            return as_unicode(self.value)

        def to_opf1(self, dcmeta=None, xmeta=None, nsrmap={}):
            """Create the OPF 1.x element for this item.

            Dublin Core items are added to ``dcmeta``; everything else
            becomes a ``meta`` element under ``xmeta``.
            """
            attrib = {}
            for key, value in self.attrib.items():
                if namespace(key) == OPF2_NS:
                    key = barename(key)
                attrib[key] = prefixname(value, nsrmap)
            if namespace(self.term) == DC11_NS:
                name = DC(icu_title(barename(self.term)))
                elem = element(dcmeta, name, attrib=attrib)
                elem.text = self.value
            else:
                elem = element(xmeta, 'meta', attrib=attrib)
                elem.attrib['name'] = prefixname(self.term, nsrmap)
                elem.attrib['content'] = prefixname(self.value, nsrmap)
            return elem

        def to_opf2(self, parent=None, nsrmap={}):
            """Create the OPF 2.0 element for this item under ``parent``."""
            attrib = {}
            for key, value in self.attrib.items():
                attrib[key] = prefixname(value, nsrmap)
            if namespace(self.term) == DC11_NS:
                elem = element(parent, self.term, attrib=attrib)
                try:
                    elem.text = self.value
                except Exception:
                    # The value could not be stored as element text; fall
                    # back to its repr. (Was a bare except, which also
                    # swallowed KeyboardInterrupt/SystemExit.)
                    elem.text = repr(self.value)
            else:
                elem = element(parent, OPF('meta'), attrib=attrib)
                elem.attrib['name'] = prefixname(self.term, nsrmap)
                elem.attrib['content'] = prefixname(self.value, nsrmap)
            return elem

    def __init__(self, oeb):
        self.oeb = oeb
        # Local term name -> list of Item objects
        self.items = defaultdict(list)

    def add(self, term, value, attrib={}, nsmap={}, **kwargs):
        """Add a new metadata item."""
        item = self.Item(term, value, attrib, nsmap, **kwargs)
        items = self.items[barename(item.term)]
        items.append(item)
        return item

    def iterkeys(self):
        for key in self.items:
            yield key
    __iter__ = iterkeys

    def clear(self, key):
        """Remove every item stored under ``key``, in place."""
        l = self.items[key]
        for x in list(l):
            l.remove(x)

    def filter(self, key, predicate):
        """Remove the items under ``key`` for which ``predicate`` is true."""
        l = self.items[key]
        for x in list(l):
            if predicate(x):
                l.remove(x)

    def __getitem__(self, key):
        return self.items[key]

    def __contains__(self, key):
        return key in self.items

    def __getattr__(self, term):
        # Only reached when normal attribute lookup fails. Looking up
        # 'items' itself here means __init__ has not run (e.g. while the
        # object is being copied or unpickled) and would recurse forever;
        # special method probes such as __deepcopy__ must not be answered
        # with (and recorded as) an empty metadata list.
        if term == 'items' or (term.startswith('__') and term.endswith('__')):
            raise AttributeError(term)
        return self.items[term]

    @dynamic_property
    def _nsmap(self):
        def fget(self):
            nsmap = {}
            for term in self.items:
                for item in self.items[term]:
                    nsmap.update(item.nsmap)
            return nsmap
        return property(fget=fget)

    @dynamic_property
    def _opf1_nsmap(self):
        def fget(self):
            nsmap = self._nsmap
            # Snapshot the items: entries are deleted during iteration
            for key, value in list(nsmap.items()):
                if value in OPF_NSES or value in DC_NSES:
                    del nsmap[key]
            return nsmap
        return property(fget=fget)

    @dynamic_property
    def _opf2_nsmap(self):
        def fget(self):
            nsmap = self._nsmap
            nsmap.update(OPF2_NSMAP)
            return nsmap
        return property(fget=fget)

    def to_opf1(self, parent=None):
        """Build the OPF 1.x ``metadata`` element for all items."""
        nsmap = self._opf1_nsmap
        nsrmap = dict((value, key) for key, value in nsmap.items())
        elem = element(parent, 'metadata', nsmap=nsmap)
        dcmeta = element(elem, 'dc-metadata', nsmap=OPF1_NSMAP)
        xmeta = element(elem, 'x-metadata')
        for term in self.items:
            for item in self.items[term]:
                item.to_opf1(dcmeta, xmeta, nsrmap=nsrmap)
        if 'ms-chaptertour' not in self.items:
            chaptertour = self.Item('ms-chaptertour', 'chaptertour')
            chaptertour.to_opf1(dcmeta, xmeta, nsrmap=nsrmap)
        return elem

    def to_opf2(self, parent=None):
        """Build the OPF 2.0 ``metadata`` element for all items."""
        nsmap = self._opf2_nsmap
        nsrmap = dict((value, key) for key, value in nsmap.items())
        elem = element(parent, OPF('metadata'), nsmap=nsmap)
        for term in self.items:
            for item in self.items[term]:
                item.to_opf2(elem, nsrmap=nsrmap)
        return elem
class Manifest(object):
"""Collection of files composing an OEB data model book.
Provides access to the content of the files composing the book and
attributes associated with those files, including their internal paths,
unique identifiers, and MIME types.
Itself acts as a :class:`set` of manifest items, and provides the following
instance data member for dictionary-like access:
:attr:`ids`: A dictionary in which the keys are the unique identifiers of
the manifest items and the values are the items themselves.
:attr:`hrefs`: A dictionary in which the keys are the internal paths of the
manifest items and the values are the items themselves.
"""
class Item(object):
"""An OEB data model book content file.
Provides the following data members for accessing the file content and
metadata associated with this particular file.
:attr:`id`: Unique identifier.
:attr:`href`: Book-internal path.
:attr:`media_type`: MIME type of the file content.
:attr:`fallback`: Unique id of any fallback manifest item associated
with this manifest item.
:attr:`spine_position`: Display/reading order index for book textual
content. `None` for manifest items which are not part of the
book's textual content.
:attr:`linear`: `True` for textual content items which are part of the
primary linear reading order and `False` for textual content items
which are not (such as footnotes). Meaningless for items which
have a :attr:`spine_position` of `None`.
"""
NUM_RE = re.compile('^(.*)([0-9][0-9.]*)(?=[.]|$)')
def __init__(self, oeb, id, href, media_type,
fallback=None, loader=str, data=None):
if href:
href = unicode(href)
self.oeb = oeb
self.id = id
self.href = self.path = urlnormalize(href)
self.media_type = media_type
self.fallback = fallback
self.override_css_fetch = None
self.spine_position = None
self.linear = True
if loader is None and data is None:
loader = oeb.container.read
self._loader = loader
self._data = data
def __repr__(self):
return u'Item(id=%r, href=%r, media_type=%r)' \
% (self.id, self.href, self.media_type)
# Parsing {{{
def _parse_xml(self, data):
data = xml_to_unicode(data, strip_encoding_pats=True,
assume_utf8=True, resolve_entities=True)[0]
if not data:
return None
return etree.fromstring(data, parser=RECOVER_PARSER)
def _parse_xhtml(self, data):
orig_data = data
fname = urlunquote(self.href)
self.oeb.log.debug('Parsing', fname, '...')
try:
data = parse_html(data, log=self.oeb.log,
decoder=self.oeb.decode,
preprocessor=self.oeb.html_preprocessor,
filename=fname, non_html_file_tags={'ncx'})
except NotHTML:
return self._parse_xml(orig_data)
return data
def _parse_txt(self, data):
if '<html>' in data:
return self._parse_xhtml(data)
self.oeb.log.debug('Converting', self.href, '...')
from calibre.ebooks.txt.processor import convert_markdown
title = self.oeb.metadata.title
if title:
title = unicode(title[0])
else:
title = _('Unknown')
return self._parse_xhtml(convert_markdown(data, title=title))
def _parse_css(self, data):
from cssutils import CSSParser, log, resolveImports
log.setLevel(logging.WARN)
log.raiseExceptions = False
self.oeb.log.debug('Parsing', self.href, '...')
data = self.oeb.decode(data)
data = self.oeb.css_preprocessor(data, add_namespace=True)
parser = CSSParser(loglevel=logging.WARNING,
fetcher=self.override_css_fetch or self._fetch_css,
log=_css_logger)
data = parser.parseString(data, href=self.href, validate=False)
data = resolveImports(data)
data.namespaces['h'] = XHTML_NS
return data
def _fetch_css(self, path):
hrefs = self.oeb.manifest.hrefs
if path not in hrefs:
self.oeb.logger.warn('CSS import of missing file %r' % path)
return (None, None)
item = hrefs[path]
if item.media_type not in OEB_STYLES:
self.oeb.logger.warn('CSS import of non-CSS file %r' % path)
return (None, None)
data = item.data.cssText
enc = None if isinstance(data, unicode) else 'utf-8'
return (enc, data)
# }}}
@dynamic_property
def data(self):
doc = """Provides MIME type sensitive access to the manifest
entry's associated content.
- XHTML, HTML, and variant content is parsed as necessary to
convert and and return as an lxml.etree element in the XHTML
namespace.
- XML content is parsed and returned as an lxml.etree element.
- CSS and CSS-variant content is parsed and returned as a cssutils
CSS DOM stylesheet.
- All other content is returned as a :class:`str` object with no
special parsing.
"""
def fget(self):
data = self._data
if data is None:
if self._loader is None:
return None
data = self._loader(getattr(self, 'html_input_href',
self.href))
if not isinstance(data, basestring):
pass # already parsed
elif self.media_type.lower() in OEB_DOCS:
data = self._parse_xhtml(data)
elif self.media_type.lower()[-4:] in ('+xml', '/xml'):
data = self._parse_xml(data)
elif self.media_type.lower() in OEB_STYLES:
data = self._parse_css(data)
elif self.media_type.lower() == 'text/plain':
self.oeb.log.warn('%s contains data in TXT format'%self.href,
'converting to HTML')
data = self._parse_txt(data)
self.media_type = XHTML_MIME
self._data = data
return data
def fset(self, value):
self._data = value
def fdel(self):
self._data = None
return property(fget, fset, fdel, doc=doc)
def unload_data_from_memory(self, memory=None):
if isinstance(self._data, (str, bytes)):
if memory is None:
from calibre.ptempfile import PersistentTemporaryFile
pt = PersistentTemporaryFile(suffix='_oeb_base_mem_unloader.img')
with pt:
pt.write(self._data)
self.oeb._temp_files.append(pt.name)
def loader(*args):
with open(pt.name, 'rb') as f:
ans = f.read()
os.remove(pt.name)
return ans
self._loader = loader
else:
def loader2(*args):
with open(memory, 'rb') as f:
ans = f.read()
return ans
self._loader = loader2
self._data = None
def __str__(self):
return serialize(self.data, self.media_type, pretty_print=self.oeb.pretty_print)
def __unicode__(self):
data = self.data
if isinstance(data, etree._Element):
return xml2unicode(data, pretty_print=self.oeb.pretty_print)
if isinstance(data, unicode):
return data
if hasattr(data, 'cssText'):
return data.cssText
return unicode(data)
def __eq__(self, other):
return id(self) == id(other)
def __ne__(self, other):
return not self.__eq__(other)
def __cmp__(self, other):
result = cmp(self.spine_position, other.spine_position)
if result != 0:
return result
smatch = self.NUM_RE.search(self.href)
sref = smatch.group(1) if smatch else self.href
snum = float(smatch.group(2)) if smatch else 0.0
skey = (sref, snum, self.id)
omatch = self.NUM_RE.search(other.href)
oref = omatch.group(1) if omatch else other.href
onum = float(omatch.group(2)) if omatch else 0.0
okey = (oref, onum, other.id)
return cmp(skey, okey)
def relhref(self, href):
"""Convert the URL provided in :param:`href` from a book-absolute
reference to a reference relative to this manifest item.
"""
if urlparse(href).scheme:
return href
if '/' not in self.href:
return href
base = os.path.dirname(self.href).split('/')
target, frag = urldefrag(href)
target = target.split('/')
for index in xrange(min(len(base), len(target))):
if base[index] != target[index]: break
else:
index += 1
relhref = (['..'] * (len(base) - index)) + target[index:]
relhref = '/'.join(relhref)
if frag:
relhref = '#'.join((relhref, frag))
return relhref
def abshref(self, href):
"""Convert the URL provided in :param:`href` from a reference
relative to this manifest item to a book-absolute reference.
"""
purl = urlparse(href)
scheme = purl.scheme
if scheme and scheme != 'file':
return href
purl = list(purl)
purl[0] = ''
href = urlunparse(purl)
path, frag = urldefrag(href)
if not path:
if frag:
return '#'.join((self.href, frag))
else:
return self.href
if '/' not in self.href:
return href
dirname = os.path.dirname(self.href)
href = os.path.join(dirname, href)
href = os.path.normpath(href).replace('\\', '/')
return href
def __init__(self, oeb):
self.oeb = oeb
self.items = set()
self.ids = {}
self.hrefs = {}
def add(self, id, href, media_type, fallback=None, loader=None, data=None):
"""Add a new item to the book manifest.
The item's :param:`id`, :param:`href`, and :param:`media_type` are all
required. A :param:`fallback` item-id is required for any items with a
MIME type which is not one of the OPS core media types. Either the
item's data itself may be provided with :param:`data`, or a loader
function for the data may be provided with :param:`loader`, or the
item's data may later be set manually via the :attr:`data` attribute.
"""
item = self.Item(
self.oeb, id, href, media_type, fallback, loader, data)
self.items.add(item)
self.ids[item.id] = item
self.hrefs[item.href] = item
return item
def remove(self, item):
"""Removes :param:`item` from the manifest."""
if item in self.ids:
item = self.ids[item]
del self.ids[item.id]
if item.href in self.hrefs:
del self.hrefs[item.href]
self.items.remove(item)
if item in self.oeb.spine:
self.oeb.spine.remove(item)
def remove_duplicate_item(self, item):
if item in self.ids:
item = self.ids[item]
del self.ids[item.id]
self.items.remove(item)
def generate(self, id=None, href=None):
"""Generate a new unique identifier and/or internal path for use in
creating a new manifest item, using the provided :param:`id` and/or
:param:`href` as bases.
Returns an two-tuple of the new id and path. If either :param:`id` or
:param:`href` are `None` then the corresponding item in the return
tuple will also be `None`.
"""
if id is not None:
base = id
index = 1
while id in self.ids:
id = base + str(index)
index += 1
if href is not None:
href = urlnormalize(href)
base, ext = os.path.splitext(href)
index = 1
lhrefs = set([x.lower() for x in self.hrefs])
while href.lower() in lhrefs:
href = base + str(index) + ext
index += 1
return id, unicode(href)
def __iter__(self):
    """Yield every item currently in the manifest."""
    for entry in self.items:
        yield entry

def __len__(self):
    """Return the number of items in the manifest."""
    return len(self.items)

def values(self):
    """Return a new list holding every item in the manifest."""
    return list(self.items)

def __contains__(self, item):
    """Return True if :param:`item` is one of the manifest's items."""
    return item in self.items
def to_opf1(self, parent=None):
    """Serialize the manifest as an OPF 1.x ``manifest`` element created
    under :param:`parent`, and return that element.

    Media types found in OEB_DOCS / OEB_STYLES are written as the OEB
    document / OEB CSS MIME types; any other media type is written as is.
    """
    manifest_elem = element(parent, 'manifest')
    for item in self.items:
        declared = item.media_type
        if declared in OEB_DOCS:
            mime = OEB_DOC_MIME
        elif declared in OEB_STYLES:
            mime = OEB_CSS_MIME
        else:
            mime = declared
        attrib = {
            'id': item.id,
            'href': urlunquote(item.href),
            'media-type': mime,
        }
        if item.fallback:
            attrib['fallback'] = item.fallback
        element(manifest_elem, 'item', attrib=attrib)
    return manifest_elem
def to_opf2(self, parent=None):
    """Serialize the manifest as an OPF 2.0 ``manifest`` element created
    under :param:`parent`, and return that element.

    Items are written in href order. Media types found in OEB_DOCS /
    OEB_STYLES are written as the XHTML / CSS MIME types; any other media
    type is written as is.
    """
    manifest_elem = element(parent, OPF('manifest'))
    for item in sorted(self.items, key=lambda entry: entry.href):
        declared = item.media_type
        if declared in OEB_DOCS:
            mime = XHTML_MIME
        elif declared in OEB_STYLES:
            mime = CSS_MIME
        else:
            mime = declared
        attrib = {
            'id': item.id,
            'href': urlunquote(item.href),
            'media-type': mime,
        }
        if item.fallback:
            attrib['fallback'] = item.fallback
        element(manifest_elem, OPF('item'), attrib=attrib)
    return manifest_elem
@dynamic_property
def main_stylesheet(self):
    # Getter: an explicitly assigned stylesheet wins; otherwise the first
    # manifest item whose media type is in OEB_STYLES is returned (None if
    # there is no such item). The fallback result is not cached.
    def fget(self):
        assigned = getattr(self, '_main_stylesheet', None)
        if assigned is not None:
            return assigned
        for item in self:
            if item.media_type.lower() in OEB_STYLES:
                return item
        return None

    # Setter: remember the item chosen by the caller.
    def fset(self, item):
        self._main_stylesheet = item

    return property(fget=fget, fset=fset)
class Spine(object):
    """Collection of manifest items composing an OEB data model book's main
    textual content.

    The spine manages which manifest items compose the book's main textual
    content and the sequence in which they appear. Provides Python container
    access as a list-like object. Every item in the spine carries a
    ``spine_position`` attribute equal to its index in the spine and a
    ``linear`` attribute.
    """

    def __init__(self, oeb):
        self.oeb = oeb
        self.items = []

    def _linear(self, linear):
        """Normalize an OPF ``linear`` value.

        `None`, ``'yes'`` and ``'true'`` (any case) map to `True`; ``'no'``
        and ``'false'`` (any case) map to `False`. Any other value is
        returned unchanged (strings lower-cased).
        """
        if linear is None:
            return True
        if isinstance(linear, bool):
            return linear
        if isinstance(linear, basestring):
            linear = linear.lower()
        if linear in ('yes', 'true'):
            return True
        if linear in ('no', 'false'):
            return False
        return linear

    def _renumber(self, start=0):
        """Reset ``spine_position`` of every item from index `start` on."""
        for offset, entry in enumerate(self.items[start:]):
            entry.spine_position = start + offset

    def add(self, item, linear=None):
        """Append :param:`item` to the end of the `Spine` and return it."""
        item.linear = self._linear(linear)
        item.spine_position = len(self.items)
        self.items.append(item)
        return item

    def insert(self, index, item, linear=None):
        """Insert :param:`item` at position :param:`index` in the `Spine`
        and return it.

        :param:`index` follows ``list.insert`` semantics: negative values
        count from the end and out-of-range values are clamped, so the
        recorded ``spine_position`` always matches the item's real slot.
        """
        item.linear = self._linear(linear)
        size = len(self.items)
        if index < 0:
            index = max(0, size + index)
        elif index > size:
            index = size
        self.items.insert(index, item)
        # The new item and everything after it have shifted.
        self._renumber(index)
        return item

    def remove(self, item):
        """Remove :param:`item` from the `Spine`.

        The item is located through its ``spine_position``, which is reset
        to `None` afterwards.
        """
        index = item.spine_position
        self.items.pop(index)
        self._renumber(index)
        item.spine_position = None

    def index(self, item):
        """Return the position of :param:`item`, or -1 if it is absent."""
        for i, x in enumerate(self):
            if item == x:
                return i
        return -1

    def __iter__(self):
        for item in self.items:
            yield item

    def __getitem__(self, index):
        return self.items[index]

    def __len__(self):
        return len(self.items)

    def __contains__(self, item):
        return (item in self.items)

    def to_opf1(self, parent=None):
        """Create an OPF 1.x ``spine`` element under :param:`parent`.

        Only linear items are written.
        """
        elem = element(parent, 'spine')
        for item in self.items:
            if item.linear:
                element(elem, 'itemref', attrib={'idref': item.id})
        return elem

    def to_opf2(self, parent=None):
        """Create an OPF 2.0 ``spine`` element under :param:`parent`.

        Every item is written; non-linear ones carry ``linear="no"``.
        """
        elem = element(parent, OPF('spine'))
        for item in self.items:
            attrib = {'idref': item.id}
            if not item.linear:
                attrib['linear'] = 'no'
            element(elem, OPF('itemref'), attrib=attrib)
        return elem
class Guide(object):
    """Collection of references to standard, frequently-occurring sections
    of an OEB data model book.

    Offers dictionary-like access: keys are OEB reference type identifiers
    and values are `Reference` objects.
    """

    class Reference(object):
        """Reference to a standard book section.

        Instance data members:

        :attr:`type`: Reference type identifier, as chosen from the list
            allowed in the OPF 2.0 specification.
        :attr:`title`: Human-readable section title.
        :attr:`href`: Book-internal URL of the referenced section. May
            include a fragment identifier.
        """
        _TYPES_TITLES = [('cover', __('Cover')),
                         ('title-page', __('Title Page')),
                         ('toc', __('Table of Contents')),
                         ('index', __('Index')),
                         ('glossary', __('Glossary')),
                         ('acknowledgements', __('Acknowledgements')),
                         ('bibliography', __('Bibliography')),
                         ('colophon', __('Colophon')),
                         ('copyright-page', __('Copyright')),
                         ('dedication', __('Dedication')),
                         ('epigraph', __('Epigraph')),
                         ('foreword', __('Foreword')),
                         ('loi', __('List of Illustrations')),
                         ('lot', __('List of Tables')),
                         ('notes', __('Notes')),
                         ('preface', __('Preface')),
                         ('text', __('Main Text'))]
        # Lookup tables derived from the list above: the known types, the
        # default title of each type, and each type's rank in the list.
        TYPES = set(kind for kind, _title in _TYPES_TITLES)
        TITLES = dict(_TYPES_TITLES)
        ORDER = dict((kind, rank)
                     for rank, (kind, _title) in enumerate(_TYPES_TITLES))

        def __init__(self, oeb, type, title, href):
            """Normalize :param:`type`, pick a default title if none was
            given, and normalize :param:`href`.

            A type matching a known type case-insensitively is lower-cased;
            any other type gets an ``other.`` prefix unless it already has
            one.
            """
            self.oeb = oeb
            lowered = type.lower()
            if lowered in self.TYPES:
                type = lowered
            elif not (type in self.TYPES or type.startswith('other.')):
                type = 'other.' + type
            if not title and type in self.TITLES:
                title = oeb.translate(self.TITLES[type])
            self.type = type
            self.title = title
            self.href = urlnormalize(href)

        def __repr__(self):
            return 'Reference(type=%r, title=%r, href=%r)' \
                % (self.type, self.title, self.href)

        @dynamic_property
        def _order(self):
            # Sort key: the rank of a known type, or the type string itself
            # for a type that has no rank.
            def fget(self):
                return self.ORDER.get(self.type, self.type)
            return property(fget=fget)

        def __cmp__(self, other):
            if not isinstance(other, Guide.Reference):
                return NotImplemented
            return cmp(self._order, other._order)

        @dynamic_property
        def item(self):
            doc = """The manifest item associated with this reference."""

            # Look the href up in the manifest with its fragment removed.
            def fget(self):
                target = urldefrag(self.href)[0]
                return self.oeb.manifest.hrefs.get(target, None)
            return property(fget=fget, doc=doc)

    def __init__(self, oeb):
        self.oeb = oeb
        self.refs = {}

    def add(self, type, title, href):
        """Add a new reference to the `Guide` and return it.

        The reference is stored under :param:`type` exactly as passed in,
        which can differ from the normalized ``type`` of the returned
        `Reference`.
        """
        if href:
            href = unicode(href)
        reference = self.Reference(self.oeb, type, title, href)
        self.refs[type] = reference
        return reference

    def remove(self, type):
        """Remove and return the reference stored under :param:`type`, or
        return `None` if there is none."""
        return self.refs.pop(type, None)

    def iterkeys(self):
        """Yield the keys of the stored references."""
        for key in self.refs:
            yield key
    __iter__ = iterkeys

    def values(self):
        """Return the stored references as a sorted list."""
        return sorted(self.refs.values())

    def items(self):
        """Yield ``(key, reference)`` pairs."""
        for key, reference in self.refs.items():
            yield key, reference

    def __getitem__(self, key):
        return self.refs[key]

    def __delitem__(self, key):
        del self.refs[key]

    def __contains__(self, key):
        return key in self.refs

    def __len__(self):
        return len(self.refs)

    def to_opf1(self, parent=None):
        """Create an OPF 1.x ``guide`` element under :param:`parent`."""
        guide_elem = element(parent, 'guide')
        for reference in self.refs.values():
            attrib = {'type': reference.type,
                      'href': urlunquote(reference.href)}
            if reference.title:
                attrib['title'] = reference.title
            element(guide_elem, 'reference', attrib=attrib)
        return guide_elem

    def to_opf2(self, parent=None):
        """Create an OPF 2.0 ``guide`` element under :param:`parent`."""
        guide_elem = element(parent, OPF('guide'))
        for reference in self.refs.values():
            attrib = {'type': reference.type,
                      'href': urlunquote(reference.href)}
            if reference.title:
                attrib['title'] = reference.title
            element(guide_elem, OPF('reference'), attrib=attrib)
        return guide_elem
class TOC(object):
    """Represents a hierarchical table of contents or navigation tree for
    accessing arbitrary semantic sections within an OEB data model book.

    Acts as a node within the navigation tree. Provides list-like access to
    sub-nodes. Provides the following node instance data attributes:

    :attr:`title`: The title of this navigation node.
    :attr:`href`: Book-internal URL referenced by this node.
    :attr:`klass`: Optional semantic class referenced by this node.
    :attr:`id`: Optional unique identifier for this node.
    :attr:`author`: Optional author attribution for periodicals <mbp:>
    :attr:`description`: Optional description attribute for periodicals <mbp:>
    :attr:`toc_thumbnail`: Optional toc thumbnail image
    """

    def __init__(self, title=None, href=None, klass=None, id=None,
            play_order=None, author=None, description=None,
            toc_thumbnail=None):
        self.title = title
        self.href = urlnormalize(href) if href else href
        self.klass = klass
        self.id = id
        self.nodes = []
        # play_order must exist before next_play_order() walks this node.
        self.play_order = 0
        if play_order is None:
            play_order = self.next_play_order()
        self.play_order = play_order
        self.author = author
        self.description = description
        self.toc_thumbnail = toc_thumbnail

    def add(self, title, href, klass=None, id=None, play_order=0,
            author=None, description=None, toc_thumbnail=None):
        """Create and return a new sub-node of this node."""
        node = TOC(title, href, klass, id, play_order, author, description,
                   toc_thumbnail)
        self.nodes.append(node)
        return node

    def remove(self, node):
        """Remove :param:`node` (matched by identity) from the tree below
        this node. Returns True if it was found and removed."""
        for child in self.nodes:
            if child is node:
                # Safe although we are iterating: we return immediately.
                self.nodes.remove(child)
                return True
            if child.remove(node):
                return True
        return False

    def iter(self):
        """Iterate over this node and all descendants in depth-first order."""
        yield self
        for child in self.nodes:
            for node in child.iter():
                yield node

    def count(self):
        """Return the number of descendants of this node."""
        return len(list(self.iter())) - 1

    def next_play_order(self):
        """Return one more than the largest play order in this subtree."""
        # iter() always yields at least this node, so max() has input.
        return max(x.play_order for x in self.iter()) + 1

    def has_href(self, href):
        """Return True if this node or a descendant has exactly this href."""
        for x in self.iter():
            if x.href == href:
                return True
        return False

    def has_text(self, text):
        """Return True if this node or a descendant has a title equal to
        :param:`text`, ignoring case."""
        for x in self.iter():
            if x.title and x.title.lower() == text.lower():
                return True
        return False

    def iterdescendants(self, breadth_first=False):
        """Iterate over all descendant nodes (this node excluded).

        The default order is depth-first. With :param:`breadth_first` the
        tree is walked level by level: all children, then all
        grandchildren, and so on.
        """
        if breadth_first:
            level = list(self.nodes)
            while level:
                for node in level:
                    yield node
                # Collect the next level only after the current one has
                # been consumed, so nodes added meanwhile are still seen.
                level = [child for node in level for child in node.nodes]
        else:
            for child in self.nodes:
                for node in child.iter():
                    yield node

    def __iter__(self):
        """Iterate over all immediate child nodes."""
        for node in self.nodes:
            yield node

    def __getitem__(self, index):
        return self.nodes[index]

    def autolayer(self):
        """Make sequences of children pointing to the same content file into
        children of the first node referencing that file.
        """
        prev = None
        for node in list(self.nodes):
            if prev and urldefrag(prev.href)[0] == urldefrag(node.href)[0]:
                self.nodes.remove(node)
                prev.nodes.append(node)
            else:
                prev = node

    def depth(self):
        """The maximum depth of the navigation tree rooted at this node."""
        try:
            return max(node.depth() for node in self.nodes) + 1
        except ValueError:
            # max() of an empty sequence: this node has no children.
            return 1

    def get_lines(self, lvl=0):
        """Return one text line per node of this subtree, each indented
        with one tab per level below :param:`lvl`."""
        ans = [(u'\t'*lvl) + u'TOC: %s --> %s'%(self.title, self.href)]
        for child in self:
            ans.extend(child.get_lines(lvl+1))
        return ans

    def __str__(self):
        return b'\n'.join([x.encode('utf-8') for x in self.get_lines()])

    def __unicode__(self):
        return u'\n'.join(self.get_lines())

    def to_opf1(self, tour):
        """Add one ``site`` element per descendant to :param:`tour`, in
        depth-first order (the hierarchy is flattened), and return it."""
        for node in self.nodes:
            element(tour, 'site', attrib={
                'title': node.title, 'href': urlunquote(node.href)})
            node.to_opf1(tour)
        return tour

    def to_ncx(self, parent=None):
        """Serialize the sub-nodes as nested NCX ``navPoint`` elements under
        :param:`parent` (a new ``navMap`` if omitted) and return it."""
        if parent is None:
            parent = etree.Element(NCX('navMap'))
        for node in self.nodes:
            id = node.id or uuid_id()
            po = node.play_order
            # A play order of 0 is written as 1.
            if po == 0:
                po = 1
            attrib = {'id': id, 'playOrder': str(po)}
            if node.klass:
                attrib['class'] = node.klass
            point = element(parent, NCX('navPoint'), attrib=attrib)
            label = etree.SubElement(point, NCX('navLabel'))
            title = node.title
            if title:
                title = re.sub(r'\s+', ' ', title)
            element(label, NCX('text')).text = title
            # Do not unescape this URL as ADE requires it to be escaped to
            # handle semi colons and other special characters in the file names
            element(point, NCX('content'), src=node.href)
            node.to_ncx(point)
        return parent

    def rationalize_play_orders(self):
        '''
        Ensure that all nodes with the same play_order have the same href and
        with different play_orders have different hrefs.
        '''
        # First node before `n` (depth-first) sharing n's play order.
        def po_node(n):
            for x in self.iter():
                if x is n:
                    return
                if x.play_order == n.play_order:
                    return x

        # First node before `n` (depth-first) sharing n's href.
        def href_node(n):
            for x in self.iter():
                if x is n:
                    return
                if x.href == n.href:
                    return x

        for x in self.iter():
            y = po_node(x)
            if y is not None:
                if x.href != y.href:
                    x.play_order = getattr(href_node(x), 'play_order',
                            self.next_play_order())
            y = href_node(x)
            if y is not None:
                x.play_order = y.play_order
class PageList(object):
    """Collection of named "pages" mapped to positions within an OEB data
    model book's textual content.

    Offers list-like access to the pages.
    """

    class Page(object):
        """A mapping between a page name and a position within the book
        content.

        Instance data attributes:

        :attr:`name`: The name of this page. Generally a number.
        :attr:`href`: Book-internal URL at which point this page begins.
        :attr:`type`: One of 'front' (for prefatory pages, as commonly
            labeled in print with small-case Roman numerals), 'normal' (for
            standard pages, as commonly labeled in print with Arabic
            numerals), or 'special' (for other pages, as commonly not
            labeled in any fashion in print, such as the cover and title
            pages). Any other value is stored as 'normal'.
        :attr:`klass`: Optional semantic class of this page.
        :attr:`id`: Optional unique identifier for this page.
        """
        TYPES = set(['front', 'normal', 'special'])

        def __init__(self, name, href, type='normal', klass=None, id=None):
            self.name = unicode(name)
            self.href = urlnormalize(href)
            if type not in self.TYPES:
                type = 'normal'
            self.type = type
            self.id = id
            self.klass = klass

    def __init__(self):
        self.pages = []

    def add(self, name, href, type='normal', klass=None, id=None):
        """Create a new page, append it to the `PageList` and return it."""
        new_page = self.Page(name, href, type, klass, id)
        self.pages.append(new_page)
        return new_page

    def __len__(self):
        return len(self.pages)

    def __iter__(self):
        for entry in self.pages:
            yield entry

    def __getitem__(self, index):
        return self.pages[index]

    def pop(self, index=-1):
        """Remove and return the page at :param:`index` (default: last)."""
        return self.pages.pop(index)

    def remove(self, page):
        """Remove :param:`page` from the list."""
        return self.pages.remove(page)

    def to_ncx(self, parent=None):
        """Create an NCX ``pageList`` element under :param:`parent` with one
        ``pageTarget`` per page, and return it.

        Each page's ``value`` is a running number counted separately for
        each page type, starting at 1.
        """
        plist = element(parent, NCX('pageList'), id=uuid_id())
        counters = dict((kind, count(1))
                        for kind in ('front', 'normal', 'special'))
        for page in self.pages:
            target_id = page.id or uuid_id()
            kind = page.type
            attrib = {'id': target_id,
                      'value': str(next(counters[kind])),
                      'type': kind,
                      'playOrder': '0'}
            if page.klass:
                attrib['class'] = page.klass
            ptarget = element(plist, NCX('pageTarget'), attrib=attrib)
            label = element(ptarget, NCX('navLabel'))
            element(label, NCX('text')).text = page.name
            element(ptarget, NCX('content'), src=page.href)
        return plist

    def to_page_map(self):
        """Return a new ``page-map`` element with one ``page`` element
        (name and href) per page."""
        pmap = etree.Element(OPF('page-map'), nsmap={None: OPF2_NS})
        for page in self.pages:
            element(pmap, OPF('page'), name=page.name, href=page.href)
        return pmap
class OEBBook(object):
    """Representation of a book in the IDPF OEB data model."""

    COVER_SVG_XP = XPath('h:body//svg:svg[position() = 1]')
    COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]')

    def __init__(self, logger,
            html_preprocessor,
            css_preprocessor=CSSPreProcessor(),
            encoding='utf-8', pretty_print=False,
            input_encoding='utf-8'):
        """Create an empty book.

        Arguments:

        :param logger: A Log object used for all messages related to the
            processing of this book. Available afterwards as both
            :attr:`logger` and :attr:`log`.
        :param html_preprocessor: A callable that takes a unicode object
            and returns a unicode object. Will be called on all html files
            before they are parsed.
        :param css_preprocessor: A callable that takes a unicode object
            and returns a unicode object. Will be called on all CSS files
            before they are parsed.
        :param encoding: Default encoding for textual content read from an
            external container.
        :param pretty_print: Whether or not the canonical string form of
            XML markup is pretty-printed.
        :param input_encoding: Encoding tried by :meth:`decode` for byte
            strings that could not be decoded via a byte order mark.

        Public instance data members giving access to the parts of the OEB
        data model:

        :attr:`metadata`: Metadata such as title, author name(s), etc.
        :attr:`manifest`: Manifest of all files included in the book,
            including MIME types and fallback information.
        :attr:`spine`: In-order list of manifest items which compose
            the textual content of the book.
        :attr:`guide`: Collection of references to standard positions
            within the text, such as the cover, preface, etc.
        :attr:`toc`: Hierarchical table of contents.
        :attr:`pages`: List of "pages," such as indexed to a print edition
            of the same text.
        """
        # The module-wide CSS log handler is pointed at this book's logger.
        _css_log_handler.log = logger

        # Plain configuration first ...
        self.encoding = encoding
        self.input_encoding = input_encoding
        self.html_preprocessor = html_preprocessor
        self.css_preprocessor = css_preprocessor
        self.pretty_print = pretty_print
        self.logger = logger
        self.log = logger
        self.version = '2.0'

        # ... then the (initially empty) parts of the data model.
        self.container = NullContainer(self.log)
        self.metadata = Metadata(self)
        self.uid = None
        self.manifest = Manifest(self)
        self.spine = Spine(self)
        self.guide = Guide(self)
        self.toc = TOC()
        self.pages = PageList()
        self.auto_generated_toc = True
        self._temp_files = []
def clean_temp_files(self):
    """Delete every file listed in ``self._temp_files``, best effort.

    A path that cannot be removed (for example because it no longer
    exists) is skipped silently so the remaining paths are still tried.
    """
    for path in self._temp_files:
        try:
            os.remove(path)
        except Exception:
            # Deliberately ignored: cleanup must not fail the caller.
            # Unlike a bare ``except:``, this still lets KeyboardInterrupt
            # and SystemExit propagate.
            pass
@classmethod
def generate(cls, opts, logger=None, html_preprocessor=None):
    """Generate an OEBBook instance from command-line options.

    :param opts: Object providing the ``encoding`` and ``pretty_print``
        attributes.
    :param logger: Log object passed to the constructor.
    :param html_preprocessor: Callable passed to the constructor.

    The constructor requires both a logger and an HTML preprocessor, so
    they must be supplied here as well; a `TypeError` is raised if either
    one is missing.
    """
    if logger is None or html_preprocessor is None:
        raise TypeError(
            'generate() requires both logger and html_preprocessor')
    return cls(logger, html_preprocessor,
               encoding=opts.encoding, pretty_print=opts.pretty_print)
def translate(self, text):
    """Translate :param:`text` into the book's primary language.

    The primary language is the first ``language`` entry of the book's
    metadata, reduced to the lower-cased part before the first hyphen.
    """
    primary = str(self.metadata.language[0])
    code = primary.split('-', 1)[0].lower()
    return translate(code, text)
def decode(self, data):
    """Automatically decode :param:`data` into a `unicode` object.

    Line endings are normalized to ``\\n`` in every case. `unicode` input
    is otherwise returned unchanged. For byte strings the following are
    tried in order, the first success winning:

    1. the encoding announced by a leading UTF-32/UTF-16/UTF-8 byte order
       mark (the mark itself is stripped and stays stripped afterwards);
    2. ``self.input_encoding``, if set, with undecodable bytes replaced;
    3. strict UTF-8;
    4. whatever `xml_to_unicode` returns.
    """
    def normalize_newlines(text):
        return text.replace('\r\n', '\n').replace('\r', '\n')

    if isinstance(data, unicode):
        return normalize_newlines(data)

    # The 4-byte marks come first: the UTF-32-LE mark starts with the
    # UTF-16-LE one.
    known_boms = (
        ('\0\0\xfe\xff', 'utf-32-be'),
        ('\xff\xfe\0\0', 'utf-32-le'),
        ('\xff\xfe', 'utf-16-le'),
        ('\xfe\xff', 'utf-16-be'),
        ('\xef\xbb\xbf', 'utf-8'),
    )
    for mark, bom_enc in known_boms:
        if data[:len(mark)] == mark:
            data = data[len(mark):]
            try:
                return normalize_newlines(data.decode(bom_enc))
            except UnicodeDecodeError:
                # Fall through to the other strategies.
                break

    if self.input_encoding:
        try:
            # With the 'replace' error handler undecodable bytes are
            # substituted rather than reported.
            return normalize_newlines(
                data.decode(self.input_encoding, 'replace'))
        except UnicodeDecodeError:
            pass

    try:
        return normalize_newlines(data.decode('utf-8'))
    except UnicodeDecodeError:
        pass

    data, _ignored = xml_to_unicode(data)
    return normalize_newlines(data)
def to_opf1(self):
    """Produce OPF 1.2 representing the book's metadata and structure.

    Returns a dictionary in which the keys are MIME types and the values
    are tuples of (default) filenames and lxml.etree element structures.
    """
    package = etree.Element(
        'package', attrib={'unique-identifier': self.uid.id})
    # Children are appended in this order: metadata, manifest, spine,
    # tours, guide.
    self.metadata.to_opf1(package)
    self.manifest.to_opf1(package)
    self.spine.to_opf1(package)
    tours = element(package, 'tours')
    tour_attrib = {'id': 'chaptertour', 'title': 'Chapter Tour'}
    chapter_tour = element(tours, 'tour', attrib=tour_attrib)
    self.toc.to_opf1(chapter_tour)
    self.guide.to_opf1(package)
    return {OPF_MIME: ('content.opf', package)}
def _update_playorder(self, ncx):
    """Rewrite the ``playOrder`` attributes of :param:`ncx` in place.

    Play orders are handed out, starting at 1, by walking the spine in
    order: first the item's own href, then every element inside its body
    that has an ``id`` or ``name``, counting only targets that the NCX
    actually references. An NCX entry whose target was not found this way
    gets its index in the list of matched NCX entries instead.
    """
    targets = set(map(urlnormalize, xpath(ncx, '//ncx:content/@src')))
    playorder = {}
    counter = 1
    anchored = XPath('h:body//*[@id or @name]')
    for item in self.spine:
        base = item.href
        if base in targets:
            playorder[base] = counter
            counter += 1
        for elem in anchored(item.data):
            matched = False
            for attr in ('id', 'name'):
                anchor = elem.get(attr)
                if not anchor:
                    continue
                href = '#'.join([base, anchor])
                if href in targets:
                    playorder[href] = counter
                    matched = True
            # One element uses up one play order, even if both its id and
            # its name are referenced.
            if matched:
                counter += 1
    content_src = XPath('ncx:content/@src')
    ordered = xpath(ncx, '//*[@playOrder and ./ncx:content[@src]]')
    for i, elem in enumerate(ordered):
        href = urlnormalize(content_src(elem)[0])
        elem.attrib['playOrder'] = str(playorder.get(href, i))
def _to_ncx(self):
    """Build and return the NCX document (an ``ncx`` element) of the book:
    head metadata, document title, navigation map and, if the book has
    pages, the page list."""
    lang = unicode(self.metadata.language[0]).replace('_', '-')
    ncx = etree.Element(NCX('ncx'),
        attrib={'version': '2005-1', XML('lang'): lang},
        nsmap={None: NCX_NS})

    head = etree.SubElement(ncx, NCX('head'))
    head_meta = [
        ('dtb:uid', unicode(self.uid)),
        ('dtb:depth', str(self.toc.depth())),
        ('dtb:generator', 'calibre (' + __version__ + ')'),
        ('dtb:totalPageCount', str(len(self.pages))),
    ]
    for meta_name, meta_content in head_meta:
        etree.SubElement(head, NCX('meta'),
            name=meta_name, content=meta_content)
    # Starts at '0'; replaced below once the page list has been written.
    maxpnum = etree.SubElement(head, NCX('meta'),
        name='dtb:maxPageNumber', content='0')

    doc_title = etree.SubElement(ncx, NCX('docTitle'))
    title_text = etree.SubElement(doc_title, NCX('text'))
    title_text.text = unicode(self.metadata.title[0])

    navmap = etree.SubElement(ncx, NCX('navMap'))
    self.toc.to_ncx(navmap)

    if len(self.pages) > 0:
        plist = self.pages.to_ncx(ncx)
        highest = max(int(x) for x in xpath(plist, '//@value'))
        maxpnum.attrib['content'] = str(highest)

    self._update_playorder(ncx)
    return ncx
def to_opf2(self, page_map=False):
    """Produce OPF 2.0 representing the book's metadata and structure.

    Returns a dictionary in which the keys are MIME types and the values
    are tuples of (default) filenames and lxml.etree element structures.
    Besides the OPF package it always holds the NCX and, when
    :param:`page_map` is true and the book has pages, the page map.
    """
    package = etree.Element(OPF('package'),
        attrib={'version': '2.0', 'unique-identifier': self.uid.id},
        nsmap={None: OPF2_NS})
    self.metadata.to_opf2(package)
    manifest = self.manifest.to_opf2(package)
    spine = self.spine.to_opf2(package)
    self.guide.to_opf2(package)
    results = {OPF_MIME: ('content.opf', package)}

    # The NCX is listed in the serialized manifest only; it is not added
    # to self.manifest.
    ncx_id, ncx_href = self.manifest.generate('ncx', 'toc.ncx')
    etree.SubElement(manifest, OPF('item'), id=ncx_id, href=ncx_href,
        attrib={'media-type': NCX_MIME})
    spine.attrib['toc'] = ncx_id
    results[NCX_MIME] = (ncx_href, self._to_ncx())

    if page_map and len(self.pages) > 0:
        map_id, map_href = self.manifest.generate(
            'page-map', 'page-map.xml')
        etree.SubElement(manifest, OPF('item'), id=map_id, href=map_href,
            attrib={'media-type': PAGE_MAP_MIME})
        spine.attrib['page-map'] = map_id
        results[PAGE_MAP_MIME] = (map_href, self.pages.to_page_map())
    return results