diff --git a/src/calibre/ebooks/lit/reader.py b/src/calibre/ebooks/lit/reader.py index 06f5ebdabb..7f7bafb99d 100644 --- a/src/calibre/ebooks/lit/reader.py +++ b/src/calibre/ebooks/lit/reader.py @@ -7,21 +7,25 @@ __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ' \ 'and Marshall T. Vandegrift ' -import sys, struct, cStringIO, os +import sys, struct, os import functools import re from urlparse import urldefrag +from cStringIO import StringIO from urllib import unquote as urlunquote from lxml import etree from calibre.ebooks.lit import LitError from calibre.ebooks.lit.maps import OPF_MAP, HTML_MAP import calibre.ebooks.lit.mssha1 as mssha1 from calibre.ebooks.oeb.base import urlnormalize +from calibre.ebooks.oeb.reader import OEBReader from calibre.ebooks import DRMError from calibre import plugins lzx, lxzerror = plugins['lzx'] msdes, msdeserror = plugins['msdes'] +__all__ = ["LitReader"] + XML_DECL = """ """ OPF_DECL = """ @@ -109,6 +113,9 @@ def consume_sized_utf8_string(bytes, zpad=False): pos += 1 return u''.join(result), bytes[pos:] +def encode(string): + return unicode(string).encode('ascii', 'xmlcharrefreplace') + class UnBinary(object): AMPERSAND_RE = re.compile( r'&(?!(?:#[0-9]+|#x[0-9a-fA-F]+|[a-zA-Z_:][a-zA-Z0-9.-_:]+);)') @@ -119,13 +126,13 @@ class UnBinary(object): def __init__(self, bin, path, manifest={}, map=HTML_MAP): self.manifest = manifest self.tag_map, self.attr_map, self.tag_to_attr_map = map - self.opf = map is OPF_MAP - self.bin = bin + self.is_html = map is HTML_MAP self.dir = os.path.dirname(path) - self.buf = cStringIO.StringIO() - self.binary_to_text() - self.raw = self.buf.getvalue().lstrip().decode('utf-8') + buf = StringIO() + self.binary_to_text(bin, buf) + self.raw = buf.getvalue().lstrip() self.escape_reserved() + self._tree = None def escape_reserved(self): raw = self.raw @@ -152,18 +159,20 @@ class UnBinary(object): return '/'.join(relpath) def __unicode__(self): + return self.raw.decode('utf-8') + + def __str__(self): 
return self.raw - - def binary_to_text(self, base=0, depth=0): + + def binary_to_text(self, bin, buf, index=0, depth=0): tag_name = current_map = None dynamic_tag = errors = 0 in_censorship = is_goingdown = False state = 'text' - index = base flags = 0 - while index < len(self.bin): - c, index = read_utf8_char(self.bin, index) + while index < len(bin): + c, index = read_utf8_char(bin, index) oc = ord(c) if state == 'text': @@ -176,7 +185,7 @@ class UnBinary(object): c = '>>' elif c == '<': c = '<<' - self.buf.write(c.encode('ascii', 'xmlcharrefreplace')) + buf.write(encode(c)) elif state == 'get flags': if oc == 0: @@ -189,7 +198,7 @@ class UnBinary(object): state = 'text' if oc == 0 else 'get attr' if flags & FLAG_OPENING: tag = oc - self.buf.write('<') + buf.write('<') if not (flags & FLAG_CLOSING): is_goingdown = True if tag == 0x8000: @@ -206,7 +215,7 @@ class UnBinary(object): tag_name = '?'+unichr(tag)+'?' current_map = self.tag_to_attr_map[tag] print 'WARNING: tag %s unknown' % unichr(tag) - self.buf.write(unicode(tag_name).encode('utf-8')) + buf.write(encode(tag_name)) elif flags & FLAG_CLOSING: if depth == 0: raise LitError('Extra closing tag') @@ -218,15 +227,14 @@ class UnBinary(object): if not is_goingdown: tag_name = None dynamic_tag = 0 - self.buf.write(' />') + buf.write(' />') else: - self.buf.write('>') - index = self.binary_to_text(base=index, depth=depth+1) + buf.write('>') + index = self.binary_to_text(bin, buf, index, depth+1) is_goingdown = False if not tag_name: raise LitError('Tag ends before it begins.') - self.buf.write(u''.join( - ('')).encode('utf-8')) + buf.write(encode(u''.join(('')))) dynamic_tag = 0 tag_name = None state = 'text' @@ -246,7 +254,7 @@ class UnBinary(object): in_censorship = True state = 'get value length' continue - self.buf.write(' ' + unicode(attr).encode('utf-8') + '=') + buf.write(' ' + encode(attr) + '=') if attr in ['href', 'src']: state = 'get href length' else: @@ -254,40 +262,39 @@ class UnBinary(object): elif 
state == 'get value length': if not in_censorship: - self.buf.write('"') + buf.write('"') count = oc - 1 if count == 0: if not in_censorship: - self.buf.write('"') + buf.write('"') in_censorship = False state = 'get attr' continue state = 'get value' if oc == 0xffff: continue - if count < 0 or count > (len(self.bin) - index): + if count < 0 or count > (len(bin) - index): raise LitError('Invalid character count %d' % count) elif state == 'get value': if count == 0xfffe: if not in_censorship: - self.buf.write('%s"' % (oc - 1)) + buf.write('%s"' % (oc - 1)) in_censorship = False state = 'get attr' elif count > 0: if not in_censorship: - self.buf.write(c.encode( - 'ascii', 'xmlcharrefreplace')) + buf.write(encode(c)) count -= 1 if count == 0: if not in_censorship: - self.buf.write('"') + buf.write('"') in_censorship = False state = 'get attr' elif state == 'get custom length': count = oc - 1 - if count <= 0 or count > len(self.bin)-index: + if count <= 0 or count > len(bin)-index: raise LitError('Invalid character count %d' % count) dynamic_tag += 1 state = 'get custom' @@ -297,26 +304,26 @@ class UnBinary(object): tag_name += c count -= 1 if count == 0: - self.buf.write(unicode(tag_name).encode('utf-8')) + buf.write(encode(tag_name)) state = 'get attr' elif state == 'get attr length': count = oc - 1 - if count <= 0 or count > (len(self.bin) - index): + if count <= 0 or count > (len(bin) - index): raise LitError('Invalid character count %d' % count) - self.buf.write(' ') + buf.write(' ') state = 'get custom attr' elif state == 'get custom attr': - self.buf.write(unicode(c).encode('utf-8')) + buf.write(encode(c)) count -= 1 if count == 0: - self.buf.write('=') + buf.write('=') state = 'get value length' elif state == 'get href length': count = oc - 1 - if count <= 0 or count > (len(self.bin) - index): + if count <= 0 or count > (len(bin) - index): raise LitError('Invalid character count %d' % count) href = '' state = 'get href' @@ -330,10 +337,11 @@ class 
UnBinary(object): if frag: path = '#'.join((path, frag)) path = urlnormalize(path) - self.buf.write((u'"%s"' % path).encode('utf-8')) + buf.write(encode(u'"%s"' % path)) state = 'get attr' return index + class DirectoryEntry(object): def __init__(self, name, section, offset, size): self.name = name @@ -348,6 +356,7 @@ class DirectoryEntry(object): def __str__(self): return repr(self) + class ManifestItem(object): def __init__(self, original, internal, mime_type, offset, root, state): self.original = original @@ -375,65 +384,87 @@ class ManifestItem(object): % (self.internal, self.path, self.mime_type, self.offset, self.root, self.state) + def preserve(function): def wrapper(self, *args, **kwargs): - opos = self._stream.tell() + opos = self.stream.tell() try: return function(self, *args, **kwargs) finally: - self._stream.seek(opos) + self.stream.seek(opos) functools.update_wrapper(wrapper, function) return wrapper -class LitReader(object): +class LitFile(object): PIECE_SIZE = 16 - XML_PARSER = etree.XMLParser( - recover=True, resolve_entities=False) + + def __init__(self, filename_or_stream): + if hasattr(filename_or_stream, 'read'): + self.stream = filename_or_stream + else: + self.stream = open(filename_or_stream, 'rb') + try: + self.opf_path = os.path.splitext( + os.path.basename(self.stream.name))[0] + '.opf' + except AttributeError: + self.opf_path = 'content.opf' + if self.magic != 'ITOLITLS': + raise LitError('Not a valid LIT file') + if self.version != 1: + raise LitError('Unknown LIT version %d' % (self.version,)) + self.read_secondary_header() + self.read_header_pieces() + self.read_section_names() + self.read_manifest() + self.read_drm() + + def warn(self, msg): + print "WARNING: %s" % (msg,) def magic(): @preserve def fget(self): - self._stream.seek(0) - return self._stream.read(8) + self.stream.seek(0) + return self.stream.read(8) return property(fget=fget) magic = magic() def version(): def fget(self): - self._stream.seek(8) - return 
u32(self._stream.read(4)) + self.stream.seek(8) + return u32(self.stream.read(4)) return property(fget=fget) version = version() def hdr_len(): @preserve def fget(self): - self._stream.seek(12) - return int32(self._stream.read(4)) + self.stream.seek(12) + return int32(self.stream.read(4)) return property(fget=fget) hdr_len = hdr_len() def num_pieces(): @preserve def fget(self): - self._stream.seek(16) - return int32(self._stream.read(4)) + self.stream.seek(16) + return int32(self.stream.read(4)) return property(fget=fget) num_pieces = num_pieces() def sec_hdr_len(): @preserve def fget(self): - self._stream.seek(20) - return int32(self._stream.read(4)) + self.stream.seek(20) + return int32(self.stream.read(4)) return property(fget=fget) sec_hdr_len = sec_hdr_len() def guid(): @preserve def fget(self): - self._stream.seek(24) - return self._stream.read(16) + self.stream.seek(24) + return self.stream.read(16) return property(fget=fget) guid = guid() @@ -443,44 +474,27 @@ class LitReader(object): size = self.hdr_len \ + (self.num_pieces * self.PIECE_SIZE) \ + self.sec_hdr_len - self._stream.seek(0) - return self._stream.read(size) + self.stream.seek(0) + return self.stream.read(size) return property(fget=fget) header = header() - def __init__(self, filename_or_stream): - if hasattr(filename_or_stream, 'read'): - self._stream = filename_or_stream - else: - self._stream = open(filename_or_stream, 'rb') - if self.magic != 'ITOLITLS': - raise LitError('Not a valid LIT file') - if self.version != 1: - raise LitError('Unknown LIT version %d' % (self.version,)) - self.entries = {} - self._read_secondary_header() - self._read_header_pieces() - self._read_section_names() - self._read_manifest() - self._read_meta() - self._read_drm() - @preserve def __len__(self): - self._stream.seek(0, 2) - return self._stream.tell() + self.stream.seek(0, 2) + return self.stream.tell() @preserve - def _read_raw(self, offset, size): - self._stream.seek(offset) - return self._stream.read(size) + 
def read_raw(self, offset, size): + self.stream.seek(offset) + return self.stream.read(size) - def _read_content(self, offset, size): - return self._read_raw(self.content_offset + offset, size) + def read_content(self, offset, size): + return self.read_raw(self.content_offset + offset, size) - def _read_secondary_header(self): + def read_secondary_header(self): offset = self.hdr_len + (self.num_pieces * self.PIECE_SIZE) - bytes = self._read_raw(offset, self.sec_hdr_len) + bytes = self.read_raw(offset, self.sec_hdr_len) offset = int32(bytes[4:]) while offset < len(bytes): blocktype = bytes[offset:offset+4] @@ -508,21 +522,21 @@ class LitReader(object): if not hasattr(self, 'content_offset'): raise LitError('Could not figure out the content offset') - def _read_header_pieces(self): + def read_header_pieces(self): src = self.header[self.hdr_len:] for i in xrange(self.num_pieces): piece = src[i * self.PIECE_SIZE:(i + 1) * self.PIECE_SIZE] if u32(piece[4:]) != 0 or u32(piece[12:]) != 0: raise LitError('Piece %s has 64bit value' % repr(piece)) offset, size = u32(piece), int32(piece[8:]) - piece = self._read_raw(offset, size) + piece = self.read_raw(offset, size) if i == 0: continue # Dont need this piece elif i == 1: if u32(piece[8:]) != self.entry_chunklen or \ u32(piece[12:]) != self.entry_unknown: raise LitError('Secondary header does not match piece') - self._read_directory(piece) + self.read_directory(piece) elif i == 2: if u32(piece[8:]) != self.count_chunklen or \ u32(piece[12:]) != self.count_unknown: @@ -533,12 +547,13 @@ class LitReader(object): elif i == 4: self.piece4_guid = piece - def _read_directory(self, piece): + def read_directory(self, piece): if not piece.startswith('IFCM'): raise LitError('Header piece #1 is not main directory.') chunk_size, num_chunks = int32(piece[8:12]), int32(piece[24:28]) if (32 + (num_chunks * chunk_size)) != len(piece): - raise LitError('IFCM HEADER has incorrect length') + raise LitError('IFCM header has incorrect length') + 
self.entries = {} for i in xrange(num_chunks): offset = 32 + (i * chunk_size) chunk = piece[offset:offset + chunk_size] @@ -572,17 +587,17 @@ class LitReader(object): entry = DirectoryEntry(name, section, offset, size) self.entries[name] = entry - def _read_section_names(self): + def read_section_names(self): if '::DataSpace/NameList' not in self.entries: raise LitError('Lit file does not have a valid NameList') raw = self.get_file('::DataSpace/NameList') if len(raw) < 4: raise LitError('Invalid Namelist section') pos = 4 - self.num_sections = u16(raw[2:pos]) - self.section_names = [""]*self.num_sections - self.section_data = [None]*self.num_sections - for section in xrange(self.num_sections): + num_sections = u16(raw[2:pos]) + self.section_names = [""] * num_sections + self.section_data = [None] * num_sections + for section in xrange(num_sections): size = u16(raw[pos:pos+2]) pos += 2 size = size*2 + 2 @@ -592,11 +607,12 @@ class LitReader(object): raw[pos:pos+size].decode('utf-16-le').rstrip('\000') pos += size - def _read_manifest(self): + def read_manifest(self): if '/manifest' not in self.entries: raise LitError('Lit file does not have a valid manifest') raw = self.get_file('/manifest') self.manifest = {} + self.paths = {self.opf_path: None} while raw: slen, raw = ord(raw[0]), raw[1:] if slen == 0: break @@ -637,28 +653,9 @@ class LitReader(object): for item in mlist: if item.path[0] == '/': item.path = os.path.basename(item.path) + self.paths[item.path] = item - def _pretty_print(self, xml): - f = cStringIO.StringIO(xml.encode('utf-8')) - doc = etree.parse(f, parser=self.XML_PARSER) - pretty = etree.tostring(doc, encoding='ascii', pretty_print=True) - return XML_DECL + unicode(pretty) - - def _read_meta(self): - path = 'content.opf' - raw = self.get_file('/meta') - xml = OPF_DECL - try: - xml += unicode(UnBinary(raw, path, self.manifest, OPF_MAP)) - except LitError: - if 'PENGUIN group' not in raw: raise - print "WARNING: attempting PENGUIN malformed OPF fix" 
- raw = raw.replace( - 'PENGUIN group', '\x00\x01\x18\x00PENGUIN group', 1) - xml += unicode(UnBinary(raw, path, self.manifest, OPF_MAP)) - self.meta = xml - - def _read_drm(self): + def read_drm(self): self.drmlevel = 0 if '/DRMStorage/Licenses/EUL' in self.entries: self.drmlevel = 5 @@ -669,7 +666,7 @@ class LitReader(object): else: return if self.drmlevel < 5: - msdes.deskey(self._calculate_deskey(), msdes.DE1) + msdes.deskey(self.calculate_deskey(), msdes.DE1) bookkey = msdes.des(self.get_file('/DRMStorage/DRMSealed')) if bookkey[0] != '\000': raise LitError('Unable to decrypt title key!') @@ -677,7 +674,7 @@ class LitReader(object): else: raise DRMError("Cannot access DRM-protected book") - def _calculate_deskey(self): + def calculate_deskey(self): hashfiles = ['/meta', '/DRMStorage/DRMSource'] if self.drmlevel == 3: hashfiles.append('/DRMStorage/DRMBookplate') @@ -701,18 +698,18 @@ class LitReader(object): def get_file(self, name): entry = self.entries[name] if entry.section == 0: - return self._read_content(entry.offset, entry.size) + return self.read_content(entry.offset, entry.size) section = self.get_section(entry.section) return section[entry.offset:entry.offset+entry.size] def get_section(self, section): data = self.section_data[section] if not data: - data = self._get_section(section) + data = self.get_section_uncached(section) self.section_data[section] = data return data - def _get_section(self, section): + def get_section_uncached(self, section): name = self.section_names[section] path = '::DataSpace/Storage/' + name transform = self.get_file(path + '/Transform/List') @@ -724,29 +721,29 @@ class LitReader(object): raise LitError("ControlData is too short") guid = msguid(transform) if guid == DESENCRYPT_GUID: - content = self._decrypt(content) + content = self.decrypt(content) control = control[csize:] elif guid == LZXCOMPRESS_GUID: reset_table = self.get_file( '/'.join(('::DataSpace/Storage', name, 'Transform', LZXCOMPRESS_GUID, 
'InstanceData/ResetTable'))) - content = self._decompress(content, control, reset_table) + content = self.decompress(content, control, reset_table) control = control[csize:] else: raise LitError("Unrecognized transform: %s." % repr(guid)) transform = transform[16:] return content - def _decrypt(self, content): + def decrypt(self, content): length = len(content) extra = length & 0x7 if extra > 0: - self._warn("content length not a multiple of block size") + self.warn("content length not a multiple of block size") content += "\0" * (8 - extra) msdes.deskey(self.bookkey, msdes.DE1) return msdes.des(content) - def _decompress(self, content, control, reset_table): + def decompress(self, content, control, reset_table): if len(control) < 32 or control[CONTROL_TAG:CONTROL_TAG+4] != "LZXC": raise LitError("Invalid ControlData tag value") if len(reset_table) < (RESET_INTERVAL + 8): @@ -787,7 +784,7 @@ class LitReader(object): result.append( lzx.decompress(content[base:size], window_bytes)) except lzx.LZXError: - self._warn("LZX decompression error; skipping chunk") + self.warn("LZX decompression error; skipping chunk") bytes_remaining -= window_bytes base = size accum += int32(reset_table[RESET_INTERVAL:]) @@ -797,80 +794,57 @@ class LitReader(object): try: result.append(lzx.decompress(content[base:], bytes_remaining)) except lzx.LZXError: - self._warn("LZX decompression error; skipping chunk") + self.warn("LZX decompression error; skipping chunk") bytes_remaining = 0 if bytes_remaining > 0: raise LitError("Failed to completely decompress section") return ''.join(result) - def get_entry_content(self, entry, pretty_print=False): - if 'spine' in entry.state: - name = '/'.join(('/data', entry.internal, 'content')) - path = entry.path - raw = self.get_file(name) - decl, map = (OPF_DECL, OPF_MAP) \ - if name == '/meta' else (HTML_DECL, HTML_MAP) - content = decl + unicode(UnBinary(raw, path, self.manifest, map)) - if pretty_print: - content = self._pretty_print(content) - content 
= content.encode('utf-8') + +class LitContainer(object): + """Simple Container-interface, read-only accessor for LIT files.""" + + def __init__(self, filename_or_stream): + self._litfile = LitFile(filename_or_stream) + + def namelist(self): + return self._litfile.paths.keys() + + def exists(self, name): + return urlunquote(name) in self._litfile.paths + + def read(self, name): + entry = self._litfile.paths[urlunquote(name)] if name else None + if entry is None: + content = OPF_DECL + self._read_meta() + elif 'spine' in entry.state: + internal = '/'.join(('/data', entry.internal, 'content')) + raw = self._litfile.get_file(internal) + unbin = UnBinary(raw, name, self._litfile.manifest, HTML_MAP) + content = HTML_DECL + str(unbin) else: - name = '/'.join(('/data', entry.internal)) - content = self.get_file(name) + internal = '/'.join(('/data', entry.internal)) + content = self._litfile.get_file(internal) return content - - def extract_content(self, output_dir=os.getcwdu(), pretty_print=False): - output_dir = os.path.abspath(output_dir) + + def _read_meta(self): + path = 'content.opf' + raw = self._litfile.get_file('/meta') try: - opf_path = os.path.splitext( - os.path.basename(self._stream.name))[0] + '.opf' - except AttributeError: - opf_path = 'content.opf' - opf_path = os.path.join(output_dir, opf_path) - self._ensure_dir(opf_path) - with open(opf_path, 'wb') as f: - xml = self.meta - if pretty_print: - xml = self._pretty_print(xml) - f.write(xml.encode('utf-8')) - for entry in self.manifest.values(): - path = os.path.join(output_dir, entry.path) - self._ensure_dir(path) - with open(path, 'wb') as f: - f.write(self.get_entry_content(entry, pretty_print)) + unbin = UnBinary(raw, path, self._litfile.manifest, OPF_MAP) + except LitError: + if 'PENGUIN group' not in raw: raise + print "WARNING: attempting PENGUIN malformed OPF fix" + raw = raw.replace( + 'PENGUIN group', '\x00\x01\x18\x00PENGUIN group', 1) + unbin = UnBinary(raw, path, self._litfile.manifest, OPF_MAP) 
+ return str(unbin) - def _ensure_dir(self, path): - dir = os.path.dirname(path) - if not os.path.isdir(dir): - os.makedirs(dir) - def _warn(self, msg): - print "WARNING: %s" % (msg,) +class LitReader(OEBReader): + Container = LitContainer + DEFAULT_PROFILE = 'MSReader' -def option_parser(): - from calibre.utils.config import OptionParser - parser = OptionParser(usage=_('%prog [options] LITFILE')) - parser.add_option( - '-o', '--output-dir', default='.', - help=_('Output directory. Defaults to current directory.')) - parser.add_option( - '-p', '--pretty-print', default=False, action='store_true', - help=_('Legibly format extracted markup. May modify meaningful whitespace.')) - parser.add_option( - '--verbose', default=False, action='store_true', - help=_('Useful for debugging.')) - return parser - -def main(args=sys.argv): - parser = option_parser() - opts, args = parser.parse_args(args) - if len(args) != 2: - parser.print_help() - return 1 - lr = LitReader(args[1]) - lr.extract_content(opts.output_dir, opts.pretty_print) - print _('OEB ebook created in'), opts.output_dir - return 0 try: import psyco @@ -878,6 +852,3 @@ try: psyco.bind(UnBinary.binary_to_text) except ImportError: pass - -if __name__ == '__main__': - sys.exit(main()) diff --git a/src/calibre/ebooks/lit/writer.py b/src/calibre/ebooks/lit/writer.py index 4a059b6433..bebba8938b 100644 --- a/src/calibre/ebooks/lit/writer.py +++ b/src/calibre/ebooks/lit/writer.py @@ -312,7 +312,7 @@ class LitWriter(object): cover = None if oeb.metadata.cover: id = str(oeb.metadata.cover[0]) - cover = oeb.manifest[id] + cover = oeb.manifest.ids[id] for type, title in ALL_MS_COVER_TYPES: if type not in oeb.guide: oeb.guide.add(type, title, cover.href) diff --git a/src/calibre/ebooks/mobi/mobiml.py b/src/calibre/ebooks/mobi/mobiml.py index 7ecd127452..534366da7d 100644 --- a/src/calibre/ebooks/mobi/mobiml.py +++ b/src/calibre/ebooks/mobi/mobiml.py @@ -82,7 +82,20 @@ class MobiMLizer(object): def __init__(self, 
ignore_tables=False): self.ignore_tables = ignore_tables - def transform(self, oeb, context): + @classmethod + def config(cls, cfg): + group = cfg.add_group('mobiml', _('Mobipocket markup options.')) + group('ignore_tables', ['--ignore-tables'], default=False, + help=_('Render HTML tables as blocks of text instead of actual ' + 'tables. This is neccessary if the HTML contains very ' + 'large or complex tables.')) + return cfg + + @classmethod + def generate(cls, opts): + return cls(ignore_tables=opts.ignore_tables) + + def __call__(self, oeb, context): oeb.logger.info('Converting XHTML to Mobipocket markup...') self.oeb = oeb self.profile = profile = context.dest diff --git a/src/calibre/ebooks/mobi/writer.py b/src/calibre/ebooks/mobi/writer.py index d67bc099ef..fdabfaa618 100644 --- a/src/calibre/ebooks/mobi/writer.py +++ b/src/calibre/ebooks/mobi/writer.py @@ -292,9 +292,29 @@ class Serializer(object): buffer.seek(hoff) buffer.write('%010d' % ioff) - + +class MobiFlattener(object): + def config(self, cfg): + return cfg + + def generate(self, opts): + return self + + def __call__(self, oeb, context): + fbase = context.dest.fbase + fkey = context.dest.fnums.values() + flattener = CSSFlattener( + fbase=fbase, fkey=fkey, unfloat=True, untable=True) + return flattener(oeb, context) + + class MobiWriter(object): COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+') + + DEFAULT_PROFILE = 'CybookG3' + + TRANSFORMS = [HTMLTOCAdder, CaseMangler, MobiFlattener(), SVGRasterizer, + ManifestTrimmer, MobiMLizer] def __init__(self, compression=None, imagemax=None, prefer_author_sort=False): @@ -302,7 +322,32 @@ class MobiWriter(object): self._imagemax = imagemax or OTHER_MAX_IMAGE_SIZE self._prefer_author_sort = prefer_author_sort - def dump(self, oeb, path): + @classmethod + def config(cls, cfg): + """Add any book-writing options to the :class:`Config` object + :param:`cfg`. 
+ """ + mobi = cfg.add_group('mobipocket', _('Mobipocket-specific options.')) + mobi('compress', ['--compress'], default=False, + help=_('Compress file text using PalmDOC compression. ' + 'Results in smaller files, but takes a long time to run.')) + mobi('rescale_images', ['--rescale-images'], default=False, + help=_('Modify images to meet Palm device size limitations.')) + mobi('prefer_author_sort', ['--prefer-author-sort'], default=False, + help=_('When present, use the author sorting information for ' + 'generating the Mobipocket author metadata.')) + return cfg + + @classmethod + def generate(cls, opts): + """Generate a Writer instance from command-line options.""" + compression = PALMDOC if opts.compress else UNCOMPRESSED + imagemax = PALM_MAX_IMAGE_SIZE if opts.rescale_images else None + prefer_author_sort = opts.prefer_author_sort + return cls(compression=compression, imagemax=imagemax, + prefer_author_sort=prefer_author_sort) + + def __call__(self, oeb, path): if hasattr(path, 'write'): return self._dump_stream(oeb, path) with open(path, 'w+b') as stream: @@ -542,21 +587,6 @@ def config(defaults=None): else: c = StringConfig(defaults, desc) - mobi = c.add_group('mobipocket', _('Mobipocket-specific options.')) - mobi('compress', ['--compress'], default=False, - help=_('Compress file text using PalmDOC compression. ' - 'Results in smaller files, but takes a long time to run.')) - mobi('rescale_images', ['--rescale-images'], default=False, - help=_('Modify images to meet Palm device size limitations.')) - mobi('toc_title', ['--toc-title'], default=None, - help=_('Title for any generated in-line table of contents.')) - mobi('ignore_tables', ['--ignore-tables'], default=False, - help=_('Render HTML tables as blocks of text instead of actual ' - 'tables. 
This is neccessary if the HTML contains very large ' - 'or complex tables.')) - mobi('prefer_author_sort', ['--prefer-author-sort'], default=False, - help=_('When present, use the author sorting information for ' - 'generating the Mobipocket author metadata.')) profiles = c.add_group('profiles', _('Device renderer profiles. ' 'Affects conversion of font sizes, image rescaling and rasterization ' 'of tables. Valid profiles are: %s.') % ', '.join(_profiles)) diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index fee96585db..2e160d1571 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -5,23 +5,20 @@ from __future__ import with_statement __license__ = 'GPL v3' __copyright__ = '2008, Marshall T. Vandegrift ' +__docformat__ = 'restructuredtext en' -import os, sys, re, uuid, copy -from mimetypes import types_map, guess_type +import os, sys, re, uuid +from mimetypes import types_map from collections import defaultdict -from types import StringTypes -from itertools import izip, count, chain +from itertools import count from urlparse import urldefrag, urlparse, urlunparse from urllib import unquote as urlunquote from lxml import etree, html import calibre from calibre import LoggingInterface from calibre.translations.dynamic import translate -from calibre.startup import get_lang from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.oeb.entitydefs import ENTITYDEFS -from calibre.ebooks.metadata.epub import CoverRenderer -from calibre.ptempfile import TemporaryDirectory XML_NS = 'http://www.w3.org/XML/1998/namespace' XHTML_NS = 'http://www.w3.org/1999/xhtml' @@ -39,14 +36,13 @@ NCX_NS = 'http://www.daisy.org/z3986/2005/ncx/' SVG_NS = 'http://www.w3.org/2000/svg' XLINK_NS = 'http://www.w3.org/1999/xlink' CALIBRE_NS = 'http://calibre.kovidgoyal.net/2009/metadata' -XPNSMAP = { - 'h' : XHTML_NS, 'o1' : OPF1_NS, 'o2' : OPF2_NS, - 'd09': DC09_NS, 'd10': DC10_NS, 'd11': DC11_NS, - 'xsi': XSI_NS, 'dt' : 
DCTERMS_NS, 'ncx': NCX_NS, - 'svg': SVG_NS, 'xl' : XLINK_NS - } -DC_PREFIXES = ('d11', 'd10', 'd09') - +XPNSMAP = {'h' : XHTML_NS, 'o1' : OPF1_NS, 'o2' : OPF2_NS, + 'd09': DC09_NS, 'd10': DC10_NS, 'd11': DC11_NS, + 'xsi': XSI_NS, 'dt' : DCTERMS_NS, 'ncx': NCX_NS, + 'svg': SVG_NS, 'xl' : XLINK_NS} +OPF1_NSMAP = {'dc': DC11_NS, 'oebpackage': OPF1_NS} +OPF2_NSMAP = {'opf': OPF2_NS, 'dc': DC11_NS, 'dcterms': DCTERMS_NS, + 'xsi': XSI_NS, 'calibre': CALIBRE_NS} def XML(name): return '{%s}%s' % (XML_NS, name) @@ -105,7 +101,8 @@ SVG_MIME = types_map['.svg'] BINARY_MIME = 'application/octet-stream' OEB_STYLES = set([CSS_MIME, OEB_CSS_MIME, 'text/x-oeb-css']) -OEB_DOCS = set([XHTML_MIME, 'text/html', OEB_DOC_MIME, 'text/x-oeb-document']) +OEB_DOCS = set([XHTML_MIME, 'text/html', OEB_DOC_MIME, + 'text/x-oeb-document']) OEB_RASTER_IMAGES = set([GIF_MIME, JPEG_MIME, PNG_MIME]) OEB_IMAGES = set([GIF_MIME, JPEG_MIME, PNG_MIME, SVG_MIME]) @@ -167,8 +164,9 @@ def XPath(expr): def xpath(elem, expr): return elem.xpath(expr, namespaces=XPNSMAP) -def xml2str(root): - return etree.tostring(root, encoding='utf-8', xml_declaration=True) +def xml2str(root, pretty_print=False): + return etree.tostring(root, encoding='utf-8', xml_declaration=True, + pretty_print=pretty_print) ASCII_CHARS = set(chr(x) for x in xrange(128)) UNIBYTE_CHARS = set(chr(x) for x in xrange(256)) @@ -178,6 +176,7 @@ URL_SAFE = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ' URL_UNSAFE = [ASCII_CHARS - URL_SAFE, UNIBYTE_CHARS - URL_SAFE] def urlquote(href): + """Quote URL-unsafe characters, allowing IRI-safe characters.""" result = [] unsafe = 0 if isinstance(href, unicode) else 1 unsafe = URL_UNSAFE[unsafe] @@ -188,6 +187,9 @@ def urlquote(href): return ''.join(result) def urlnormalize(href): + """Convert a URL into normalized form, with all and only URL-unsafe + characters URL quoted. 
+ """ parts = urlparse(href) if not parts.scheme: path, frag = urldefrag(href) @@ -199,30 +201,63 @@ def urlnormalize(href): class OEBError(Exception): + """Generic OEB-processing error.""" pass class FauxLogger(object): + """Fake logging interface.""" def __getattr__(self, name): return self def __call__(self, message): print message class Logger(LoggingInterface, object): + """A logging object which provides both the standard `logging.Logger` and + calibre-specific interfaces. + """ def __getattr__(self, name): return object.__getattribute__(self, 'log_' + name) -class AbstractContainer(object): - def read_xml(self, path): - return etree.fromstring( - self.read(path), base_url=os.path.dirname(path)) +class NullContainer(object): + """An empty container. -class DirContainer(AbstractContainer): - def __init__(self, rootdir): - self.rootdir = unicode(rootdir) + For use with book formats which do not support container-like access. + """ + def read(self, path): + raise OEBError('Attempt to read from NullContainer') + + def write(self, path): + raise OEBError('Attempt to write to NullContainer') + + def exists(self, path): + return False + + def namelist(self): + return [] + +class DirContainer(object): + """Filesystem directory container.""" + + def __init__(self, path): + path = unicode(path) + ext = os.path.splitext(path)[1].lower() + if ext == '.opf': + self.opfname = os.path.basename(path) + self.rootdir = os.path.dirname(path) + return + self.rootdir = path + for path in self.namelist(): + ext = os.path.splitext(path)[1].lower() + if ext == '.opf': + self.opfname = fname + return + self.opfname = None def read(self, path): + if path is None: + path = self.opfname path = os.path.join(self.rootdir, path) with open(urlunquote(path), 'rb') as f: return f.read() @@ -239,53 +274,49 @@ class DirContainer(AbstractContainer): path = os.path.join(self.rootdir, path) return os.path.isfile(urlunquote(path)) -class DirWriter(object): - def __init__(self, version='2.0', 
page_map=False): - self.version = version - self.page_map = page_map - - def dump(self, oeb, path): - version = int(self.version[0]) - opfname = None - if os.path.splitext(path)[1].lower() == '.opf': - opfname = os.path.basename(path) - path = os.path.dirname(path) - if not os.path.isdir(path): - os.mkdir(path) - output = DirContainer(path) - for item in oeb.manifest.values(): - output.write(item.href, str(item)) - if version == 1: - metadata = oeb.to_opf1() - elif version == 2: - metadata = oeb.to_opf2(page_map=self.page_map) - else: - raise OEBError("Unrecognized OPF version %r" % self.version) - for mime, (href, data) in metadata.items(): - if opfname and mime == OPF_MIME: - href = opfname - output.write(href, xml2str(data)) - return + def namelist(self): + names = [] + for root, dirs, files in os.walk(self.rootdir): + for fname in files: + fname = os.path.join(root, fname) + fname = fname.replace('\\', '/') + names.append(fname) + return names class Metadata(object): - DC_TERMS = set([ - 'contributor', 'coverage', 'creator', 'date', - 'description', 'format', 'identifier', 'language', - 'publisher', 'relation', 'rights', 'source', 'subject', - 'title', 'type' - ]) + """A collection of OEB data model metadata. + + Provides access to the list of items associated with a particular metadata + term via the term's local name using either Python container or attribute + syntax. Return an empty list for any terms with no currently associated + metadata items. 
+ """ + + DC_TERMS = set(['contributor', 'coverage', 'creator', 'date', + 'description', 'format', 'identifier', 'language', + 'publisher', 'relation', 'rights', 'source', + 'subject', 'title', 'type']) CALIBRE_TERMS = set(['series', 'series_index', 'rating']) OPF_ATTRS = {'role': OPF('role'), 'file-as': OPF('file-as'), 'scheme': OPF('scheme'), 'event': OPF('event'), 'type': XSI('type'), 'lang': XML('lang'), 'id': 'id'} - OPF1_NSMAP = {'dc': DC11_NS, 'oebpackage': OPF1_NS} - OPF2_NSMAP = {'opf': OPF2_NS, 'dc': DC11_NS, 'dcterms': DCTERMS_NS, - 'xsi': XSI_NS, 'calibre': CALIBRE_NS} class Item(object): - + """An item of OEB data model metadata. + + The metadata term or name may be accessed via the :attr:`term` or + :attr:`name` attributes. The metadata value or content may be accessed + via the :attr:`value` or :attr:`content` attributes, or via Unicode or + string representations of the object. + + OEB data model metadata attributes may be accessed either via their + fully-qualified names using the Python container access syntax, or via + their local names using Python attribute syntax. Only attributes + allowed by the OPF 2.0 specification are supported. 
+ """ class Attribute(object): + """Smart accessor for allowed OEB metadata item attributes.""" def __init__(self, attr, allowed=None): if not callable(attr): @@ -336,19 +367,35 @@ class Metadata(object): nsattr = 'scheme' if attr != nsattr: attrib[nsattr] = attrib.pop(attr) - - scheme = Attribute(lambda term : 'scheme' if term == OPF('meta') else OPF('scheme'), - [DC('identifier'), OPF('meta')]) + + @dynamic_property + def name(self): + def fget(self): + return self.term + return property(fget=fget) + + @dynamic_property + def content(self): + def fget(self): + return self.value + def fset(self, value): + self.value = value + return property(fget=fget, fset=fset) + + scheme = Attribute(lambda term: 'scheme' if \ + term == OPF('meta') else OPF('scheme'), + [DC('identifier'), OPF('meta')]) file_as = Attribute(OPF('file-as'), [DC('creator'), DC('contributor')]) role = Attribute(OPF('role'), [DC('creator'), DC('contributor')]) event = Attribute(OPF('event'), [DC('date')]) id = Attribute('id') - type = Attribute(XSI('type'), [DC('date'), DC('format'), DC('type')]) + type = Attribute(XSI('type'), [DC('date'), DC('format'), + DC('type')]) lang = Attribute(XML('lang'), [DC('contributor'), DC('coverage'), - DC('creator'), DC('publisher'), - DC('relation'), DC('rights'), - DC('source'), DC('subject'), - OPF('meta')]) + DC('creator'), DC('publisher'), + DC('relation'), DC('rights'), + DC('source'), DC('subject'), + OPF('meta')]) def __getitem__(self, key): return self.attrib[key] @@ -406,6 +453,7 @@ class Metadata(object): self.items = defaultdict(list) def add(self, term, value, attrib={}, nsmap={}, **kwargs): + """Add a new metadata item.""" item = self.Item(term, value, attrib, nsmap, **kwargs) items = self.items[barename(item.term)] items.append(item) @@ -445,21 +493,19 @@ class Metadata(object): return nsmap return property(fget=fget) - @dynamic_property def _opf2_nsmap(self): def fget(self): nsmap = self._nsmap - nsmap.update(self.OPF2_NSMAP) + nsmap.update(OPF2_NSMAP) 
return nsmap return property(fget=fget) - def to_opf1(self, parent=None): nsmap = self._opf1_nsmap nsrmap = dict((value, key) for key, value in nsmap.items()) elem = element(parent, 'metadata', nsmap=nsmap) - dcmeta = element(elem, 'dc-metadata', nsmap=self.OPF1_NSMAP) + dcmeta = element(elem, 'dc-metadata', nsmap=OPF1_NSMAP) xmeta = element(elem, 'x-metadata') for term in self.items: for item in self.items[term]: @@ -480,8 +526,40 @@ class Metadata(object): class Manifest(object): + """Collection of files composing an OEB data model book. + + Provides access to the content of the files composing the book and + attributes associated with those files, including their internal paths, + unique identifiers, and MIME types. + + Itself acts as a :class:`set` of manifest items, and provides the following + instance data member for dictionary-like access: + + :attr:`ids`: A dictionary in which the keys are the unique identifiers of + the manifest items and the values are the items themselves. + :attr:`hrefs`: A dictionary in which the keys are the internal paths of the + manifest items and the values are the items themselves. + """ class Item(object): + """An OEB data model book content file. + + Provides the following data members for accessing the file content and + metadata associated with this particular file. + + :attr:`id`: Unique identifier. + :attr:`href`: Book-internal path. + :attr:`media_type`: MIME type of the file content. + :attr:`fallback`: Unique id of any fallback manifest item associated + with this manifest item. + :attr:`spine_position`: Display/reading order index for book textual + content. `None` for manifest items which are not part of the + book's textual content. + :attr:`linear`: `True` for textual content items which are part of the + primary linear reading order and `False` for textual content items + which are not (such as footnotes). Meaningless for items which + have a :attr:`spine_position` of `None`. 
+ """ NUM_RE = re.compile('^(.*)([0-9][0-9.]*)(?=[.]|$)') META_XP = XPath('/h:html/h:head/h:meta[@http-equiv="Content-Type"]') @@ -587,6 +665,18 @@ class Manifest(object): @dynamic_property def data(self): + doc = """Provides MIME type sensitive access to the manifest + entry's associated content. + + - XHTML, HTML, and variant content is parsed as necessary to + convert and return as an lxml.etree element in the XHTML + namespace. + - XML content is parsed and returned as an lxml.etree element. + - CSS and CSS-variant content is parsed and returned as a cssutils + CSS DOM stylesheet. + - All other content is returned as a :class:`str` object with no + special parsing. + """ def fget(self): if self._data is not None: return self._data @@ -603,12 +693,12 @@ class Manifest(object): self._data = value def fdel(self): self._data = None - return property(fget, fset, fdel) + return property(fget, fset, fdel, doc=doc) def __str__(self): data = self.data if isinstance(data, etree._Element): - return xml2str(data) + return xml2str(data, pretty_print=self.oeb.pretty_print) if isinstance(data, unicode): return data.encode('utf-8') return str(data) @@ -634,6 +724,9 @@ class Manifest(object): return cmp(skey, okey) def relhref(self, href): + """Convert the URL provided in :param:`href` from a book-absolute + reference to a reference relative to this manifest item. + """ if urlparse(href).scheme: return href if '/' not in self.href: @@ -652,6 +745,9 @@ class Manifest(object): return relhref def abshref(self, href): + """Convert the URL provided in :param:`href` from a reference + relative to this manifest item to a book-absolute reference. + """ if urlparse(href).scheme: return href path, frag = urldefrag(href) @@ -666,25 +762,46 @@ class Manifest(object): def __init__(self, oeb): self.oeb = oeb + self.items = set() self.ids = {} self.hrefs = {} def add(self, id, href, media_type, fallback=None, loader=None, data=None): + """Add a new item to the book manifest. 
+ + The item's :param:`id`, :param:`href`, and :param:`media_type` are all + required. A :param:`fallback` item-id is required for any items with a + MIME type which is not one of the OPS core media types. Either the + item's data itself may be provided with :param:`data`, or a loader + function for the data may be provided with :param:`loader`, or the + item's data may later be set manually via the :attr:`data` attribute. + """ item = self.Item( self.oeb, id, href, media_type, fallback, loader, data) + self.items.add(item) self.ids[item.id] = item self.hrefs[item.href] = item return item def remove(self, item): + """Removes :param:`item` from the manifest.""" if item in self.ids: item = self.ids[item] del self.ids[item.id] del self.hrefs[item.href] + self.items.remove(item) if item in self.oeb.spine: self.oeb.spine.remove(item) def generate(self, id=None, href=None): + """Generate a new unique identifier and/or internal path for use in + creating a new manifest item, using the provided :param:`id` and/or + :param:`href` as bases. + + Returns a two-tuple of the new id and path. If either :param:`id` or + :param:`href` are `None` then the corresponding item in the return + tuple will also be `None`. 
+ """ if id is not None: base = id index = 1 @@ -701,26 +818,18 @@ class Manifest(object): return id, href def __iter__(self): - for id in self.ids: - yield id - - def __getitem__(self, id): - return self.ids[id] - - def values(self): - for item in self.ids.values(): + for item in self.items: yield item - def items(self): - for id, item in self.ids.items(): - yield id, item + def values(self): + return list(self.items) - def __contains__(self, key): - return key in self.ids + def __contains__(self, item): + return item in self.items def to_opf1(self, parent=None): elem = element(parent, 'manifest') - for item in self.ids.values(): + for item in self.items: media_type = item.media_type if media_type in OEB_DOCS: media_type = OEB_DOC_MIME @@ -735,7 +844,7 @@ class Manifest(object): def to_opf2(self, parent=None): elem = element(parent, OPF('manifest')) - for item in self.ids.values(): + for item in self.items: media_type = item.media_type if media_type in OEB_DOCS: media_type = XHTML_MIME @@ -750,13 +859,19 @@ class Manifest(object): class Spine(object): - + """Collection of manifest items composing an OEB data model book's main + textual content. + + The spine manages which manifest items compose the book's main textual + content and the sequence in which they appear. Provides Python container + access as a list-like object. 
+ """ def __init__(self, oeb): self.oeb = oeb self.items = [] def _linear(self, linear): - if isinstance(linear, StringTypes): + if isinstance(linear, basestring): linear = linear.lower() if linear is None or linear in ('yes', 'true'): linear = True @@ -765,12 +880,14 @@ class Spine(object): return linear def add(self, item, linear=None): + """Append :param:`item` to the end of the `Spine`.""" item.linear = self._linear(linear) item.spine_position = len(self.items) self.items.append(item) return item def insert(self, index, item, linear): + """Insert :param:`item` at position :param:`index` in the `Spine`.""" item.linear = self._linear(linear) item.spine_position = index self.items.insert(index, item) @@ -779,6 +896,7 @@ class Spine(object): return item def remove(self, item): + """Remove :param:`item` from the `Spine`.""" index = item.spine_position self.items.pop(index) for i in xrange(index, len(self.items)): @@ -816,9 +934,24 @@ class Spine(object): class Guide(object): + """Collection of references to standard frequently-occurring sections + within an OEB data model book. + + Provides dictionary-like access, in which the keys are the OEB reference + type identifiers and the values are `Reference` objects. + """ class Reference(object): - + """Reference to a standard book section. + + Provides the following instance data members: + + :attr:`type`: Reference type identifier, as chosen from the list + allowed in the OPF 2.0 specification. + :attr:`title`: Human-readable section title. + :attr:`href`: Book-internal URL of the referenced section. May include + a fragment identifier. 
+ """ _TYPES_TITLES = [('cover', __('Cover')), ('title-page', __('Title Page')), ('toc', __('Table of Contents')), @@ -838,7 +971,7 @@ class Guide(object): ('text', __('Main Text'))] TYPES = set(t for t, _ in _TYPES_TITLES) TITLES = dict(_TYPES_TITLES) - ORDER = dict((t, i) for (t, _), i in izip(_TYPES_TITLES, count(0))) + ORDER = dict((t, i) for i, (t, _) in enumerate(_TYPES_TITLES)) def __init__(self, oeb, type, title, href): self.oeb = oeb @@ -870,17 +1003,19 @@ class Guide(object): @dynamic_property def item(self): + doc = """The manifest item associated with this reference.""" def fget(self): path = urldefrag(self.href)[0] hrefs = self.oeb.manifest.hrefs return hrefs.get(path, None) - return property(fget=fget) + return property(fget=fget, doc=doc) def __init__(self, oeb): self.oeb = oeb self.refs = {} def add(self, type, title, href): + """Add a new reference to the `Guide`.""" ref = self.Reference(self.oeb, type, title, href) self.refs[type] = ref return ref @@ -928,8 +1063,19 @@ class Guide(object): return elem +# TODO: This needs beefing up to support the interface of toc.TOC class TOC(object): - # This needs beefing up to support the interface of toc.TOC + """Represents a hierarchical table of contents or navigation tree for + accessing arbitrary semantic sections within an OEB data model book. + + Acts as a node within the navigation tree. Provides list-like access to + sub-nodes. Provides the following node instance data attributes: + + :attr:`title`: The title of this navigation node. + :attr:`href`: Book-internal URL referenced by this node. + :attr:`klass`: Optional semantic class referenced by this node. + :attr:`id`: Optional unique identifier for this node. 
+ """ def __init__(self, title=None, href=None, klass=None, id=None): self.title = title self.href = urlnormalize(href) if href else href @@ -938,17 +1084,26 @@ class TOC(object): self.nodes = [] def add(self, title, href, klass=None, id=None): + """Create and return a new sub-node of this node.""" node = TOC(title, href, klass, id) self.nodes.append(node) return node + def iter(self): + """Iterate over this node and all descendants in depth-first order.""" + yield self + for child in self.nodes: + for node in child.iter(): + yield node + def iterdescendants(self): - for node in self.nodes: - yield node - for child in node.iterdescendants(): - yield child + """Iterate over all descendant nodes in depth-first order.""" + for child in self.nodes: + for node in child.iter(): + yield node def __iter__(self): + """Iterate over all immediate child nodes.""" for node in self.nodes: yield node @@ -956,6 +1111,9 @@ class TOC(object): return self.nodes[index] def autolayer(self): + """Make sequences of children pointing to the same content file into + children of the first node referencing that file. 
+ """ prev = None for node in list(self.nodes): if prev and urldefrag(prev.href)[0] == urldefrag(node.href)[0]: @@ -964,10 +1122,12 @@ class TOC(object): else: prev = node - def depth(self, level=0): - if self.nodes: - return self.nodes[0].depth(level+1) - return level + def depth(self): + """The maximum depth of the navigation tree rooted at this node.""" + try: + return max(node.depth() for node in self.nodes) + 1 + except ValueError: + return 1 def to_opf1(self, tour): for node in self.nodes: @@ -976,7 +1136,7 @@ class TOC(object): node.to_opf1(tour) return tour - def to_ncx(self, parent, depth=1): + def to_ncx(self, parent): for node in self.nodes: id = node.id or unicode(uuid.uuid4()) attrib = {'id': id, 'playOrder': '0'} @@ -985,19 +1145,40 @@ class TOC(object): point = element(parent, NCX('navPoint'), attrib=attrib) label = etree.SubElement(point, NCX('navLabel')) element(label, NCX('text')).text = node.title - href = node.href if depth > 1 else urldefrag(node.href)[0] - element(point, NCX('content'), src=href) - node.to_ncx(point, depth+1) + element(point, NCX('content'), src=node.href) + node.to_ncx(point) return parent class PageList(object): + """Collection of named "pages" to mapped positions within an OEB data model + book's textual content. + + Provides list-like access to the pages. + """ class Page(object): + """Represents a mapping between a page name and a position within + the book content. + + Provides the following instance data attributes: + + :attr:`name`: The name of this page. Generally a number. + :attr:`href`: Book-internal URL at which point this page begins. + :attr:`type`: Must be one of 'front' (for prefatory pages, as commonly + labeled in print with small-case Roman numerals), 'normal' (for + standard pages, as commonly labeled in print with Arabic numerals), + or 'special' (for other pages, as commonly not labeled in any + fashion in print, such as the cover and title pages). + :attr:`klass`: Optional semantic class of this page. 
+ :attr:`id`: Optional unique identifier for this page. + """ + TYPES = set(['front', 'normal', 'special']) + def __init__(self, name, href, type='normal', klass=None, id=None): - self.name = name + self.name = unicode(name) self.href = urlnormalize(href) - self.type = type + self.type = type if type in self.TYPES else 'normal' self.id = id self.klass = klass @@ -1005,6 +1186,7 @@ class PageList(object): self.pages = [] def add(self, name, href, type='normal', klass=None, id=None): + """Create a new page and add it to the `PageList`.""" page = self.Page(name, href, type, klass, id) self.pages.append(page) return page @@ -1018,6 +1200,12 @@ class PageList(object): def __getitem__(self, index): return self.pages[index] + + def pop(self, index=-1): + return self.pages.pop(index) + + def remove(self, page): + return self.pages.remove(page) def to_ncx(self, parent=None): plist = element(parent, NCX('pageList'), id=str(uuid.uuid4())) @@ -1043,501 +1231,61 @@ class PageList(object): class OEBBook(object): + """Representation of a book in the IDPF OEB data model.""" - COVER_SVG_XP = XPath('h:body//svg:svg[position() = 1]') - COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]') - - def __init__(self, opfpath=None, container=None, encoding=None, - logger=FauxLogger()): - if opfpath and not container: - container = DirContainer(os.path.dirname(opfpath)) - opfpath = os.path.basename(opfpath) - self.container = container + def __init__(self, encoding=None, pretty_print=False, logger=FauxLogger()): + """Create empty book. Optional arguments: + + :param:`encoding`: Default encoding for textual content read + from an external container. + :param:`pretty_print`: Whether or not the canonical string form + of XML markup is pretty-printed. + :param:`logger`: A Logger object to use for logging all messages + related to the processing of this book. It is accessible + via the instance data member :attr:`logger`. 
+ + It provides the following public instance data members for + accessing various parts of the OEB data model: + + :attr:`metadata`: Metadata such as title, author name(s), etc. + :attr:`manifest`: Manifest of all files included in the book, + including MIME types and fallback information. + :attr:`spine`: In-order list of manifest items which compose + the textual content of the book. + :attr:`guide`: Collection of references to standard positions + within the text, such as the cover, preface, etc. + :attr:`toc`: Hierarchical table of contents. + :attr:`pages`: List of "pages," such as indexed to a print edition of + the same text. + """ self.encoding = encoding + self.pretty_print = pretty_print self.logger = logger - if opfpath or container: - opf = self._read_opf(opfpath) - self._all_from_opf(opf) - - def _clean_opf(self, opf): - nsmap = {} - for elem in opf.iter(tag=etree.Element): - nsmap.update(elem.nsmap) - for elem in opf.iter(tag=etree.Element): - if namespace(elem.tag) in ('', OPF1_NS): - elem.tag = OPF(barename(elem.tag)) - nsmap.update(Metadata.OPF2_NSMAP) - attrib = dict(opf.attrib) - nroot = etree.Element(OPF('package'), - nsmap={None: OPF2_NS}, attrib=attrib) - metadata = etree.SubElement(nroot, OPF('metadata'), nsmap=nsmap) - ignored = (OPF('dc-metadata'), OPF('x-metadata')) - for elem in xpath(opf, 'o2:metadata//*'): - if elem.tag in ignored: - continue - if namespace(elem.tag) in DC_NSES: - tag = barename(elem.tag).lower() - elem.tag = '{%s}%s' % (DC11_NS, tag) - metadata.append(elem) - for element in xpath(opf, 'o2:metadata//o2:meta'): - metadata.append(element) - for tag in ('o2:manifest', 'o2:spine', 'o2:tours', 'o2:guide'): - for element in xpath(opf, tag): - nroot.append(element) - return nroot - - def _read_opf(self, opfpath): - data = self.container.read(opfpath) - data = self.decode(data) - data = XMLDECL_RE.sub('', data) - try: - opf = etree.fromstring(data) - except etree.XMLSyntaxError: - repl = lambda m: ENTITYDEFS.get(m.group(1), 
m.group(0)) - data = ENTITY_RE.sub(repl, data) - opf = etree.fromstring(data) - self.logger.warn('OPF contains invalid HTML named entities') - ns = namespace(opf.tag) - if ns not in ('', OPF1_NS, OPF2_NS): - raise OEBError('Invalid namespace %r for OPF document' % ns) - opf = self._clean_opf(opf) - return opf - - def _metadata_from_opf(self, opf): - uid = opf.get('unique-identifier', None) + self.version = '2.0' + self.container = NullContainer() + self.metadata = Metadata(self) self.uid = None - self.metadata = metadata = Metadata(self) - for elem in xpath(opf, '/o2:package/o2:metadata//*'): - term = elem.tag - value = elem.text - attrib = dict(elem.attrib) - nsmap = elem.nsmap - if term == OPF('meta'): - term = qname(attrib.pop('name', None), nsmap) - value = attrib.pop('content', None) - if value: - value = COLLAPSE_RE.sub(' ', value.strip()) - if term and (value or attrib): - metadata.add(term, value, attrib, nsmap=nsmap) - haveuuid = haveid = False - for ident in metadata.identifier: - if unicode(ident).startswith('urn:uuid:'): - haveuuid = True - if 'id' in ident.attrib: - haveid = True - if not (haveuuid and haveid): - bookid = "urn:uuid:%s" % str(uuid.uuid4()) - metadata.add('identifier', bookid, id='calibre-uuid') - if uid is None: - self.logger.warn(u'Unique-identifier not specified') - for item in metadata.identifier: - if not item.id: - continue - if uid is None or item.id == uid: - self.uid = item - break - else: - self.logger.warn(u'Unique-identifier %r not found' % uid) - for ident in metadata.identifier: - if 'id' in ident.attrib: - self.uid = metadata.identifier[0] - break - if not metadata.language: - self.logger.warn(u'Language not specified') - metadata.add('language', get_lang()) - if not metadata.creator: - self.logger.warn('Creator not specified') - metadata.add('creator', self.translate(__('Unknown'))) - if not metadata.title: - self.logger.warn('Title not specified') - metadata.add('title', self.translate(__('Unknown'))) - - def 
_manifest_add_missing(self): - manifest = self.manifest - known = set(manifest.hrefs) - unchecked = set(manifest.values()) - while unchecked: - new = set() - for item in unchecked: - if (item.media_type in OEB_DOCS or - item.media_type[-4:] in ('/xml', '+xml')) and \ - item.data is not None: - hrefs = [sel(item.data) for sel in LINK_SELECTORS] - for href in chain(*hrefs): - href, _ = urldefrag(href) - if not href: - continue - href = item.abshref(urlnormalize(href)) - scheme = urlparse(href).scheme - if not scheme and href not in known: - new.add(href) - elif item.media_type in OEB_STYLES: - for match in CSSURL_RE.finditer(item.data): - href, _ = urldefrag(match.group('url')) - href = item.abshref(urlnormalize(href)) - scheme = urlparse(href).scheme - if not scheme and href not in known: - new.add(href) - unchecked.clear() - for href in new: - known.add(href) - if not self.container.exists(href): - self.logger.warn('Referenced file %r not found' % href) - continue - self.logger.warn('Referenced file %r not in manifest' % href) - id, _ = manifest.generate(id='added') - guessed = guess_type(href)[0] - media_type = guessed or BINARY_MIME - added = manifest.add(id, href, media_type) - unchecked.add(added) - - def _manifest_from_opf(self, opf): - self.manifest = manifest = Manifest(self) - for elem in xpath(opf, '/o2:package/o2:manifest/o2:item'): - id = elem.get('id') - href = elem.get('href') - media_type = elem.get('media-type', None) - if media_type is None: - media_type = elem.get('mediatype', None) - if media_type is None or media_type == 'text/xml': - guessed = guess_type(href)[0] - media_type = guessed or media_type or BINARY_MIME - fallback = elem.get('fallback') - if href in manifest.hrefs: - self.logger.warn(u'Duplicate manifest entry for %r' % href) - continue - if not self.container.exists(href): - self.logger.warn(u'Manifest item %r not found' % href) - continue - if id in manifest.ids: - self.logger.warn(u'Duplicate manifest id %r' % id) - id, href = 
manifest.generate(id, href) - manifest.add(id, href, media_type, fallback) - self._manifest_add_missing() - - def _spine_add_extra(self): - manifest = self.manifest - spine = self.spine - unchecked = set(spine) - selector = XPath('h:body//h:a/@href') - extras = set() - while unchecked: - new = set() - for item in unchecked: - if item.media_type not in OEB_DOCS: - # TODO: handle fallback chains - continue - for href in selector(item.data): - href, _ = urldefrag(href) - if not href: - continue - href = item.abshref(urlnormalize(href)) - if href not in manifest.hrefs: - continue - found = manifest.hrefs[href] - if found.media_type not in OEB_DOCS or \ - found in spine or found in extras: - continue - new.add(found) - extras.update(new) - unchecked = new - version = int(self.version[0]) - for item in sorted(extras): - if version >= 2: - self.logger.warn( - 'Spine-referenced file %r not in spine' % item.href) - spine.add(item, linear=False) - - def _spine_from_opf(self, opf): - self.spine = spine = Spine(self) - for elem in xpath(opf, '/o2:package/o2:spine/o2:itemref'): - idref = elem.get('idref') - if idref not in self.manifest: - self.logger.warn(u'Spine item %r not found' % idref) - continue - item = self.manifest[idref] - spine.add(item, elem.get('linear')) - if len(spine) == 0: - raise OEBError("Spine is empty") - self._spine_add_extra() - - def _guide_from_opf(self, opf): - self.guide = guide = Guide(self) - for elem in xpath(opf, '/o2:package/o2:guide/o2:reference'): - href = elem.get('href') - path = urldefrag(href)[0] - if path not in self.manifest.hrefs: - self.logger.warn(u'Guide reference %r not found' % href) - continue - guide.add(elem.get('type'), elem.get('title'), href) - - def _find_ncx(self, opf): - result = xpath(opf, '/o2:package/o2:spine/@toc') - if result: - id = result[0] - if id not in self.manifest.ids: - return None - item = self.manifest.ids[id] - self.manifest.remove(item) - return item - for item in self.manifest.values(): - if 
item.media_type == NCX_MIME: - self.manifest.remove(item) - return item - return None - - def _toc_from_navpoint(self, item, toc, navpoint): - children = xpath(navpoint, 'ncx:navPoint') - for child in children: - title = ''.join(xpath(child, 'ncx:navLabel/ncx:text/text()')) - title = COLLAPSE_RE.sub(' ', title.strip()) - href = xpath(child, 'ncx:content/@src') - if not title or not href: - continue - href = item.abshref(urlnormalize(href[0])) - path, _ = urldefrag(href) - if path not in self.manifest.hrefs: - self.logger.warn('TOC reference %r not found' % href) - continue - id = child.get('id') - klass = child.get('class') - node = toc.add(title, href, id=id, klass=klass) - self._toc_from_navpoint(item, node, child) - - def _toc_from_ncx(self, item): - if item is None: - return False - ncx = item.data - title = ''.join(xpath(ncx, 'ncx:docTitle/ncx:text/text()')) - title = COLLAPSE_RE.sub(' ', title.strip()) - title = title or unicode(self.metadata.title[0]) - self.toc = toc = TOC(title) - navmaps = xpath(ncx, 'ncx:navMap') - for navmap in navmaps: - self._toc_from_navpoint(item, toc, navmap) - return True - - def _toc_from_tour(self, opf): - result = xpath(opf, 'o2:tours/o2:tour') - if not result: - return False - tour = result[0] - self.toc = toc = TOC(tour.get('title')) - sites = xpath(tour, 'o2:site') - for site in sites: - title = site.get('title') - href = site.get('href') - if not title or not href: - continue - path, _ = urldefrag(urlnormalize(href)) - if path not in self.manifest.hrefs: - self.logger.warn('TOC reference %r not found' % href) - continue - id = site.get('id') - toc.add(title, href, id=id) - return True - - def _toc_from_html(self, opf): - if 'toc' not in self.guide: - return False - self.toc = toc = TOC() - itempath, frag = urldefrag(self.guide['toc'].href) - item = self.manifest.hrefs[itempath] - html = item.data - if frag: - elems = xpath(html, './/*[@id="%s"]' % frag) - if not elems: - elems = xpath(html, './/*[@name="%s"]' % frag) - elem 
= elems[0] if elems else html - while elem != html and not xpath(elem, './/h:a[@href]'): - elem = elem.getparent() - html = elem - titles = defaultdict(list) - order = [] - for anchor in xpath(html, './/h:a[@href]'): - href = anchor.attrib['href'] - href = item.abshref(urlnormalize(href)) - path, frag = urldefrag(href) - if path not in self.manifest.hrefs: - continue - title = ' '.join(xpath(anchor, './/text()')) - title = COLLAPSE_RE.sub(' ', title.strip()) - if href not in titles: - order.append(href) - titles[href].append(title) - for href in order: - toc.add(' '.join(titles[href]), href) - return True - - def _toc_from_spine(self, opf): - self.toc = toc = TOC() - titles = [] - headers = [] - for item in self.spine: - if not item.linear: continue - html = item.data - title = ''.join(xpath(html, '/h:html/h:head/h:title/text()')) - title = COLLAPSE_RE.sub(' ', title.strip()) - if title: - titles.append(title) - headers.append('(unlabled)') - for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'): - expr = '/h:html/h:body//h:%s[position()=1]/text()' - header = ''.join(xpath(html, expr % tag)) - header = COLLAPSE_RE.sub(' ', header.strip()) - if header: - headers[-1] = header - break - use = titles - if len(titles) > len(set(titles)): - use = headers - for title, item in izip(use, self.spine): - if not item.linear: continue - toc.add(title, item.href) - return True - - def _toc_from_opf(self, opf, item): - if self._toc_from_ncx(item): return - if self._toc_from_tour(opf): return - self.logger.warn('No metadata table of contents found') - if self._toc_from_html(opf): return - self._toc_from_spine(opf) - - def _pages_from_ncx(self, opf, item): - if item is None: - return False - ncx = item.data - ptargets = xpath(ncx, 'ncx:pageList/ncx:pageTarget') - if not ptargets: - return False - pages = self.pages = PageList() - for ptarget in ptargets: - name = ''.join(xpath(ptarget, 'ncx:navLabel/ncx:text/text()')) - name = COLLAPSE_RE.sub(' ', name.strip()) - href = 
xpath(ptarget, 'ncx:content/@src') - if not href: - continue - href = item.abshref(urlnormalize(href[0])) - id = ptarget.get('id') - type = ptarget.get('type', 'normal') - klass = ptarget.get('class') - pages.add(name, href, type=type, id=id, klass=klass) - return True - - def _find_page_map(self, opf): - result = xpath(opf, '/o2:package/o2:spine/@page-map') - if result: - id = result[0] - if id not in self.manifest.ids: - return None - item = self.manifest.ids[id] - self.manifest.remove(item) - return item - for item in self.manifest.values(): - if item.media_type == PAGE_MAP_MIME: - self.manifest.remove(item) - return item - return None - - def _pages_from_page_map(self, opf): - item = self._find_page_map(opf) - if item is None: - return False - pmap = item.data - pages = self.pages = PageList() - for page in xpath(pmap, 'o2:page'): - name = page.get('name', '') - href = page.get('href') - if not href: - continue - name = COLLAPSE_RE.sub(' ', name.strip()) - href = item.abshref(urlnormalize(href)) - type = 'normal' - if not name: - type = 'special' - elif name.lower().strip('ivxlcdm') == '': - type = 'front' - pages.add(name, href, type=type) - return True - - def _pages_from_opf(self, opf, item): - if self._pages_from_ncx(opf, item): return - if self._pages_from_page_map(opf): return + self.manifest = Manifest(self) + self.spine = Spine(self) + self.guide = Guide(self) + self.toc = TOC() self.pages = PageList() - return - - def _cover_from_html(self, hcover): - with TemporaryDirectory('_html_cover') as tdir: - writer = DirWriter() - writer.dump(self, tdir) - path = os.path.join(tdir, urlunquote(hcover.href)) - renderer = CoverRenderer(path) - data = renderer.image_data - id, href = self.manifest.generate('cover', 'cover.jpeg') - item = self.manifest.add(id, href, JPEG_MIME, data=data) - return item - - def _locate_cover_image(self): - if self.metadata.cover: - id = str(self.metadata.cover[0]) - item = self.manifest.ids.get(id, None) - if item is not None and 
item.media_type in OEB_IMAGES: - return item - else: - self.logger.warn('Invalid cover image @id %r' % id) - hcover = self.spine[0] - if 'cover' in self.guide: - href = self.guide['cover'].href - item = self.manifest.hrefs[href] - media_type = item.media_type - if media_type in OEB_IMAGES: - return item - elif media_type in OEB_DOCS: - hcover = item - html = hcover.data - if MS_COVER_TYPE in self.guide: - href = self.guide[MS_COVER_TYPE].href - item = self.manifest.hrefs.get(href, None) - if item is not None and item.media_type in OEB_IMAGES: - return item - if self.COVER_SVG_XP(html): - svg = copy.deepcopy(self.COVER_SVG_XP(html)[0]) - href = os.path.splitext(hcover.href)[0] + '.svg' - id, href = self.manifest.generate(hcover.id, href) - item = self.manifest.add(id, href, SVG_MIME, data=svg) - return item - if self.COVER_OBJECT_XP(html): - object = self.COVER_OBJECT_XP(html)[0] - href = hcover.abshref(object.get('data')) - item = self.manifest.hrefs.get(href, None) - if item is not None and item.media_type in OEB_IMAGES: - return item - return self._cover_from_html(hcover) - - def _ensure_cover_image(self): - cover = self._locate_cover_image() - if self.metadata.cover: - self.metadata.cover[0].value = cover.id - return - self.metadata.add('cover', cover.id) - - def _all_from_opf(self, opf): - self.version = opf.get('version', '1.2') - self._metadata_from_opf(opf) - self._manifest_from_opf(opf) - self._spine_from_opf(opf) - self._guide_from_opf(opf) - item = self._find_ncx(opf) - self._toc_from_opf(opf, item) - self._pages_from_opf(opf, item) - self._ensure_cover_image() + + @classmethod + def generate(cls, opts): + """Generate an OEBBook instance from command-line options.""" + encoding = opts.encoding + pretty_print = opts.pretty_print + return cls(encoding=encoding, pretty_print=pretty_print) def translate(self, text): + """Translate :param:`text` into the book's primary language.""" lang = str(self.metadata.language[0]) lang = lang.split('-', 1)[0].lower() 
return translate(lang, text) def decode(self, data): + """Automatically decode :param:`data` into a `unicode` object.""" if isinstance(data, unicode): return data if data[:2] in ('\xff\xfe', '\xfe\xff'): @@ -1560,6 +1308,11 @@ class OEBBook(object): return data def to_opf1(self): + """Produce OPF 1.2 representing the book's metadata and structure. + + Returns a dictionary in which the keys are MIME types and the values + are tuples of (default) filenames and lxml.etree element structures. + """ package = etree.Element('package', attrib={'unique-identifier': self.uid.id}) self.metadata.to_opf1(package) @@ -1631,6 +1384,11 @@ class OEBBook(object): return ncx def to_opf2(self, page_map=False): + """Produce OPF 2.0 representing the book's metadata and structure. + + Returns a dictionary in which the keys are MIME types and the values + are tuples of (default) filenames and lxml.etree element structures. + """ results = {} package = etree.Element(OPF('package'), attrib={'version': '2.0', 'unique-identifier': self.uid.id}, @@ -1652,16 +1410,3 @@ class OEBBook(object): spine.attrib['page-map'] = id results[PAGE_MAP_MIME] = (href, self.pages.to_page_map()) return results - - -def main(argv=sys.argv): - for arg in argv[1:]: - oeb = OEBBook(arg) - for name, doc in oeb.to_opf1().values(): - print etree.tostring(doc, pretty_print=True) - for name, doc in oeb.to_opf2(page_map=True).values(): - print etree.tostring(doc, pretty_print=True) - return 0 - -if __name__ == '__main__': - sys.exit(main()) \ No newline at end of file diff --git a/src/calibre/ebooks/oeb/factory.py b/src/calibre/ebooks/oeb/factory.py new file mode 100644 index 0000000000..684451044b --- /dev/null +++ b/src/calibre/ebooks/oeb/factory.py @@ -0,0 +1,98 @@ +''' +Registry associating file extensions with Reader classes. +''' +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2008, Marshall T. 
Vandegrift ' + +import sys, os, logging +from itertools import chain +from calibre.ebooks.oeb.base import OEBError +from calibre.ebooks.oeb.reader import OEBReader +from calibre.ebooks.oeb.writer import OEBWriter +from calibre.ebooks.lit.reader import LitReader +from calibre.ebooks.lit.writer import LitWriter +from calibre.ebooks.mobi.reader import MobiReader +from calibre.ebooks.mobi.writer import MobiWriter +from calibre.ebooks.oeb.base import Logger, OEBBook +from calibre.ebooks.oeb.profile import Context +from calibre.utils.config import Config + +__all__ = ['ReaderFactory', 'WriterFactory'] + +REGISTRY = { + '.opf': (OEBReader, None), + '.lit': (LitReader, LitWriter), + '.mobi': (MobiReader, MobiWriter), + } + +def ReaderFactory(path): + if os.path.isdir(path): + return OEBReader + ext = os.path.splitext(path)[1].lower() + Reader = REGISTRY.get(ext, (None, None))[0] + if Reader is None: + raise OEBError('Unknown e-book file extension %r' % ext) + return Reader + +def WriterFactory(path): + if os.path.isdir(path): + return OEBWriter + ext = os.path.splitext(path)[1].lower() + if not os.path.exists(path) and not ext: + return OEBWriter + Writer = REGISTRY.get(ext, (None, None))[1] + if Writer is None: + raise OEBError('Unknown e-book file extension %r' % ext) + return Writer + + +def option_parser(Reader, Writer): + cfg = Config('ebook-convert', _('Options to control e-book conversion.')) + Reader.config(cfg) + for Transform in chain(Reader.TRANSFORMS, Writer.TRANSFORMS): + Transform.config(cfg) + Writer.config(cfg) + parser = cfg.option_parser() + parser.add_option('--encoding', default=None, + help=_('Character encoding for input. Default is to auto detect.')) + parser.add_option('-o', '--output', default=None, + help=_('Output file.
Default is derived from input filename.')) + parser.add_option('-p', '--pretty-print', action='store_true', + default=False, help=_('Produce more human-readable XML output.')) + parser.add_option('-v', '--verbose', default=0, action='count', + help=_('Useful for debugging.')) + return parser + +def main(argv=sys.argv): + if len(argv) < 3: + print _("Usage: ebook-convert INFILE OUTFILE [OPTIONS..]") + return 1 + inpath, outpath = argv[1], argv[2] + Reader = ReaderFactory(inpath) + Writer = WriterFactory(outpath) + parser = option_parser(Reader, Writer) + opts, args = parser.parse_args(argv[3:]) + if len(args) != 0: + parser.print_help() + return 1 + logger = Logger(logging.getLogger('ebook-convert')) + logger.setup_cli_handler(opts.verbose) + encoding = opts.encoding + pretty_print = opts.pretty_print + oeb = OEBBook(encoding=encoding, pretty_print=pretty_print, logger=logger) + context = Context(Reader.DEFAULT_PROFILE, Writer.DEFAULT_PROFILE) + reader = Reader.generate(opts) + writer = Writer.generate(opts) + transforms = [] + for Transform in chain(Reader.TRANSFORMS, Writer.TRANSFORMS): + transforms.append(Transform.generate(opts)) + reader(oeb, inpath) + for transform in transforms: + transform(oeb, context) + writer(oeb, outpath) + return 0 + +if __name__ == '__main__': + sys.exit(main()) diff --git a/src/calibre/ebooks/oeb/reader.py b/src/calibre/ebooks/oeb/reader.py new file mode 100644 index 0000000000..0fce1c2b0d --- /dev/null +++ b/src/calibre/ebooks/oeb/reader.py @@ -0,0 +1,562 @@ +""" +Container-/OPF-based input OEBBook reader. +""" +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2008, Marshall T. 
Vandegrift ' + +import sys, os, uuid, copy +from itertools import izip, chain +from urlparse import urldefrag, urlparse +from urllib import unquote as urlunquote +from mimetypes import guess_type +from collections import defaultdict +from lxml import etree +from calibre.ebooks.oeb.base import OPF1_NS, OPF2_NS, OPF2_NSMAP, DC11_NS, \ + DC_NSES, OPF +from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES, OEB_IMAGES, \ + PAGE_MAP_MIME, JPEG_MIME, NCX_MIME, SVG_MIME, BINARY_MIME +from calibre.ebooks.oeb.base import XMLDECL_RE, COLLAPSE_RE, CSSURL_RE, \ + ENTITY_RE, LINK_SELECTORS, MS_COVER_TYPE +from calibre.ebooks.oeb.base import namespace, barename, qname, XPath, xpath +from calibre.ebooks.oeb.base import urlnormalize, xml2str +from calibre.ebooks.oeb.base import OEBError, OEBBook, DirContainer +from calibre.ebooks.oeb.writer import OEBWriter +from calibre.ebooks.oeb.entitydefs import ENTITYDEFS +from calibre.ebooks.metadata.epub import CoverRenderer +from calibre.startup import get_lang +from calibre.ptempfile import TemporaryDirectory + +__all__ = ['OEBReader'] + +class OEBReader(object): + """Read an OEBPS 1.x or OPF/OPS 2.0 file collection.""" + + COVER_SVG_XP = XPath('h:body//svg:svg[position() = 1]') + COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]') + + Container = DirContainer + """Container type used to access book files. Override in sub-classes.""" + + DEFAULT_PROFILE = 'PRS505' + """Default renderer profile for content read with this Reader.""" + + TRANSFORMS = [] + """List of transforms to apply to content read with this Reader.""" + + def __init__(self): + return + + @classmethod + def config(cls, cfg): + """Add any book-reading options to the :class:`Config` object + :param:`cfg`. + """ + return + + @classmethod + def generate(cls, opts): + """Generate a Reader instance from command-line options.""" + return cls() + + def __call__(self, oeb, path): + """Read the book at :param:`path` into the :class:`OEBBook` object + :param:`oeb`.
+ """ + self.oeb = oeb + self.logger = oeb.logger + oeb.container = self.Container(path) + opf = self._read_opf() + self._all_from_opf(opf) + return oeb + + def _clean_opf(self, opf): + nsmap = {} + for elem in opf.iter(tag=etree.Element): + nsmap.update(elem.nsmap) + for elem in opf.iter(tag=etree.Element): + if namespace(elem.tag) in ('', OPF1_NS): + elem.tag = OPF(barename(elem.tag)) + nsmap.update(OPF2_NSMAP) + attrib = dict(opf.attrib) + nroot = etree.Element(OPF('package'), + nsmap={None: OPF2_NS}, attrib=attrib) + metadata = etree.SubElement(nroot, OPF('metadata'), nsmap=nsmap) + ignored = (OPF('dc-metadata'), OPF('x-metadata')) + for elem in xpath(opf, 'o2:metadata//*'): + if elem.tag in ignored: + continue + if namespace(elem.tag) in DC_NSES: + tag = barename(elem.tag).lower() + elem.tag = '{%s}%s' % (DC11_NS, tag) + metadata.append(elem) + for element in xpath(opf, 'o2:metadata//o2:meta'): + metadata.append(element) + for tag in ('o2:manifest', 'o2:spine', 'o2:tours', 'o2:guide'): + for element in xpath(opf, tag): + nroot.append(element) + return nroot + + def _read_opf(self): + data = self.oeb.container.read(None) + data = self.oeb.decode(data) + data = XMLDECL_RE.sub('', data) + try: + opf = etree.fromstring(data) + except etree.XMLSyntaxError: + repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0)) + data = ENTITY_RE.sub(repl, data) + opf = etree.fromstring(data) + self.logger.warn('OPF contains invalid HTML named entities') + ns = namespace(opf.tag) + if ns not in ('', OPF1_NS, OPF2_NS): + raise OEBError('Invalid namespace %r for OPF document' % ns) + opf = self._clean_opf(opf) + return opf + + def _metadata_from_opf(self, opf): + uid = opf.get('unique-identifier', None) + self.oeb.uid = None + metadata = self.oeb.metadata + for elem in xpath(opf, '/o2:package/o2:metadata//*'): + term = elem.tag + value = elem.text + attrib = dict(elem.attrib) + nsmap = elem.nsmap + if term == OPF('meta'): + term = qname(attrib.pop('name', None), nsmap) + value = 
attrib.pop('content', None) + if value: + value = COLLAPSE_RE.sub(' ', value.strip()) + if term and (value or attrib): + metadata.add(term, value, attrib, nsmap=nsmap) + haveuuid = haveid = False + for ident in metadata.identifier: + if unicode(ident).startswith('urn:uuid:'): + haveuuid = True + if 'id' in ident.attrib: + haveid = True + if not (haveuuid and haveid): + bookid = "urn:uuid:%s" % str(uuid.uuid4()) + metadata.add('identifier', bookid, id='calibre-uuid') + if uid is None: + self.logger.warn(u'Unique-identifier not specified') + for item in metadata.identifier: + if not item.id: + continue + if uid is None or item.id == uid: + self.oeb.uid = item + break + else: + self.logger.warn(u'Unique-identifier %r not found' % uid) + for ident in metadata.identifier: + if 'id' in ident.attrib: + self.oeb.uid = ident + break + if not metadata.language: + self.logger.warn(u'Language not specified') + metadata.add('language', get_lang()) + if not metadata.creator: + self.logger.warn('Creator not specified') + metadata.add('creator', self.oeb.translate(__('Unknown'))) + if not metadata.title: + self.logger.warn('Title not specified') + metadata.add('title', self.oeb.translate(__('Unknown'))) + + def _manifest_add_missing(self): + manifest = self.oeb.manifest + known = set(manifest.hrefs) + unchecked = set(manifest.values()) + while unchecked: + new = set() + for item in unchecked: + if (item.media_type in OEB_DOCS or + item.media_type[-4:] in ('/xml', '+xml')) and \ + item.data is not None: + hrefs = [sel(item.data) for sel in LINK_SELECTORS] + for href in chain(*hrefs): + href, _ = urldefrag(href) + if not href: + continue + href = item.abshref(urlnormalize(href)) + scheme = urlparse(href).scheme + if not scheme and href not in known: + new.add(href) + elif item.media_type in OEB_STYLES: + for match in CSSURL_RE.finditer(item.data): + href, _ = urldefrag(match.group('url')) + href = item.abshref(urlnormalize(href)) + scheme = urlparse(href).scheme +
if not scheme and href not in known: + new.add(href) + unchecked.clear() + for href in new: + known.add(href) + if not self.oeb.container.exists(href): + self.logger.warn('Referenced file %r not found' % href) + continue + self.logger.warn('Referenced file %r not in manifest' % href) + id, _ = manifest.generate(id='added') + guessed = guess_type(href)[0] + media_type = guessed or BINARY_MIME + added = manifest.add(id, href, media_type) + unchecked.add(added) + + def _manifest_from_opf(self, opf): + manifest = self.oeb.manifest + for elem in xpath(opf, '/o2:package/o2:manifest/o2:item'): + id = elem.get('id') + href = elem.get('href') + media_type = elem.get('media-type', None) + if media_type is None: + media_type = elem.get('mediatype', None) + if media_type is None or media_type == 'text/xml': + guessed = guess_type(href)[0] + media_type = guessed or media_type or BINARY_MIME + fallback = elem.get('fallback') + if href in manifest.hrefs: + self.logger.warn(u'Duplicate manifest entry for %r' % href) + continue + if not self.oeb.container.exists(href): + self.logger.warn(u'Manifest item %r not found' % href) + continue + if id in manifest.ids: + self.logger.warn(u'Duplicate manifest id %r' % id) + id, href = manifest.generate(id, href) + manifest.add(id, href, media_type, fallback) + self._manifest_add_missing() + + def _spine_add_extra(self): + manifest = self.oeb.manifest + spine = self.oeb.spine + unchecked = set(spine) + selector = XPath('h:body//h:a/@href') + extras = set() + while unchecked: + new = set() + for item in unchecked: + if item.media_type not in OEB_DOCS: + # TODO: handle fallback chains + continue + for href in selector(item.data): + href, _ = urldefrag(href) + if not href: + continue + href = item.abshref(urlnormalize(href)) + if href not in manifest.hrefs: + continue + found = manifest.hrefs[href] + if found.media_type not in OEB_DOCS or \ + found in spine or found in extras: + continue + new.add(found) + extras.update(new) + unchecked = new + 
version = int(self.oeb.version[0]) + for item in sorted(extras): + if version >= 2: + self.logger.warn( + 'Spine-referenced file %r not in spine' % item.href) + spine.add(item, linear=False) + + def _spine_from_opf(self, opf): + spine = self.oeb.spine + manifest = self.oeb.manifest + for elem in xpath(opf, '/o2:package/o2:spine/o2:itemref'): + idref = elem.get('idref') + if idref not in manifest.ids: + self.logger.warn(u'Spine item %r not found' % idref) + continue + item = manifest.ids[idref] + spine.add(item, elem.get('linear')) + if len(spine) == 0: + raise OEBError("Spine is empty") + self._spine_add_extra() + + def _guide_from_opf(self, opf): + guide = self.oeb.guide + manifest = self.oeb.manifest + for elem in xpath(opf, '/o2:package/o2:guide/o2:reference'): + href = elem.get('href') + path = urldefrag(href)[0] + if path not in manifest.hrefs: + self.logger.warn(u'Guide reference %r not found' % href) + continue + guide.add(elem.get('type'), elem.get('title'), href) + + def _find_ncx(self, opf): + result = xpath(opf, '/o2:package/o2:spine/@toc') + if result: + id = result[0] + if id not in self.oeb.manifest.ids: + return None + item = self.oeb.manifest.ids[id] + self.oeb.manifest.remove(item) + return item + for item in self.oeb.manifest.values(): + if item.media_type == NCX_MIME: + self.oeb.manifest.remove(item) + return item + return None + + def _toc_from_navpoint(self, item, toc, navpoint): + children = xpath(navpoint, 'ncx:navPoint') + for child in children: + title = ''.join(xpath(child, 'ncx:navLabel/ncx:text/text()')) + title = COLLAPSE_RE.sub(' ', title.strip()) + href = xpath(child, 'ncx:content/@src') + if not title or not href: + continue + href = item.abshref(urlnormalize(href[0])) + path, _ = urldefrag(href) + if path not in self.oeb.manifest.hrefs: + self.logger.warn('TOC reference %r not found' % href) + continue + id = child.get('id') + klass = child.get('class') + node = toc.add(title, href, id=id, klass=klass) + 
self._toc_from_navpoint(item, node, child) + + def _toc_from_ncx(self, item): + if item is None: + return False + ncx = item.data + title = ''.join(xpath(ncx, 'ncx:docTitle/ncx:text/text()')) + title = COLLAPSE_RE.sub(' ', title.strip()) + title = title or unicode(self.oeb.metadata.title[0]) + toc = self.oeb.toc + toc.title = title + navmaps = xpath(ncx, 'ncx:navMap') + for navmap in navmaps: + self._toc_from_navpoint(item, toc, navmap) + return True + + def _toc_from_tour(self, opf): + result = xpath(opf, 'o2:tours/o2:tour') + if not result: + return False + tour = result[0] + toc = self.oeb.toc + toc.title = tour.get('title') + sites = xpath(tour, 'o2:site') + for site in sites: + title = site.get('title') + href = site.get('href') + if not title or not href: + continue + path, _ = urldefrag(urlnormalize(href)) + if path not in self.oeb.manifest.hrefs: + self.logger.warn('TOC reference %r not found' % href) + continue + id = site.get('id') + toc.add(title, href, id=id) + return True + + def _toc_from_html(self, opf): + if 'toc' not in self.oeb.guide: + return False + itempath, frag = urldefrag(self.oeb.guide['toc'].href) + item = self.oeb.manifest.hrefs[itempath] + html = item.data + if frag: + elems = xpath(html, './/*[@id="%s"]' % frag) + if not elems: + elems = xpath(html, './/*[@name="%s"]' % frag) + elem = elems[0] if elems else html + while elem != html and not xpath(elem, './/h:a[@href]'): + elem = elem.getparent() + html = elem + titles = defaultdict(list) + order = [] + for anchor in xpath(html, './/h:a[@href]'): + href = anchor.attrib['href'] + href = item.abshref(urlnormalize(href)) + path, frag = urldefrag(href) + if path not in self.oeb.manifest.hrefs: + continue + title = ' '.join(xpath(anchor, './/text()')) + title = COLLAPSE_RE.sub(' ', title.strip()) + if href not in titles: + order.append(href) + titles[href].append(title) + toc = self.oeb.toc + for href in order: + toc.add(' '.join(titles[href]), href) + return True + + def 
_toc_from_spine(self, opf): + toc = self.oeb.toc + titles = [] + headers = [] + for item in self.oeb.spine: + if not item.linear: continue + html = item.data + title = ''.join(xpath(html, '/h:html/h:head/h:title/text()')) + title = COLLAPSE_RE.sub(' ', title.strip()) + if title: + titles.append(title) + headers.append('(unlabled)') + for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'): + expr = '/h:html/h:body//h:%s[position()=1]/text()' + header = ''.join(xpath(html, expr % tag)) + header = COLLAPSE_RE.sub(' ', header.strip()) + if header: + headers[-1] = header + break + use = titles + if len(titles) > len(set(titles)): + use = headers + for title, item in izip(use, self.oeb.spine): + if not item.linear: continue + toc.add(title, item.href) + return True + + def _toc_from_opf(self, opf, item): + if self._toc_from_ncx(item): return + if self._toc_from_tour(opf): return + self.logger.warn('No metadata table of contents found') + if self._toc_from_html(opf): return + self._toc_from_spine(opf) + + def _pages_from_ncx(self, opf, item): + if item is None: + return False + ncx = item.data + ptargets = xpath(ncx, 'ncx:pageList/ncx:pageTarget') + if not ptargets: + return False + pages = self.oeb.pages + for ptarget in ptargets: + name = ''.join(xpath(ptarget, 'ncx:navLabel/ncx:text/text()')) + name = COLLAPSE_RE.sub(' ', name.strip()) + href = xpath(ptarget, 'ncx:content/@src') + if not href: + continue + href = item.abshref(urlnormalize(href[0])) + id = ptarget.get('id') + type = ptarget.get('type', 'normal') + klass = ptarget.get('class') + pages.add(name, href, type=type, id=id, klass=klass) + return True + + def _find_page_map(self, opf): + result = xpath(opf, '/o2:package/o2:spine/@page-map') + if result: + id = result[0] + if id not in self.oeb.manifest.ids: + return None + item = self.oeb.manifest.ids[id] + self.oeb.manifest.remove(item) + return item + for item in self.oeb.manifest.values(): + if item.media_type == PAGE_MAP_MIME: + self.oeb.manifest.remove(item) 
+ return item + return None + + def _pages_from_page_map(self, opf): + item = self._find_page_map(opf) + if item is None: + return False + pmap = item.data + pages = self.oeb.pages + for page in xpath(pmap, 'o2:page'): + name = page.get('name', '') + href = page.get('href') + if not href: + continue + name = COLLAPSE_RE.sub(' ', name.strip()) + href = item.abshref(urlnormalize(href)) + type = 'normal' + if not name: + type = 'special' + elif name.lower().strip('ivxlcdm') == '': + type = 'front' + pages.add(name, href, type=type) + return True + + def _pages_from_opf(self, opf, item): + if self._pages_from_ncx(opf, item): return + if self._pages_from_page_map(opf): return + return + + def _cover_from_html(self, hcover): + with TemporaryDirectory('_html_cover') as tdir: + writer = OEBWriter() + writer(self.oeb, tdir) + path = os.path.join(tdir, urlunquote(hcover.href)) + renderer = CoverRenderer(path) + data = renderer.image_data + id, href = self.oeb.manifest.generate('cover', 'cover.jpeg') + item = self.oeb.manifest.add(id, href, JPEG_MIME, data=data) + return item + + def _locate_cover_image(self): + if self.oeb.metadata.cover: + id = str(self.oeb.metadata.cover[0]) + item = self.oeb.manifest.ids.get(id, None) + if item is not None and item.media_type in OEB_IMAGES: + return item + else: + self.logger.warn('Invalid cover image @id %r' % id) + hcover = self.oeb.spine[0] + if 'cover' in self.oeb.guide: + href = self.oeb.guide['cover'].href + item = self.oeb.manifest.hrefs[href] + media_type = item.media_type + if media_type in OEB_IMAGES: + return item + elif media_type in OEB_DOCS: + hcover = item + html = hcover.data + if MS_COVER_TYPE in self.oeb.guide: + href = self.oeb.guide[MS_COVER_TYPE].href + item = self.oeb.manifest.hrefs.get(href, None) + if item is not None and item.media_type in OEB_IMAGES: + return item + if self.COVER_SVG_XP(html): + svg = copy.deepcopy(self.COVER_SVG_XP(html)[0]) + href = os.path.splitext(hcover.href)[0] + '.svg' + id, href = 
self.oeb.manifest.generate(hcover.id, href) + item = self.oeb.manifest.add(id, href, SVG_MIME, data=svg) + return item + if self.COVER_OBJECT_XP(html): + object = self.COVER_OBJECT_XP(html)[0] + href = hcover.abshref(object.get('data')) + item = self.oeb.manifest.hrefs.get(href, None) + if item is not None and item.media_type in OEB_IMAGES: + return item + return self._cover_from_html(hcover) + + def _ensure_cover_image(self): + cover = self._locate_cover_image() + if self.oeb.metadata.cover: + self.oeb.metadata.cover[0].value = cover.id + return + self.oeb.metadata.add('cover', cover.id) + + def _all_from_opf(self, opf): + self.oeb.version = opf.get('version', '1.2') + self._metadata_from_opf(opf) + self._manifest_from_opf(opf) + self._spine_from_opf(opf) + self._guide_from_opf(opf) + item = self._find_ncx(opf) + self._toc_from_opf(opf, item) + self._pages_from_opf(opf, item) + self._ensure_cover_image() + + +def main(argv=sys.argv): + reader = OEBReader() + for arg in argv[1:]: + oeb = reader(OEBBook(), arg) + for name, doc in oeb.to_opf1().values(): + print etree.tostring(doc, pretty_print=True) + for name, doc in oeb.to_opf2(page_map=True).values(): + print etree.tostring(doc, pretty_print=True) + return 0 + +if __name__ == '__main__': + sys.exit(main()) diff --git a/src/calibre/ebooks/oeb/transforms/flatcss.py b/src/calibre/ebooks/oeb/transforms/flatcss.py index 01afcb08e2..ac9684a624 100644 --- a/src/calibre/ebooks/oeb/transforms/flatcss.py +++ b/src/calibre/ebooks/oeb/transforms/flatcss.py @@ -94,7 +94,15 @@ class CSSFlattener(object): self.unfloat = unfloat self.untable = untable - def transform(self, oeb, context): + @classmethod + def config(cls, cfg): + return cfg + + @classmethod + def generate(cls, opts): + return cls() + + def __call__(self, oeb, context): oeb.logger.info('Flattening CSS and remapping font sizes...') self.oeb = oeb self.context = context diff --git a/src/calibre/ebooks/oeb/transforms/htmltoc.py 
b/src/calibre/ebooks/oeb/transforms/htmltoc.py index 5508b58ec3..0040f39c14 100644 --- a/src/calibre/ebooks/oeb/transforms/htmltoc.py +++ b/src/calibre/ebooks/oeb/transforms/htmltoc.py @@ -52,7 +52,18 @@ class HTMLTOCAdder(object): self.title = title self.style = style - def transform(self, oeb, context): + @classmethod + def config(cls, cfg): + group = cfg.add_group('htmltoc', _('HTML TOC generation options.')) + group('toc_title', ['--toc-title'], default=None, + help=_('Title for any generated in-line table of contents.')) + return cfg + + @classmethod + def generate(cls, opts): + return cls(title=opts.toc_title) + + def __call__(self, oeb, context): if 'toc' in oeb.guide: return oeb.logger.info('Generating in-line TOC...') diff --git a/src/calibre/ebooks/oeb/transforms/manglecase.py b/src/calibre/ebooks/oeb/transforms/manglecase.py index 3a3d91364f..c819475a4d 100644 --- a/src/calibre/ebooks/oeb/transforms/manglecase.py +++ b/src/calibre/ebooks/oeb/transforms/manglecase.py @@ -29,7 +29,15 @@ CASE_MANGLER_CSS = """ TEXT_TRANSFORMS = set(['capitalize', 'uppercase', 'lowercase']) class CaseMangler(object): - def transform(self, oeb, context): + @classmethod + def config(cls, cfg): + return cfg + + @classmethod + def generate(cls, opts): + return cls() + + def __call__(self, oeb, context): oeb.logger.info('Applying case-transforming CSS...') self.oeb = oeb self.profile = context.source diff --git a/src/calibre/ebooks/oeb/transforms/rasterize.py b/src/calibre/ebooks/oeb/transforms/rasterize.py index 12a2812898..aef5c2c98b 100644 --- a/src/calibre/ebooks/oeb/transforms/rasterize.py +++ b/src/calibre/ebooks/oeb/transforms/rasterize.py @@ -34,7 +34,15 @@ class SVGRasterizer(object): if QApplication.instance() is None: QApplication([]) - def transform(self, oeb, context): + @classmethod + def config(cls, cfg): + return cfg + + @classmethod + def generate(cls, opts): + return cls() + + def __call__(self, oeb, context): oeb.logger.info('Rasterizing SVG images...') 
self.oeb = oeb self.profile = context.dest diff --git a/src/calibre/ebooks/oeb/transforms/trimmanifest.py b/src/calibre/ebooks/oeb/transforms/trimmanifest.py index c2ff5ab671..c731800999 100644 --- a/src/calibre/ebooks/oeb/transforms/trimmanifest.py +++ b/src/calibre/ebooks/oeb/transforms/trimmanifest.py @@ -13,7 +13,15 @@ from calibre.ebooks.oeb.base import LINK_SELECTORS, CSSURL_RE from calibre.ebooks.oeb.base import urlnormalize class ManifestTrimmer(object): - def transform(self, oeb, context): + @classmethod + def config(cls, cfg): + return cfg + + @classmethod + def generate(cls, opts): + return cls() + + def __call__(self, oeb, context): oeb.logger.info('Trimming unused files from manifest...') used = set() hrefs = oeb.manifest.hrefs diff --git a/src/calibre/ebooks/oeb/writer.py b/src/calibre/ebooks/oeb/writer.py new file mode 100644 index 0000000000..235965b50f --- /dev/null +++ b/src/calibre/ebooks/oeb/writer.py @@ -0,0 +1,75 @@ +''' +Directory output OEBBook writer. +''' +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2008, Marshall T. Vandegrift ' + +import sys, os, logging +from calibre.ebooks.oeb.base import OPF_MIME, OEBError, xml2str +from calibre.ebooks.oeb.base import Logger, DirContainer, OEBBook + +__all__ = ['OEBWriter'] + +class OEBWriter(object): + DEFAULT_PROFILE = 'PRS505' + """Default renderer profile for content written with this Writer.""" + + TRANSFORMS = [] + """List of transforms to apply to content written with this Writer.""" + + def __init__(self, version='2.0', page_map=False, pretty_print=False): + self.version = version + self.page_map = page_map + self.pretty_print = pretty_print + + @classmethod + def config(cls, cfg): + """Add any book-writing options to the :class:`Config` object + :param:`cfg`. + """ + oeb = cfg.add_group('oeb', _('OPF/NCX/etc. generation options.')) + versions = ['1.2', '2.0'] + oeb('opf_version', ['--opf-version'], default='2.0', choices=versions, + help=_('OPF version to generate.
Default is %default.')) + oeb('adobe_page_map', ['--adobe-page-map'], default=False, + help=_('Generate an Adobe "page-map" file if pagination ' + 'information is available.')) + return cfg + + @classmethod + def generate(cls, opts): + """Generate a Writer instance from command-line options.""" + version = opts.opf_version + page_map = opts.adobe_page_map + pretty_print = opts.pretty_print + return cls(version=version, page_map=page_map, + pretty_print=pretty_print) + + def __call__(self, oeb, path): + """Write the book in the :class:`OEBBook` object :param:`oeb` to a file + at :param:`path`. + """ + version = int(self.version[0]) + opfname = None + if os.path.splitext(path)[1].lower() == '.opf': + opfname = os.path.basename(path) + path = os.path.dirname(path) + if not os.path.isdir(path): + os.mkdir(path) + output = DirContainer(path) + for item in oeb.manifest.values(): + output.write(item.href, str(item)) + if version == 1: + metadata = oeb.to_opf1() + elif version == 2: + metadata = oeb.to_opf2(page_map=self.page_map) + else: + raise OEBError("Unrecognized OPF version %r" % self.version) + pretty_print = self.pretty_print + for mime, (href, data) in metadata.items(): + if opfname and mime == OPF_MIME: + href = opfname + output.write(href, xml2str(data, pretty_print=pretty_print)) + return