From bd296fa43c8d7338b65af1c5ca7cfb02fc9c6daf Mon Sep 17 00:00:00 2001 From: "Marshall T. Vandegrift" Date: Sat, 17 Jan 2009 11:18:14 -0500 Subject: [PATCH 1/7] Restore LitReader refactoring (again) --- src/calibre/ebooks/lit/reader.py | 363 +++++++++++++++++-------------- 1 file changed, 201 insertions(+), 162 deletions(-) diff --git a/src/calibre/ebooks/lit/reader.py b/src/calibre/ebooks/lit/reader.py index 461c067382..0e7f9a1ccf 100644 --- a/src/calibre/ebooks/lit/reader.py +++ b/src/calibre/ebooks/lit/reader.py @@ -7,20 +7,24 @@ __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ' \ 'and Marshall T. Vandegrift ' -import sys, struct, cStringIO, os +import sys, struct, os import functools import re from urlparse import urldefrag +from cStringIO import StringIO +from urllib import unquote as urlunquote from lxml import etree from calibre.ebooks.lit import LitError from calibre.ebooks.lit.maps import OPF_MAP, HTML_MAP import calibre.ebooks.lit.mssha1 as mssha1 -from calibre.ebooks.oeb.base import urlnormalize +from calibre.ebooks.oeb.base import XML_PARSER, urlnormalize from calibre.ebooks import DRMError from calibre import plugins lzx, lxzerror = plugins['lzx'] msdes, msdeserror = plugins['msdes'] +__all__ = ["LitReader"] + XML_DECL = """ """ OPF_DECL = """ @@ -108,6 +112,9 @@ def consume_sized_utf8_string(bytes, zpad=False): pos += 1 return u''.join(result), bytes[pos:] +def encode(string): + return unicode(string).encode('ascii', 'xmlcharrefreplace') + class UnBinary(object): AMPERSAND_RE = re.compile( r'&(?!(?:#[0-9]+|#x[0-9a-fA-F]+|[a-zA-Z_:][a-zA-Z0-9.-_:]+);)') @@ -118,13 +125,13 @@ class UnBinary(object): def __init__(self, bin, path, manifest={}, map=HTML_MAP): self.manifest = manifest self.tag_map, self.attr_map, self.tag_to_attr_map = map - self.opf = map is OPF_MAP - self.bin = bin + self.is_html = map is HTML_MAP self.dir = os.path.dirname(path) - self.buf = cStringIO.StringIO() - self.binary_to_text() - self.raw = 
self.buf.getvalue().lstrip().decode('utf-8') + buf = StringIO() + self.binary_to_text(bin, buf) + self.raw = buf.getvalue().lstrip() self.escape_reserved() + self._tree = None def escape_reserved(self): raw = self.raw @@ -151,18 +158,28 @@ class UnBinary(object): return '/'.join(relpath) def __unicode__(self): + return self.raw.decode('utf-8') + + def __str__(self): return self.raw + + def tree(): + def fget(self): + if not self._tree: + self._tree = etree.fromstring(self.raw, parser=XML_PARSER) + return self._tree + return property(fget=fget) + tree = tree() - def binary_to_text(self, base=0, depth=0): + def binary_to_text(self, bin, buf, index=0, depth=0): tag_name = current_map = None dynamic_tag = errors = 0 in_censorship = is_goingdown = False state = 'text' - index = base flags = 0 - while index < len(self.bin): - c, index = read_utf8_char(self.bin, index) + while index < len(bin): + c, index = read_utf8_char(bin, index) oc = ord(c) if state == 'text': @@ -175,7 +192,7 @@ class UnBinary(object): c = '>>' elif c == '<': c = '<<' - self.buf.write(c.encode('ascii', 'xmlcharrefreplace')) + buf.write(encode(c)) elif state == 'get flags': if oc == 0: @@ -188,7 +205,7 @@ class UnBinary(object): state = 'text' if oc == 0 else 'get attr' if flags & FLAG_OPENING: tag = oc - self.buf.write('<') + buf.write('<') if not (flags & FLAG_CLOSING): is_goingdown = True if tag == 0x8000: @@ -205,7 +222,7 @@ class UnBinary(object): tag_name = '?'+unichr(tag)+'?' 
current_map = self.tag_to_attr_map[tag] print 'WARNING: tag %s unknown' % unichr(tag) - self.buf.write(unicode(tag_name).encode('utf-8')) + buf.write(encode(tag_name)) elif flags & FLAG_CLOSING: if depth == 0: raise LitError('Extra closing tag') @@ -217,15 +234,14 @@ class UnBinary(object): if not is_goingdown: tag_name = None dynamic_tag = 0 - self.buf.write(' />') + buf.write(' />') else: - self.buf.write('>') - index = self.binary_to_text(base=index, depth=depth+1) + buf.write('>') + index = self.binary_to_text(bin, buf, index, depth+1) is_goingdown = False if not tag_name: raise LitError('Tag ends before it begins.') - self.buf.write(u''.join( - ('')).encode('utf-8')) + buf.write(encode(u''.join(('')))) dynamic_tag = 0 tag_name = None state = 'text' @@ -245,7 +261,7 @@ class UnBinary(object): in_censorship = True state = 'get value length' continue - self.buf.write(' ' + unicode(attr).encode('utf-8') + '=') + buf.write(' ' + encode(attr) + '=') if attr in ['href', 'src']: state = 'get href length' else: @@ -253,40 +269,39 @@ class UnBinary(object): elif state == 'get value length': if not in_censorship: - self.buf.write('"') + buf.write('"') count = oc - 1 if count == 0: if not in_censorship: - self.buf.write('"') + buf.write('"') in_censorship = False state = 'get attr' continue state = 'get value' if oc == 0xffff: continue - if count < 0 or count > (len(self.bin) - index): + if count < 0 or count > (len(bin) - index): raise LitError('Invalid character count %d' % count) elif state == 'get value': if count == 0xfffe: if not in_censorship: - self.buf.write('%s"' % (oc - 1)) + buf.write('%s"' % (oc - 1)) in_censorship = False state = 'get attr' elif count > 0: if not in_censorship: - self.buf.write(c.encode( - 'ascii', 'xmlcharrefreplace')) + buf.write(encode(c)) count -= 1 if count == 0: if not in_censorship: - self.buf.write('"') + buf.write('"') in_censorship = False state = 'get attr' elif state == 'get custom length': count = oc - 1 - if count <= 0 or count 
> len(self.bin)-index: + if count <= 0 or count > len(bin)-index: raise LitError('Invalid character count %d' % count) dynamic_tag += 1 state = 'get custom' @@ -296,26 +311,26 @@ class UnBinary(object): tag_name += c count -= 1 if count == 0: - self.buf.write(unicode(tag_name).encode('utf-8')) + buf.write(encode(tag_name)) state = 'get attr' elif state == 'get attr length': count = oc - 1 - if count <= 0 or count > (len(self.bin) - index): + if count <= 0 or count > (len(bin) - index): raise LitError('Invalid character count %d' % count) - self.buf.write(' ') + buf.write(' ') state = 'get custom attr' elif state == 'get custom attr': - self.buf.write(unicode(c).encode('utf-8')) + buf.write(encode(c)) count -= 1 if count == 0: - self.buf.write('=') + buf.write('=') state = 'get value length' elif state == 'get href length': count = oc - 1 - if count <= 0 or count > (len(self.bin) - index): + if count <= 0 or count > (len(bin) - index): raise LitError('Invalid character count %d' % count) href = '' state = 'get href' @@ -329,10 +344,11 @@ class UnBinary(object): if frag: path = '#'.join((path, frag)) path = urlnormalize(path) - self.buf.write((u'"%s"' % path).encode('utf-8')) + buf.write(encode(u'"%s"' % path)) state = 'get attr' return index + class DirectoryEntry(object): def __init__(self, name, section, offset, size): self.name = name @@ -347,6 +363,7 @@ class DirectoryEntry(object): def __str__(self): return repr(self) + class ManifestItem(object): def __init__(self, original, internal, mime_type, offset, root, state): self.original = original @@ -374,65 +391,87 @@ class ManifestItem(object): % (self.internal, self.path, self.mime_type, self.offset, self.root, self.state) + def preserve(function): def wrapper(self, *args, **kwargs): - opos = self._stream.tell() + opos = self.stream.tell() try: return function(self, *args, **kwargs) finally: - self._stream.seek(opos) + self.stream.seek(opos) functools.update_wrapper(wrapper, function) return wrapper -class 
LitReader(object): +class LitFile(object): PIECE_SIZE = 16 - XML_PARSER = etree.XMLParser( - recover=True, resolve_entities=False) + + def __init__(self, filename_or_stream): + if hasattr(filename_or_stream, 'read'): + self.stream = filename_or_stream + else: + self.stream = open(filename_or_stream, 'rb') + try: + self.opf_path = os.path.splitext( + os.path.basename(self.stream.name))[0] + '.opf' + except AttributeError: + self.opf_path = 'content.opf' + if self.magic != 'ITOLITLS': + raise LitError('Not a valid LIT file') + if self.version != 1: + raise LitError('Unknown LIT version %d' % (self.version,)) + self.read_secondary_header() + self.read_header_pieces() + self.read_section_names() + self.read_manifest() + self.read_drm() + + def warn(self, msg): + print "WARNING: %s" % (msg,) def magic(): @preserve def fget(self): - self._stream.seek(0) - return self._stream.read(8) + self.stream.seek(0) + return self.stream.read(8) return property(fget=fget) magic = magic() def version(): def fget(self): - self._stream.seek(8) - return u32(self._stream.read(4)) + self.stream.seek(8) + return u32(self.stream.read(4)) return property(fget=fget) version = version() def hdr_len(): @preserve def fget(self): - self._stream.seek(12) - return int32(self._stream.read(4)) + self.stream.seek(12) + return int32(self.stream.read(4)) return property(fget=fget) hdr_len = hdr_len() def num_pieces(): @preserve def fget(self): - self._stream.seek(16) - return int32(self._stream.read(4)) + self.stream.seek(16) + return int32(self.stream.read(4)) return property(fget=fget) num_pieces = num_pieces() def sec_hdr_len(): @preserve def fget(self): - self._stream.seek(20) - return int32(self._stream.read(4)) + self.stream.seek(20) + return int32(self.stream.read(4)) return property(fget=fget) sec_hdr_len = sec_hdr_len() def guid(): @preserve def fget(self): - self._stream.seek(24) - return self._stream.read(16) + self.stream.seek(24) + return self.stream.read(16) return property(fget=fget) guid 
= guid() @@ -442,44 +481,27 @@ class LitReader(object): size = self.hdr_len \ + (self.num_pieces * self.PIECE_SIZE) \ + self.sec_hdr_len - self._stream.seek(0) - return self._stream.read(size) + self.stream.seek(0) + return self.stream.read(size) return property(fget=fget) header = header() - def __init__(self, filename_or_stream): - if hasattr(filename_or_stream, 'read'): - self._stream = filename_or_stream - else: - self._stream = open(filename_or_stream, 'rb') - if self.magic != 'ITOLITLS': - raise LitError('Not a valid LIT file') - if self.version != 1: - raise LitError('Unknown LIT version %d' % (self.version,)) - self.entries = {} - self._read_secondary_header() - self._read_header_pieces() - self._read_section_names() - self._read_manifest() - self._read_meta() - self._read_drm() - @preserve def __len__(self): - self._stream.seek(0, 2) - return self._stream.tell() + self.stream.seek(0, 2) + return self.stream.tell() @preserve - def _read_raw(self, offset, size): - self._stream.seek(offset) - return self._stream.read(size) + def read_raw(self, offset, size): + self.stream.seek(offset) + return self.stream.read(size) - def _read_content(self, offset, size): - return self._read_raw(self.content_offset + offset, size) + def read_content(self, offset, size): + return self.read_raw(self.content_offset + offset, size) - def _read_secondary_header(self): + def read_secondary_header(self): offset = self.hdr_len + (self.num_pieces * self.PIECE_SIZE) - bytes = self._read_raw(offset, self.sec_hdr_len) + bytes = self.read_raw(offset, self.sec_hdr_len) offset = int32(bytes[4:]) while offset < len(bytes): blocktype = bytes[offset:offset+4] @@ -507,21 +529,21 @@ class LitReader(object): if not hasattr(self, 'content_offset'): raise LitError('Could not figure out the content offset') - def _read_header_pieces(self): + def read_header_pieces(self): src = self.header[self.hdr_len:] for i in xrange(self.num_pieces): piece = src[i * self.PIECE_SIZE:(i + 1) * self.PIECE_SIZE] if 
u32(piece[4:]) != 0 or u32(piece[12:]) != 0: raise LitError('Piece %s has 64bit value' % repr(piece)) offset, size = u32(piece), int32(piece[8:]) - piece = self._read_raw(offset, size) + piece = self.read_raw(offset, size) if i == 0: continue # Dont need this piece elif i == 1: if u32(piece[8:]) != self.entry_chunklen or \ u32(piece[12:]) != self.entry_unknown: raise LitError('Secondary header does not match piece') - self._read_directory(piece) + self.read_directory(piece) elif i == 2: if u32(piece[8:]) != self.count_chunklen or \ u32(piece[12:]) != self.count_unknown: @@ -532,12 +554,13 @@ class LitReader(object): elif i == 4: self.piece4_guid = piece - def _read_directory(self, piece): + def read_directory(self, piece): if not piece.startswith('IFCM'): raise LitError('Header piece #1 is not main directory.') chunk_size, num_chunks = int32(piece[8:12]), int32(piece[24:28]) if (32 + (num_chunks * chunk_size)) != len(piece): - raise LitError('IFCM HEADER has incorrect length') + raise LitError('IFCM header has incorrect length') + self.entries = {} for i in xrange(num_chunks): offset = 32 + (i * chunk_size) chunk = piece[offset:offset + chunk_size] @@ -571,17 +594,17 @@ class LitReader(object): entry = DirectoryEntry(name, section, offset, size) self.entries[name] = entry - def _read_section_names(self): + def read_section_names(self): if '::DataSpace/NameList' not in self.entries: raise LitError('Lit file does not have a valid NameList') raw = self.get_file('::DataSpace/NameList') if len(raw) < 4: raise LitError('Invalid Namelist section') pos = 4 - self.num_sections = u16(raw[2:pos]) - self.section_names = [""]*self.num_sections - self.section_data = [None]*self.num_sections - for section in xrange(self.num_sections): + num_sections = u16(raw[2:pos]) + self.section_names = [""] * num_sections + self.section_data = [None] * num_sections + for section in xrange(num_sections): size = u16(raw[pos:pos+2]) pos += 2 size = size*2 + 2 @@ -591,11 +614,12 @@ class 
LitReader(object): raw[pos:pos+size].decode('utf-16-le').rstrip('\000') pos += size - def _read_manifest(self): + def read_manifest(self): if '/manifest' not in self.entries: raise LitError('Lit file does not have a valid manifest') raw = self.get_file('/manifest') self.manifest = {} + self.paths = {self.opf_path: None} while raw: slen, raw = ord(raw[0]), raw[1:] if slen == 0: break @@ -634,28 +658,9 @@ class LitReader(object): for item in mlist: if item.path[0] == '/': item.path = os.path.basename(item.path) + self.paths[item.path] = item - def _pretty_print(self, xml): - f = cStringIO.StringIO(xml.encode('utf-8')) - doc = etree.parse(f, parser=self.XML_PARSER) - pretty = etree.tostring(doc, encoding='ascii', pretty_print=True) - return XML_DECL + unicode(pretty) - - def _read_meta(self): - path = 'content.opf' - raw = self.get_file('/meta') - xml = OPF_DECL - try: - xml += unicode(UnBinary(raw, path, self.manifest, OPF_MAP)) - except LitError: - if 'PENGUIN group' not in raw: raise - print "WARNING: attempting PENGUIN malformed OPF fix" - raw = raw.replace( - 'PENGUIN group', '\x00\x01\x18\x00PENGUIN group', 1) - xml += unicode(UnBinary(raw, path, self.manifest, OPF_MAP)) - self.meta = xml - - def _read_drm(self): + def read_drm(self): self.drmlevel = 0 if '/DRMStorage/Licenses/EUL' in self.entries: self.drmlevel = 5 @@ -666,7 +671,7 @@ class LitReader(object): else: return if self.drmlevel < 5: - msdes.deskey(self._calculate_deskey(), msdes.DE1) + msdes.deskey(self.calculate_deskey(), msdes.DE1) bookkey = msdes.des(self.get_file('/DRMStorage/DRMSealed')) if bookkey[0] != '\000': raise LitError('Unable to decrypt title key!') @@ -674,7 +679,7 @@ class LitReader(object): else: raise DRMError("Cannot access DRM-protected book") - def _calculate_deskey(self): + def calculate_deskey(self): hashfiles = ['/meta', '/DRMStorage/DRMSource'] if self.drmlevel == 3: hashfiles.append('/DRMStorage/DRMBookplate') @@ -698,18 +703,18 @@ class LitReader(object): def get_file(self, 
name): entry = self.entries[name] if entry.section == 0: - return self._read_content(entry.offset, entry.size) + return self.read_content(entry.offset, entry.size) section = self.get_section(entry.section) return section[entry.offset:entry.offset+entry.size] def get_section(self, section): data = self.section_data[section] if not data: - data = self._get_section(section) + data = self.get_section_uncached(section) self.section_data[section] = data return data - def _get_section(self, section): + def get_section_uncached(self, section): name = self.section_names[section] path = '::DataSpace/Storage/' + name transform = self.get_file(path + '/Transform/List') @@ -721,29 +726,29 @@ class LitReader(object): raise LitError("ControlData is too short") guid = msguid(transform) if guid == DESENCRYPT_GUID: - content = self._decrypt(content) + content = self.decrypt(content) control = control[csize:] elif guid == LZXCOMPRESS_GUID: reset_table = self.get_file( '/'.join(('::DataSpace/Storage', name, 'Transform', LZXCOMPRESS_GUID, 'InstanceData/ResetTable'))) - content = self._decompress(content, control, reset_table) + content = self.decompress(content, control, reset_table) control = control[csize:] else: raise LitError("Unrecognized transform: %s." 
% repr(guid)) transform = transform[16:] return content - def _decrypt(self, content): + def decrypt(self, content): length = len(content) extra = length & 0x7 if extra > 0: - self._warn("content length not a multiple of block size") + self.warn("content length not a multiple of block size") content += "\0" * (8 - extra) msdes.deskey(self.bookkey, msdes.DE1) return msdes.des(content) - def _decompress(self, content, control, reset_table): + def decompress(self, content, control, reset_table): if len(control) < 32 or control[CONTROL_TAG:CONTROL_TAG+4] != "LZXC": raise LitError("Invalid ControlData tag value") if len(reset_table) < (RESET_INTERVAL + 8): @@ -784,7 +789,7 @@ class LitReader(object): result.append( lzx.decompress(content[base:size], window_bytes)) except lzx.LZXError: - self._warn("LZX decompression error; skipping chunk") + self.warn("LZX decompression error; skipping chunk") bytes_remaining -= window_bytes base = size accum += int32(reset_table[RESET_INTERVAL:]) @@ -794,55 +799,88 @@ class LitReader(object): try: result.append(lzx.decompress(content[base:], bytes_remaining)) except lzx.LZXError: - self._warn("LZX decompression error; skipping chunk") + self.warn("LZX decompression error; skipping chunk") bytes_remaining = 0 if bytes_remaining > 0: raise LitError("Failed to completely decompress section") return ''.join(result) - def get_entry_content(self, entry, pretty_print=False): - if 'spine' in entry.state: - name = '/'.join(('/data', entry.internal, 'content')) - path = entry.path - raw = self.get_file(name) - decl, map = (OPF_DECL, OPF_MAP) \ - if name == '/meta' else (HTML_DECL, HTML_MAP) - content = decl + unicode(UnBinary(raw, path, self.manifest, map)) - if pretty_print: - content = self._pretty_print(content) - content = content.encode('utf-8') - else: - name = '/'.join(('/data', entry.internal)) - content = self.get_file(name) - return content - - def extract_content(self, output_dir=os.getcwdu(), pretty_print=False): - output_dir = 
os.path.abspath(output_dir) - try: - opf_path = os.path.splitext( - os.path.basename(self._stream.name))[0] + '.opf' - except AttributeError: - opf_path = 'content.opf' - opf_path = os.path.join(output_dir, opf_path) - self._ensure_dir(opf_path) - with open(opf_path, 'wb') as f: - xml = self.meta - if pretty_print: - xml = self._pretty_print(xml) - f.write(xml.encode('utf-8')) - for entry in self.manifest.values(): - path = os.path.join(output_dir, entry.path) - self._ensure_dir(path) - with open(path, 'wb') as f: - f.write(self.get_entry_content(entry, pretty_print)) +class LitReader(object): + def __init__(self, filename_or_stream): + self._litfile = LitFile(filename_or_stream) + + def namelist(self): + return self._litfile.paths.keys() + + def exists(self, name): + return urlunquote(name) in self._litfile.paths + + def read_xml(self, name): + entry = self._litfile.paths[urlunquote(name)] if name else None + if entry is None: + content = self._read_meta() + elif 'spine' in entry.state: + internal = '/'.join(('/data', entry.internal, 'content')) + raw = self._litfile.get_file(internal) + unbin = UnBinary(raw, name, self._litfile.manifest, HTML_MAP) + content = unbin.tree + else: + raise LitError('Requested non-XML content as XML') + return content + + def read(self, name, pretty_print=False): + entry = self._litfile.paths[urlunquote(name)] if name else None + if entry is None: + meta = self._read_meta() + content = OPF_DECL + etree.tostring( + meta, encoding='ascii', pretty_print=pretty_print) + elif 'spine' in entry.state: + internal = '/'.join(('/data', entry.internal, 'content')) + raw = self._litfile.get_file(internal) + unbin = UnBinary(raw, name, self._litfile.manifest, HTML_MAP) + content = HTML_DECL + if pretty_print: + content += etree.tostring(unbin.tree, + encoding='ascii', pretty_print=True) + else: + content += str(unbin) + else: + internal = '/'.join(('/data', entry.internal)) + content = self._litfile.get_file(internal) + return content + + def 
meta(): + def fget(self): + return self.read(self._litfile.opf_path) + return property(fget=fget) + meta = meta() + def _ensure_dir(self, path): dir = os.path.dirname(path) if not os.path.isdir(dir): os.makedirs(dir) + + def extract_content(self, output_dir=os.getcwdu(), pretty_print=False): + for name in self.namelist(): + path = os.path.join(output_dir, name) + self._ensure_dir(path) + with open(path, 'wb') as f: + f.write(self.read(name, pretty_print=pretty_print)) + + def _read_meta(self): + path = 'content.opf' + raw = self._litfile.get_file('/meta') + try: + unbin = UnBinary(raw, path, self._litfile.manifest, OPF_MAP) + except LitError: + if 'PENGUIN group' not in raw: raise + print "WARNING: attempting PENGUIN malformed OPF fix" + raw = raw.replace( + 'PENGUIN group', '\x00\x01\x18\x00PENGUIN group', 1) + unbin = UnBinary(raw, path, self._litfile.manifest, OPF_MAP) + return unbin.tree - def _warn(self, msg): - print "WARNING: %s" % (msg,) def option_parser(): from calibre.utils.config import OptionParser @@ -852,7 +890,8 @@ def option_parser(): help=_('Output directory. Defaults to current directory.')) parser.add_option( '-p', '--pretty-print', default=False, action='store_true', - help=_('Legibly format extracted markup. May modify meaningful whitespace.')) + help=_('Legibly format extracted markup.' \ + ' May modify meaningful whitespace.')) parser.add_option( '--verbose', default=False, action='store_true', help=_('Useful for debugging.')) From cba3bb55e4108842d9e10ff5d9cc75e2f15b0361 Mon Sep 17 00:00:00 2001 From: "Marshall T. 
Vandegrift" Date: Sat, 17 Jan 2009 14:43:16 -0500 Subject: [PATCH 2/7] Minor clean-ups to CSS flattening --- src/calibre/ebooks/oeb/transforms/flatcss.py | 31 ++++++++------------ 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/src/calibre/ebooks/oeb/transforms/flatcss.py b/src/calibre/ebooks/oeb/transforms/flatcss.py index 7110c2db2d..375003c1a5 100644 --- a/src/calibre/ebooks/oeb/transforms/flatcss.py +++ b/src/calibre/ebooks/oeb/transforms/flatcss.py @@ -144,7 +144,8 @@ class CSSFlattener(object): value = round(value / slineh) * dlineh cssdict[property] = "%0.5fem" % (value / fsize) - def flatten_node(self, node, stylizer, names, styles, psize, left=0): + def flatten_node(self, node, stylizer, names, styles, psize, left=0, + valigned=False): if not isinstance(node.tag, basestring) \ or namespace(node.tag) != XHTML_NS: return @@ -154,18 +155,6 @@ class CSSFlattener(object): if 'align' in node.attrib: cssdict['text-align'] = node.attrib['align'] del node.attrib['align'] - if node.tag == XHTML('font'): - node.tag = XHTML('span') - if 'size' in node.attrib: - size = node.attrib['size'].strip() - if size: - fnums = self.context.source.fnums - if size[0] in ('+', '-'): - # Oh, the warcrimes - cssdict['font-size'] = fnums[3+int(size)] - else: - cssdict['font-size'] = fnums[int(size)] - del node.attrib['size'] if 'color' in node.attrib: cssdict['color'] = node.attrib['color'] del node.attrib['color'] @@ -173,7 +162,7 @@ class CSSFlattener(object): cssdict['background-color'] = node.attrib['bgcolor'] del node.attrib['bgcolor'] if cssdict: - if 'font-size' in cssdict: + if 'font-size' in cssdict or tag == 'body': fsize = self.fmap[style['font-size']] cssdict['font-size'] = "%0.5fem" % (fsize / psize) psize = fsize @@ -197,10 +186,13 @@ class CSSFlattener(object): cssdict['display'] = 'inline' else: cssdict['display'] = 'block' - if 'vertical-align' in cssdict \ - and cssdict['vertical-align'] == 'sup': - cssdict['vertical-align'] = 'super' - if self.lineh 
and 'line-height' not in cssdict: + if 'vertical-align' in cssdict: + if cssdict['vertical-align'] == 'sup': + cssdict['vertical-align'] = 'text-top' + if style['vertical-align'] != 'baseline': + cssdict['line-height'] = '0' + valigned = True + if self.lineh and 'line-height' not in cssdict and not valigned: lineh = self.lineh / psize cssdict['line-height'] = "%0.5fem" % lineh if cssdict: @@ -220,7 +212,8 @@ class CSSFlattener(object): if 'style' in node.attrib: del node.attrib['style'] for child in node: - self.flatten_node(child, stylizer, names, styles, psize, left) + self.flatten_node(child, stylizer, names, styles, psize, left, + valigned) def flatten_head(self, item, stylizer, href): html = item.data From 76de6aef24f99929957676fde5e98f86f209345b Mon Sep 17 00:00:00 2001 From: "Marshall T. Vandegrift" Date: Sun, 18 Jan 2009 21:44:43 -0500 Subject: [PATCH 3/7] Use etree.html to handle HTML entities and not UTF-8 encodings --- src/calibre/ebooks/oeb/base.py | 20 +++++++------------- src/calibre/ebooks/oeb/transforms/flatcss.py | 8 ++++---- 2 files changed, 11 insertions(+), 17 deletions(-) diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index 4248657e23..a903136610 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -15,10 +15,10 @@ from urlparse import urldefrag, urlparse, urlunparse from urllib import unquote as urlunquote import logging import re -import htmlentitydefs import uuid import copy from lxml import etree +from lxml import html from calibre import LoggingInterface XML_PARSER = etree.XMLParser(recover=True) @@ -67,14 +67,6 @@ OEB_IMAGES = set([GIF_MIME, JPEG_MIME, PNG_MIME, SVG_MIME]) MS_COVER_TYPE = 'other.ms-coverimage-standard' -recode = lambda s: s.decode('iso-8859-1').encode('ascii', 'xmlcharrefreplace') -ENTITYDEFS = dict((k, recode(v)) for k, v in htmlentitydefs.entitydefs.items()) -del ENTITYDEFS['lt'] -del ENTITYDEFS['gt'] -del ENTITYDEFS['quot'] -del ENTITYDEFS['amp'] -del recode 
- def element(parent, *args, **kwargs): if parent is not None: @@ -298,7 +290,6 @@ class Metadata(object): class Manifest(object): class Item(object): - ENTITY_RE = re.compile(r'&([a-zA-Z_:][a-zA-Z0-9.-_:]+);') NUM_RE = re.compile('^(.*)([0-9][0-9.]*)(?=[.]|$)') def __init__(self, id, href, media_type, @@ -317,9 +308,12 @@ class Manifest(object): % (self.id, self.href, self.media_type) def _force_xhtml(self, data): - repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0)) - data = self.ENTITY_RE.sub(repl, data) - data = etree.fromstring(data, parser=XML_PARSER) + try: + data = etree.fromstring(data, parser=XML_PARSER) + except etree.XMLSyntaxError: + data = html.fromstring(data, parser=XML_PARSER) + data = etree.tostring(data, encoding=unicode) + data = etree.fromstring(data, parser=XML_PARSER) if namespace(data.tag) != XHTML_NS: data.attrib['xmlns'] = XHTML_NS data = etree.tostring(data) diff --git a/src/calibre/ebooks/oeb/transforms/flatcss.py b/src/calibre/ebooks/oeb/transforms/flatcss.py index 375003c1a5..4877b28f51 100644 --- a/src/calibre/ebooks/oeb/transforms/flatcss.py +++ b/src/calibre/ebooks/oeb/transforms/flatcss.py @@ -161,11 +161,11 @@ class CSSFlattener(object): if 'bgcolor' in node.attrib: cssdict['background-color'] = node.attrib['bgcolor'] del node.attrib['bgcolor'] + if 'font-size' in cssdict or tag == 'body': + fsize = self.fmap[style['font-size']] + cssdict['font-size'] = "%0.5fem" % (fsize / psize) + psize = fsize if cssdict: - if 'font-size' in cssdict or tag == 'body': - fsize = self.fmap[style['font-size']] - cssdict['font-size'] = "%0.5fem" % (fsize / psize) - psize = fsize if self.lineh and self.fbase and tag != 'body': self.clean_edges(cssdict, style, psize) margin = style['margin-left'] From 105d431f6c877a3cdb5c231f64d1757bdf39b526 Mon Sep 17 00:00:00 2001 From: "Marshall T. Vandegrift" Date: Sat, 7 Feb 2009 10:03:00 -0500 Subject: [PATCH 4/7] Refactor OPF de-serialization into OEBReader. 
--- src/calibre/ebooks/oeb/base.py | 644 ++++-------------------------- src/calibre/ebooks/oeb/factory.py | 20 + src/calibre/ebooks/oeb/reader.py | 535 +++++++++++++++++++++++++ src/calibre/ebooks/oeb/writer.py | 107 +++++ 4 files changed, 742 insertions(+), 564 deletions(-) create mode 100644 src/calibre/ebooks/oeb/factory.py create mode 100644 src/calibre/ebooks/oeb/reader.py create mode 100644 src/calibre/ebooks/oeb/writer.py diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index 80d4797905..8eb73935a5 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -6,22 +6,18 @@ from __future__ import with_statement __license__ = 'GPL v3' __copyright__ = '2008, Marshall T. Vandegrift ' -import os, sys, re, uuid, copy -from mimetypes import types_map, guess_type +import os, sys, re, uuid +from mimetypes import types_map from collections import defaultdict -from types import StringTypes -from itertools import izip, count, chain +from itertools import count from urlparse import urldefrag, urlparse, urlunparse from urllib import unquote as urlunquote from lxml import etree, html import calibre from calibre import LoggingInterface from calibre.translations.dynamic import translate -from calibre.startup import get_lang from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.oeb.entitydefs import ENTITYDEFS -from calibre.ebooks.metadata.epub import CoverRenderer -from calibre.ptempfile import TemporaryDirectory XML_NS = 'http://www.w3.org/XML/1998/namespace' XHTML_NS = 'http://www.w3.org/1999/xhtml' @@ -39,14 +35,13 @@ NCX_NS = 'http://www.daisy.org/z3986/2005/ncx/' SVG_NS = 'http://www.w3.org/2000/svg' XLINK_NS = 'http://www.w3.org/1999/xlink' CALIBRE_NS = 'http://calibre.kovidgoyal.net/2009/metadata' -XPNSMAP = { - 'h' : XHTML_NS, 'o1' : OPF1_NS, 'o2' : OPF2_NS, - 'd09': DC09_NS, 'd10': DC10_NS, 'd11': DC11_NS, - 'xsi': XSI_NS, 'dt' : DCTERMS_NS, 'ncx': NCX_NS, - 'svg': SVG_NS, 'xl' : XLINK_NS - } 
-DC_PREFIXES = ('d11', 'd10', 'd09') - +XPNSMAP = {'h' : XHTML_NS, 'o1' : OPF1_NS, 'o2' : OPF2_NS, + 'd09': DC09_NS, 'd10': DC10_NS, 'd11': DC11_NS, + 'xsi': XSI_NS, 'dt' : DCTERMS_NS, 'ncx': NCX_NS, + 'svg': SVG_NS, 'xl' : XLINK_NS} +OPF1_NSMAP = {'dc': DC11_NS, 'oebpackage': OPF1_NS} +OPF2_NSMAP = {'opf': OPF2_NS, 'dc': DC11_NS, 'dcterms': DCTERMS_NS, + 'xsi': XSI_NS, 'calibre': CALIBRE_NS} def XML(name): return '{%s}%s' % (XML_NS, name) @@ -105,7 +100,8 @@ SVG_MIME = types_map['.svg'] BINARY_MIME = 'application/octet-stream' OEB_STYLES = set([CSS_MIME, OEB_CSS_MIME, 'text/x-oeb-css']) -OEB_DOCS = set([XHTML_MIME, 'text/html', OEB_DOC_MIME, 'text/x-oeb-document']) +OEB_DOCS = set([XHTML_MIME, 'text/html', OEB_DOC_MIME, + 'text/x-oeb-document']) OEB_RASTER_IMAGES = set([GIF_MIME, JPEG_MIME, PNG_MIME]) OEB_IMAGES = set([GIF_MIME, JPEG_MIME, PNG_MIME, SVG_MIME]) @@ -167,8 +163,9 @@ def XPath(expr): def xpath(elem, expr): return elem.xpath(expr, namespaces=XPNSMAP) -def xml2str(root): - return etree.tostring(root, encoding='utf-8', xml_declaration=True) +def xml2str(root, pretty_print=False): + return etree.tostring(root, encoding='utf-8', xml_declaration=True, + pretty_print=pretty_print) ASCII_CHARS = set(chr(x) for x in xrange(128)) UNIBYTE_CHARS = set(chr(x) for x in xrange(256)) @@ -213,16 +210,38 @@ class Logger(LoggingInterface, object): return object.__getattribute__(self, 'log_' + name) -class AbstractContainer(object): - def read_xml(self, path): - return etree.fromstring( - self.read(path), base_url=os.path.dirname(path)) +class NullContainer(object): + def read(self, path): + raise OEBError('Attempt to read from NullContainer') -class DirContainer(AbstractContainer): - def __init__(self, rootdir): - self.rootdir = unicode(rootdir) + def write(self, path): + raise OEBError('Attempt to write to NullContainer') + + def exists(self, path): + return False + + def namelist(self): + return [] + +class DirContainer(object): + def __init__(self, path): + path = 
unicode(path) + ext = os.path.splitext(path)[1].lower() + if ext == '.opf': + self.opfname = os.path.basename(path) + self.rootdir = os.path.dirname(path) + return + self.rootdir = path + for path in self.namelist(): + ext = os.path.splitext(path)[1].lower() + if ext == '.opf': + self.opfname = path + return + self.opfname = None def read(self, path): + if path is None: + path = self.opfname path = os.path.join(self.rootdir, path) with open(urlunquote(path), 'rb') as f: return f.read() @@ -239,33 +258,14 @@ class DirContainer(AbstractContainer): path = os.path.join(self.rootdir, path) return os.path.isfile(urlunquote(path)) -class DirWriter(object): - def __init__(self, version='2.0', page_map=False): - self.version = version - self.page_map = page_map - - def dump(self, oeb, path): - version = int(self.version[0]) - opfname = None - if os.path.splitext(path)[1].lower() == '.opf': - opfname = os.path.basename(path) - path = os.path.dirname(path) - if not os.path.isdir(path): - os.mkdir(path) - output = DirContainer(path) - for item in oeb.manifest.values(): - output.write(item.href, str(item)) - if version == 1: - metadata = oeb.to_opf1() - elif version == 2: - metadata = oeb.to_opf2(page_map=self.page_map) - else: - raise OEBError("Unrecognized OPF version %r" % self.version) - for mime, (href, data) in metadata.items(): - if opfname and mime == OPF_MIME: - href = opfname - output.write(href, xml2str(data)) - return + def namelist(self): + names = [] + for root, dirs, files in os.walk(self.rootdir): + for fname in files: + fname = os.path.join(root, fname) + fname = fname.replace('\\', '/') + names.append(fname) + return names class Metadata(object): @@ -279,9 +279,6 @@ class Metadata(object): OPF_ATTRS = {'role': OPF('role'), 'file-as': OPF('file-as'), 'scheme': OPF('scheme'), 'event': OPF('event'), 'type': XSI('type'), 'lang': XML('lang'), 'id': 'id'} - OPF1_NSMAP = {'dc': DC11_NS, 'oebpackage': OPF1_NS} - OPF2_NSMAP = {'opf': OPF2_NS, 'dc': DC11_NS, 'dcterms':
DCTERMS_NS, - 'xsi': XSI_NS, 'calibre': CALIBRE_NS} class Item(object): @@ -337,18 +334,20 @@ class Metadata(object): if attr != nsattr: attrib[nsattr] = attrib.pop(attr) - scheme = Attribute(lambda term : 'scheme' if term == OPF('meta') else OPF('scheme'), + scheme = Attribute(lambda term: 'scheme' if \ + term == OPF('meta') else OPF('scheme'), [DC('identifier'), OPF('meta')]) file_as = Attribute(OPF('file-as'), [DC('creator'), DC('contributor')]) role = Attribute(OPF('role'), [DC('creator'), DC('contributor')]) event = Attribute(OPF('event'), [DC('date')]) id = Attribute('id') - type = Attribute(XSI('type'), [DC('date'), DC('format'), DC('type')]) + type = Attribute(XSI('type'), [DC('date'), DC('format'), + DC('type')]) lang = Attribute(XML('lang'), [DC('contributor'), DC('coverage'), - DC('creator'), DC('publisher'), - DC('relation'), DC('rights'), - DC('source'), DC('subject'), - OPF('meta')]) + DC('creator'), DC('publisher'), + DC('relation'), DC('rights'), + DC('source'), DC('subject'), + OPF('meta')]) def __getitem__(self, key): return self.attrib[key] @@ -445,21 +444,19 @@ class Metadata(object): return nsmap return property(fget=fget) - @apply def _opf2_nsmap(): def fget(self): nsmap = self._nsmap - nsmap.update(self.OPF2_NSMAP) + nsmap.update(OPF2_NSMAP) return nsmap return property(fget=fget) - def to_opf1(self, parent=None): nsmap = self._opf1_nsmap nsrmap = dict((value, key) for key, value in nsmap.items()) elem = element(parent, 'metadata', nsmap=nsmap) - dcmeta = element(elem, 'dc-metadata', nsmap=self.OPF1_NSMAP) + dcmeta = element(elem, 'dc-metadata', nsmap=OPF1_NSMAP) xmeta = element(elem, 'x-metadata') for term in self.items: for item in self.items[term]: @@ -608,7 +605,7 @@ class Manifest(object): def __str__(self): data = self.data if isinstance(data, etree._Element): - return xml2str(data) + return xml2str(data, pretty_print=self.oeb.pretty_print) if isinstance(data, unicode): return data.encode('utf-8') return str(data) @@ -756,7 +753,7 @@ 
class Spine(object): self.items = [] def _linear(self, linear): - if isinstance(linear, StringTypes): + if isinstance(linear, basestring): linear = linear.lower() if linear is None or linear in ('yes', 'true'): linear = True @@ -838,7 +835,7 @@ class Guide(object): ('text', __('Main Text'))] TYPES = set(t for t, _ in _TYPES_TITLES) TITLES = dict(_TYPES_TITLES) - ORDER = dict((t, i) for (t, _), i in izip(_TYPES_TITLES, count(0))) + ORDER = dict((t, i) for i, (t, _) in enumerate(_TYPES_TITLES)) def __init__(self, oeb, type, title, href): self.oeb = oeb @@ -1044,493 +1041,25 @@ class PageList(object): class OEBBook(object): - COVER_SVG_XP = XPath('h:body//svg:svg[position() = 1]') - COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]') - - def __init__(self, opfpath=None, container=None, encoding=None, - logger=FauxLogger()): - if opfpath and not container: - container = DirContainer(os.path.dirname(opfpath)) - opfpath = os.path.basename(opfpath) - self.container = container + def __init__(self, encoding=None, pretty_print=False, logger=FauxLogger()): self.encoding = encoding + self.pretty_print = pretty_print self.logger = logger - if opfpath or container: - opf = self._read_opf(opfpath) - self._all_from_opf(opf) - - def _clean_opf(self, opf): - nsmap = {} - for elem in opf.iter(tag=etree.Element): - nsmap.update(elem.nsmap) - for elem in opf.iter(tag=etree.Element): - if namespace(elem.tag) in ('', OPF1_NS): - elem.tag = OPF(barename(elem.tag)) - nsmap.update(Metadata.OPF2_NSMAP) - attrib = dict(opf.attrib) - nroot = etree.Element(OPF('package'), - nsmap={None: OPF2_NS}, attrib=attrib) - metadata = etree.SubElement(nroot, OPF('metadata'), nsmap=nsmap) - ignored = (OPF('dc-metadata'), OPF('x-metadata')) - for elem in xpath(opf, 'o2:metadata//*'): - if elem.tag in ignored: - continue - if namespace(elem.tag) in DC_NSES: - tag = barename(elem.tag).lower() - elem.tag = '{%s}%s' % (DC11_NS, tag) - metadata.append(elem) - for element in xpath(opf, 
'o2:metadata//o2:meta'): - metadata.append(element) - for tag in ('o2:manifest', 'o2:spine', 'o2:tours', 'o2:guide'): - for element in xpath(opf, tag): - nroot.append(element) - return nroot - - def _read_opf(self, opfpath): - data = self.container.read(opfpath) - data = self.decode(data) - data = XMLDECL_RE.sub('', data) - try: - opf = etree.fromstring(data) - except etree.XMLSyntaxError: - repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0)) - data = ENTITY_RE.sub(repl, data) - opf = etree.fromstring(data) - self.logger.warn('OPF contains invalid HTML named entities') - ns = namespace(opf.tag) - if ns not in ('', OPF1_NS, OPF2_NS): - raise OEBError('Invalid namespace %r for OPF document' % ns) - opf = self._clean_opf(opf) - return opf - - def _metadata_from_opf(self, opf): - uid = opf.get('unique-identifier', None) + self.version = '2.0' + self.container = NullContainer() + self.metadata = Metadata(self) self.uid = None - self.metadata = metadata = Metadata(self) - for elem in xpath(opf, '/o2:package/o2:metadata//*'): - term = elem.tag - value = elem.text - attrib = dict(elem.attrib) - nsmap = elem.nsmap - if term == OPF('meta'): - term = qname(attrib.pop('name', None), nsmap) - value = attrib.pop('content', None) - if value: - value = COLLAPSE_RE.sub(' ', value.strip()) - if term and (value or attrib): - metadata.add(term, value, attrib, nsmap=nsmap) - haveuuid = haveid = False - for ident in metadata.identifier: - if unicode(ident).startswith('urn:uuid:'): - haveuuid = True - if 'id' in ident.attrib: - haveid = True - if not (haveuuid and haveid): - bookid = "urn:uuid:%s" % str(uuid.uuid4()) - metadata.add('identifier', bookid, id='calibre-uuid') - if uid is None: - self.logger.warn(u'Unique-identifier not specified') - for item in metadata.identifier: - if not item.id: - continue - if uid is None or item.id == uid: - self.uid = item - break - else: - self.logger.warn(u'Unique-identifier %r not found' % uid) - for ident in metadata.identifier: - if 'id' in 
ident.attrib: - self.uid = metadata.identifier[0] - break - if not metadata.language: - self.logger.warn(u'Language not specified') - metadata.add('language', get_lang()) - if not metadata.creator: - self.logger.warn('Creator not specified') - metadata.add('creator', self.translate(__('Unknown'))) - if not metadata.title: - self.logger.warn('Title not specified') - metadata.add('title', self.translate(__('Unknown'))) - - def _manifest_add_missing(self): - manifest = self.manifest - known = set(manifest.hrefs) - unchecked = set(manifest.values()) - while unchecked: - new = set() - for item in unchecked: - if (item.media_type in OEB_DOCS or - item.media_type[-4:] in ('/xml', '+xml')) and \ - item.data is not None: - hrefs = [sel(item.data) for sel in LINK_SELECTORS] - for href in chain(*hrefs): - href, _ = urldefrag(href) - if not href: - continue - href = item.abshref(urlnormalize(href)) - scheme = urlparse(href).scheme - if not scheme and href not in known: - new.add(href) - elif item.media_type in OEB_STYLES: - for match in CSSURL_RE.finditer(item.data): - href, _ = urldefrag(match.group('url')) - href = item.abshref(urlnormalize(href)) - scheme = urlparse(href).scheme - if not scheme and href not in known: - new.add(href) - unchecked.clear() - for href in new: - known.add(href) - if not self.container.exists(href): - self.logger.warn('Referenced file %r not found' % href) - continue - self.logger.warn('Referenced file %r not in manifest' % href) - id, _ = manifest.generate(id='added') - guessed = guess_type(href)[0] - media_type = guessed or BINARY_MIME - added = manifest.add(id, href, media_type) - unchecked.add(added) - - def _manifest_from_opf(self, opf): - self.manifest = manifest = Manifest(self) - for elem in xpath(opf, '/o2:package/o2:manifest/o2:item'): - id = elem.get('id') - href = elem.get('href') - media_type = elem.get('media-type', None) - if media_type is None: - media_type = elem.get('mediatype', None) - if media_type is None or media_type == 
'text/xml': - guessed = guess_type(href)[0] - media_type = guessed or media_type or BINARY_MIME - fallback = elem.get('fallback') - if href in manifest.hrefs: - self.logger.warn(u'Duplicate manifest entry for %r' % href) - continue - if not self.container.exists(href): - self.logger.warn(u'Manifest item %r not found' % href) - continue - if id in manifest.ids: - self.logger.warn(u'Duplicate manifest id %r' % id) - id, href = manifest.generate(id, href) - manifest.add(id, href, media_type, fallback) - self._manifest_add_missing() - - def _spine_add_extra(self): - manifest = self.manifest - spine = self.spine - unchecked = set(spine) - selector = XPath('h:body//h:a/@href') - extras = set() - while unchecked: - new = set() - for item in unchecked: - if item.media_type not in OEB_DOCS: - # TODO: handle fallback chains - continue - for href in selector(item.data): - href, _ = urldefrag(href) - if not href: - continue - href = item.abshref(urlnormalize(href)) - if href not in manifest.hrefs: - continue - found = manifest.hrefs[href] - if found.media_type not in OEB_DOCS or \ - found in spine or found in extras: - continue - new.add(found) - extras.update(new) - unchecked = new - version = int(self.version[0]) - for item in sorted(extras): - if version >= 2: - self.logger.warn( - 'Spine-referenced file %r not in spine' % item.href) - spine.add(item, linear=False) - - def _spine_from_opf(self, opf): - self.spine = spine = Spine(self) - for elem in xpath(opf, '/o2:package/o2:spine/o2:itemref'): - idref = elem.get('idref') - if idref not in self.manifest: - self.logger.warn(u'Spine item %r not found' % idref) - continue - item = self.manifest[idref] - spine.add(item, elem.get('linear')) - if len(spine) == 0: - raise OEBError("Spine is empty") - self._spine_add_extra() - - def _guide_from_opf(self, opf): - self.guide = guide = Guide(self) - for elem in xpath(opf, '/o2:package/o2:guide/o2:reference'): - href = elem.get('href') - path = urldefrag(href)[0] - if path not in 
self.manifest.hrefs: - self.logger.warn(u'Guide reference %r not found' % href) - continue - guide.add(elem.get('type'), elem.get('title'), href) - - def _find_ncx(self, opf): - result = xpath(opf, '/o2:package/o2:spine/@toc') - if result: - id = result[0] - if id not in self.manifest.ids: - return None - item = self.manifest.ids[id] - self.manifest.remove(item) - return item - for item in self.manifest.values(): - if item.media_type == NCX_MIME: - self.manifest.remove(item) - return item - return None - - def _toc_from_navpoint(self, item, toc, navpoint): - children = xpath(navpoint, 'ncx:navPoint') - for child in children: - title = ''.join(xpath(child, 'ncx:navLabel/ncx:text/text()')) - title = COLLAPSE_RE.sub(' ', title.strip()) - href = xpath(child, 'ncx:content/@src') - if not title or not href: - continue - href = item.abshref(urlnormalize(href[0])) - path, _ = urldefrag(href) - if path not in self.manifest.hrefs: - self.logger.warn('TOC reference %r not found' % href) - continue - id = child.get('id') - klass = child.get('class') - node = toc.add(title, href, id=id, klass=klass) - self._toc_from_navpoint(item, node, child) - - def _toc_from_ncx(self, item): - if item is None: - return False - ncx = item.data - title = ''.join(xpath(ncx, 'ncx:docTitle/ncx:text/text()')) - title = COLLAPSE_RE.sub(' ', title.strip()) - title = title or unicode(self.metadata.title[0]) - self.toc = toc = TOC(title) - navmaps = xpath(ncx, 'ncx:navMap') - for navmap in navmaps: - self._toc_from_navpoint(item, toc, navmap) - return True - - def _toc_from_tour(self, opf): - result = xpath(opf, 'o2:tours/o2:tour') - if not result: - return False - tour = result[0] - self.toc = toc = TOC(tour.get('title')) - sites = xpath(tour, 'o2:site') - for site in sites: - title = site.get('title') - href = site.get('href') - if not title or not href: - continue - path, _ = urldefrag(urlnormalize(href)) - if path not in self.manifest.hrefs: - self.logger.warn('TOC reference %r not found' % href) 
- continue - id = site.get('id') - toc.add(title, href, id=id) - return True - - def _toc_from_html(self, opf): - if 'toc' not in self.guide: - return False - self.toc = toc = TOC() - itempath, frag = urldefrag(self.guide['toc'].href) - item = self.manifest.hrefs[itempath] - html = item.data - if frag: - elems = xpath(html, './/*[@id="%s"]' % frag) - if not elems: - elems = xpath(html, './/*[@name="%s"]' % frag) - elem = elems[0] if elems else html - while elem != html and not xpath(elem, './/h:a[@href]'): - elem = elem.getparent() - html = elem - titles = defaultdict(list) - order = [] - for anchor in xpath(html, './/h:a[@href]'): - href = anchor.attrib['href'] - href = item.abshref(urlnormalize(href)) - path, frag = urldefrag(href) - if path not in self.manifest.hrefs: - continue - title = ' '.join(xpath(anchor, './/text()')) - title = COLLAPSE_RE.sub(' ', title.strip()) - if href not in titles: - order.append(href) - titles[href].append(title) - for href in order: - toc.add(' '.join(titles[href]), href) - return True - - def _toc_from_spine(self, opf): - self.toc = toc = TOC() - titles = [] - headers = [] - for item in self.spine: - if not item.linear: continue - html = item.data - title = ''.join(xpath(html, '/h:html/h:head/h:title/text()')) - title = COLLAPSE_RE.sub(' ', title.strip()) - if title: - titles.append(title) - headers.append('(unlabled)') - for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'): - expr = '/h:html/h:body//h:%s[position()=1]/text()' - header = ''.join(xpath(html, expr % tag)) - header = COLLAPSE_RE.sub(' ', header.strip()) - if header: - headers[-1] = header - break - use = titles - if len(titles) > len(set(titles)): - use = headers - for title, item in izip(use, self.spine): - if not item.linear: continue - toc.add(title, item.href) - return True - - def _toc_from_opf(self, opf, item): - if self._toc_from_ncx(item): return - if self._toc_from_tour(opf): return - self.logger.warn('No metadata table of contents found') - if 
self._toc_from_html(opf): return - self._toc_from_spine(opf) - - def _pages_from_ncx(self, opf, item): - if item is None: - return False - ncx = item.data - ptargets = xpath(ncx, 'ncx:pageList/ncx:pageTarget') - if not ptargets: - return False - pages = self.pages = PageList() - for ptarget in ptargets: - name = ''.join(xpath(ptarget, 'ncx:navLabel/ncx:text/text()')) - name = COLLAPSE_RE.sub(' ', name.strip()) - href = xpath(ptarget, 'ncx:content/@src') - if not href: - continue - href = item.abshref(urlnormalize(href[0])) - id = ptarget.get('id') - type = ptarget.get('type', 'normal') - klass = ptarget.get('class') - pages.add(name, href, type=type, id=id, klass=klass) - return True - - def _find_page_map(self, opf): - result = xpath(opf, '/o2:package/o2:spine/@page-map') - if result: - id = result[0] - if id not in self.manifest.ids: - return None - item = self.manifest.ids[id] - self.manifest.remove(item) - return item - for item in self.manifest.values(): - if item.media_type == PAGE_MAP_MIME: - self.manifest.remove(item) - return item - return None - - def _pages_from_page_map(self, opf): - item = self._find_page_map(opf) - if item is None: - return False - pmap = item.data - pages = self.pages = PageList() - for page in xpath(pmap, 'o2:page'): - name = page.get('name', '') - href = page.get('href') - if not href: - continue - name = COLLAPSE_RE.sub(' ', name.strip()) - href = item.abshref(urlnormalize(href)) - type = 'normal' - if not name: - type = 'special' - elif name.lower().strip('ivxlcdm') == '': - type = 'front' - pages.add(name, href, type=type) - return True - - def _pages_from_opf(self, opf, item): - if self._pages_from_ncx(opf, item): return - if self._pages_from_page_map(opf): return + self.manifest = Manifest(self) + self.spine = Spine(self) + self.guide = Guide(self) + self.toc = TOC() self.pages = PageList() - return - - def _cover_from_html(self, hcover): - with TemporaryDirectory('_html_cover') as tdir: - writer = DirWriter() - 
writer.dump(self, tdir) - path = os.path.join(tdir, urlunquote(hcover.href)) - renderer = CoverRenderer(path) - data = renderer.image_data - id, href = self.manifest.generate('cover', 'cover.jpeg') - item = self.manifest.add(id, href, JPEG_MIME, data=data) - return item - - def _locate_cover_image(self): - if self.metadata.cover: - id = str(self.metadata.cover[0]) - item = self.manifest.ids.get(id, None) - if item is not None and item.media_type in OEB_IMAGES: - return item - else: - self.logger.warn('Invalid cover image @id %r' % id) - hcover = self.spine[0] - if 'cover' in self.guide: - href = self.guide['cover'].href - item = self.manifest.hrefs[href] - media_type = item.media_type - if media_type in OEB_IMAGES: - return item - elif media_type in OEB_DOCS: - hcover = item - html = hcover.data - if MS_COVER_TYPE in self.guide: - href = self.guide[MS_COVER_TYPE].href - item = self.manifest.hrefs.get(href, None) - if item is not None and item.media_type in OEB_IMAGES: - return item - if self.COVER_SVG_XP(html): - svg = copy.deepcopy(self.COVER_SVG_XP(html)[0]) - href = os.path.splitext(hcover.href)[0] + '.svg' - id, href = self.manifest.generate(hcover.id, href) - item = self.manifest.add(id, href, SVG_MIME, data=svg) - return item - if self.COVER_OBJECT_XP(html): - object = self.COVER_OBJECT_XP(html)[0] - href = hcover.abshref(object.get('data')) - item = self.manifest.hrefs.get(href, None) - if item is not None and item.media_type in OEB_IMAGES: - return item - return self._cover_from_html(hcover) - - def _ensure_cover_image(self): - cover = self._locate_cover_image() - if self.metadata.cover: - self.metadata.cover[0].value = cover.id - return - self.metadata.add('cover', cover.id) - - def _all_from_opf(self, opf): - self.version = opf.get('version', '1.2') - self._metadata_from_opf(opf) - self._manifest_from_opf(opf) - self._spine_from_opf(opf) - self._guide_from_opf(opf) - item = self._find_ncx(opf) - self._toc_from_opf(opf, item) - self._pages_from_opf(opf, 
item) - self._ensure_cover_image() + + @classmethod + def generate(cls, opts): + encoding = opts.encoding + pretty_print = opts.pretty_print + return cls(encoding=encoding, pretty_print=pretty_print) def translate(self, text): lang = str(self.metadata.language[0]) @@ -1652,16 +1181,3 @@ class OEBBook(object): spine.attrib['page-map'] = id results[PAGE_MAP_MIME] = (href, self.pages.to_page_map()) return results - - -def main(argv=sys.argv): - for arg in argv[1:]: - oeb = OEBBook(arg) - for name, doc in oeb.to_opf1().values(): - print etree.tostring(doc, pretty_print=True) - for name, doc in oeb.to_opf2(page_map=True).values(): - print etree.tostring(doc, pretty_print=True) - return 0 - -if __name__ == '__main__': - sys.exit(main()) diff --git a/src/calibre/ebooks/oeb/factory.py b/src/calibre/ebooks/oeb/factory.py new file mode 100644 index 0000000000..dcb0942e85 --- /dev/null +++ b/src/calibre/ebooks/oeb/factory.py @@ -0,0 +1,20 @@ +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2008, Marshall T. Vandegrift ' + +import os +from calibre.ebooks.oeb.base import OEBError +from calibre.ebooks.oeb.reader import OEBReader + +__all__ = ['ReaderFactory'] + +READER_REGISTRY = { + '.opf': OEBReader, + } + +def ReaderFactory(path): + ext = os.path.splitext(path)[1].lower() + if not ext: + return OEBReader() + return READER_REGISTRY[ext]() diff --git a/src/calibre/ebooks/oeb/reader.py b/src/calibre/ebooks/oeb/reader.py new file mode 100644 index 0000000000..2d22ff0cd2 --- /dev/null +++ b/src/calibre/ebooks/oeb/reader.py @@ -0,0 +1,535 @@ +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2008, Marshall T.
Vandegrift ' + +import sys, os, uuid, copy +from itertools import izip, chain +from urlparse import urldefrag, urlparse +from urllib import unquote as urlunquote +from mimetypes import guess_type +from collections import defaultdict +from lxml import etree +from calibre.ebooks.oeb.base import OPF1_NS, OPF2_NS, OPF2_NSMAP, DC11_NS, \ + DC_NSES, OPF +from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES, OEB_IMAGES, \ + PAGE_MAP_MIME, JPEG_MIME, NCX_MIME, SVG_MIME, BINARY_MIME +from calibre.ebooks.oeb.base import XMLDECL_RE, COLLAPSE_RE, CSSURL_RE, \ + ENTITY_RE, LINK_SELECTORS, MS_COVER_TYPE +from calibre.ebooks.oeb.base import namespace, barename, qname, XPath, xpath +from calibre.ebooks.oeb.base import urlnormalize, xml2str +from calibre.ebooks.oeb.base import OEBError, OEBBook, DirContainer +from calibre.ebooks.oeb.writer import OEBWriter +from calibre.ebooks.oeb.entitydefs import ENTITYDEFS +from calibre.ebooks.metadata.epub import CoverRenderer +from calibre.startup import get_lang +from calibre.ptempfile import TemporaryDirectory + +__all__ = ['OEBReader'] + +class OEBReader(object): + + COVER_SVG_XP = XPath('h:body//svg:svg[position() = 1]') + COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]') + + Container = DirContainer + + DEFAULT_PROFILE = 'PRS505' + + def __call__(self, oeb, path): + self.oeb = oeb + self.logger = oeb.logger + oeb.container = self.Container(path) + opf = self._read_opf() + self._all_from_opf(opf) + return oeb + + def _clean_opf(self, opf): + nsmap = {} + for elem in opf.iter(tag=etree.Element): + nsmap.update(elem.nsmap) + for elem in opf.iter(tag=etree.Element): + if namespace(elem.tag) in ('', OPF1_NS): + elem.tag = OPF(barename(elem.tag)) + nsmap.update(OPF2_NSMAP) + attrib = dict(opf.attrib) + nroot = etree.Element(OPF('package'), + nsmap={None: OPF2_NS}, attrib=attrib) + metadata = etree.SubElement(nroot, OPF('metadata'), nsmap=nsmap) + ignored = (OPF('dc-metadata'), OPF('x-metadata')) + for elem in xpath(opf,
'o2:metadata//*'): + if elem.tag in ignored: + continue + if namespace(elem.tag) in DC_NSES: + tag = barename(elem.tag).lower() + elem.tag = '{%s}%s' % (DC11_NS, tag) + metadata.append(elem) + for element in xpath(opf, 'o2:metadata//o2:meta'): + metadata.append(element) + for tag in ('o2:manifest', 'o2:spine', 'o2:tours', 'o2:guide'): + for element in xpath(opf, tag): + nroot.append(element) + return nroot + + def _read_opf(self): + data = self.oeb.container.read(None) + data = self.oeb.decode(data) + data = XMLDECL_RE.sub('', data) + try: + opf = etree.fromstring(data) + except etree.XMLSyntaxError: + repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0)) + data = ENTITY_RE.sub(repl, data) + opf = etree.fromstring(data) + self.logger.warn('OPF contains invalid HTML named entities') + ns = namespace(opf.tag) + if ns not in ('', OPF1_NS, OPF2_NS): + raise OEBError('Invalid namespace %r for OPF document' % ns) + opf = self._clean_opf(opf) + return opf + + def _metadata_from_opf(self, opf): + uid = opf.get('unique-identifier', None) + self.oeb.uid = None + metadata = self.oeb.metadata + for elem in xpath(opf, '/o2:package/o2:metadata//*'): + term = elem.tag + value = elem.text + attrib = dict(elem.attrib) + nsmap = elem.nsmap + if term == OPF('meta'): + term = qname(attrib.pop('name', None), nsmap) + value = attrib.pop('content', None) + if value: + value = COLLAPSE_RE.sub(' ', value.strip()) + if term and (value or attrib): + metadata.add(term, value, attrib, nsmap=nsmap) + haveuuid = haveid = False + for ident in metadata.identifier: + if unicode(ident).startswith('urn:uuid:'): + haveuuid = True + if 'id' in ident.attrib: + haveid = True + if not (haveuuid and haveid): + bookid = "urn:uuid:%s" % str(uuid.uuid4()) + metadata.add('identifier', bookid, id='calibre-uuid') + if uid is None: + self.logger.warn(u'Unique-identifier not specified') + for item in metadata.identifier: + if not item.id: + continue + if uid is None or item.id == uid: + self.oeb.uid = item + 
break + else: + self.logger.warn(u'Unique-identifier %r not found' % uid) + for ident in metadata.identifier: + if 'id' in ident.attrib: + self.oeb.uid = metadata.identifier[0] + break + if not metadata.language: + self.logger.warn(u'Language not specified') + metadata.add('language', get_lang()) + if not metadata.creator: + self.logger.warn('Creator not specified') + metadata.add('creator', self.oeb.translate(__('Unknown'))) + if not metadata.title: + self.logger.warn('Title not specified') + metadata.add('title', self.oeb.translate(__('Unknown'))) + + def _manifest_add_missing(self): + manifest = self.oeb.manifest + known = set(manifest.hrefs) + unchecked = set(manifest.values()) + while unchecked: + new = set() + for item in unchecked: + if (item.media_type in OEB_DOCS or + item.media_type[-4:] in ('/xml', '+xml')) and \ + item.data is not None: + hrefs = [sel(item.data) for sel in LINK_SELECTORS] + for href in chain(*hrefs): + href, _ = urldefrag(href) + if not href: + continue + href = item.abshref(urlnormalize(href)) + scheme = urlparse(href).scheme + if not scheme and href not in known: + new.add(href) + elif item.media_type in OEB_STYLES: + for match in CSSURL_RE.finditer(item.data): + href, _ = urldefrag(match.group('url')) + href = item.abshref(urlnormalize(href)) + scheme = urlparse(href).scheme + if not scheme and href not in known: + new.add(href) + unchecked.clear() + for href in new: + known.add(href) + if not self.oeb.container.exists(href): + self.logger.warn('Referenced file %r not found' % href) + continue + self.logger.warn('Referenced file %r not in manifest' % href) + id, _ = manifest.generate(id='added') + guessed = guess_type(href)[0] + media_type = guessed or BINARY_MIME + added = manifest.add(id, href, media_type) + unchecked.add(added) + + def _manifest_from_opf(self, opf): + manifest = self.oeb.manifest + for elem in xpath(opf, '/o2:package/o2:manifest/o2:item'): + id = elem.get('id') + href = elem.get('href') + media_type = 
elem.get('media-type', None) + if media_type is None: + media_type = elem.get('mediatype', None) + if media_type is None or media_type == 'text/xml': + guessed = guess_type(href)[0] + media_type = guessed or media_type or BINARY_MIME + fallback = elem.get('fallback') + if href in manifest.hrefs: + self.logger.warn(u'Duplicate manifest entry for %r' % href) + continue + if not self.oeb.container.exists(href): + self.logger.warn(u'Manifest item %r not found' % href) + continue + if id in manifest.ids: + self.logger.warn(u'Duplicate manifest id %r' % id) + id, href = manifest.generate(id, href) + manifest.add(id, href, media_type, fallback) + self._manifest_add_missing() + + def _spine_add_extra(self): + manifest = self.oeb.manifest + spine = self.oeb.spine + unchecked = set(spine) + selector = XPath('h:body//h:a/@href') + extras = set() + while unchecked: + new = set() + for item in unchecked: + if item.media_type not in OEB_DOCS: + # TODO: handle fallback chains + continue + for href in selector(item.data): + href, _ = urldefrag(href) + if not href: + continue + href = item.abshref(urlnormalize(href)) + if href not in manifest.hrefs: + continue + found = manifest.hrefs[href] + if found.media_type not in OEB_DOCS or \ + found in spine or found in extras: + continue + new.add(found) + extras.update(new) + unchecked = new + version = int(self.oeb.version[0]) + for item in sorted(extras): + if version >= 2: + self.logger.warn( + 'Spine-referenced file %r not in spine' % item.href) + spine.add(item, linear=False) + + def _spine_from_opf(self, opf): + spine = self.oeb.spine + manifest = self.oeb.manifest + for elem in xpath(opf, '/o2:package/o2:spine/o2:itemref'): + idref = elem.get('idref') + if idref not in manifest.ids: + self.logger.warn(u'Spine item %r not found' % idref) + continue + item = manifest.ids[idref] + spine.add(item, elem.get('linear')) + if len(spine) == 0: + raise OEBError("Spine is empty") + self._spine_add_extra() + + def _guide_from_opf(self, opf): + 
guide = self.oeb.guide + manifest = self.oeb.manifest + for elem in xpath(opf, '/o2:package/o2:guide/o2:reference'): + href = elem.get('href') + path = urldefrag(href)[0] + if path not in manifest.hrefs: + self.logger.warn(u'Guide reference %r not found' % href) + continue + guide.add(elem.get('type'), elem.get('title'), href) + + def _find_ncx(self, opf): + result = xpath(opf, '/o2:package/o2:spine/@toc') + if result: + id = result[0] + if id not in self.oeb.manifest.ids: + return None + item = self.oeb.manifest.ids[id] + self.oeb.manifest.remove(item) + return item + for item in self.oeb.manifest.values(): + if item.media_type == NCX_MIME: + self.oeb.manifest.remove(item) + return item + return None + + def _toc_from_navpoint(self, item, toc, navpoint): + children = xpath(navpoint, 'ncx:navPoint') + for child in children: + title = ''.join(xpath(child, 'ncx:navLabel/ncx:text/text()')) + title = COLLAPSE_RE.sub(' ', title.strip()) + href = xpath(child, 'ncx:content/@src') + if not title or not href: + continue + href = item.abshref(urlnormalize(href[0])) + path, _ = urldefrag(href) + if path not in self.oeb.manifest.hrefs: + self.logger.warn('TOC reference %r not found' % href) + continue + id = child.get('id') + klass = child.get('class') + node = toc.add(title, href, id=id, klass=klass) + self._toc_from_navpoint(item, node, child) + + def _toc_from_ncx(self, item): + if item is None: + return False + ncx = item.data + title = ''.join(xpath(ncx, 'ncx:docTitle/ncx:text/text()')) + title = COLLAPSE_RE.sub(' ', title.strip()) + title = title or unicode(self.oeb.metadata.title[0]) + toc = self.oeb.toc + toc.title = title + navmaps = xpath(ncx, 'ncx:navMap') + for navmap in navmaps: + self._toc_from_navpoint(item, toc, navmap) + return True + + def _toc_from_tour(self, opf): + result = xpath(opf, 'o2:tours/o2:tour') + if not result: + return False + tour = result[0] + toc = self.oeb.toc + toc.title = tour.get('title') + sites = xpath(tour, 'o2:site') + for site in 
sites: + title = site.get('title') + href = site.get('href') + if not title or not href: + continue + path, _ = urldefrag(urlnormalize(href)) + if path not in self.oeb.manifest.hrefs: + self.logger.warn('TOC reference %r not found' % href) + continue + id = site.get('id') + toc.add(title, href, id=id) + return True + + def _toc_from_html(self, opf): + if 'toc' not in self.oeb.guide: + return False + itempath, frag = urldefrag(self.oeb.guide['toc'].href) + item = self.oeb.manifest.hrefs[itempath] + html = item.data + if frag: + elems = xpath(html, './/*[@id="%s"]' % frag) + if not elems: + elems = xpath(html, './/*[@name="%s"]' % frag) + elem = elems[0] if elems else html + while elem != html and not xpath(elem, './/h:a[@href]'): + elem = elem.getparent() + html = elem + titles = defaultdict(list) + order = [] + for anchor in xpath(html, './/h:a[@href]'): + href = anchor.attrib['href'] + href = item.abshref(urlnormalize(href)) + path, frag = urldefrag(href) + if path not in self.oeb.manifest.hrefs: + continue + title = ' '.join(xpath(anchor, './/text()')) + title = COLLAPSE_RE.sub(' ', title.strip()) + if href not in titles: + order.append(href) + titles[href].append(title) + toc = self.oeb.toc + for href in order: + toc.add(' '.join(titles[href]), href) + return True + + def _toc_from_spine(self, opf): + toc = self.oeb.toc + titles = [] + headers = [] + for item in self.oeb.spine: + if not item.linear: continue + html = item.data + title = ''.join(xpath(html, '/h:html/h:head/h:title/text()')) + title = COLLAPSE_RE.sub(' ', title.strip()) + if title: + titles.append(title) + headers.append('(unlabled)') + for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'): + expr = '/h:html/h:body//h:%s[position()=1]/text()' + header = ''.join(xpath(html, expr % tag)) + header = COLLAPSE_RE.sub(' ', header.strip()) + if header: + headers[-1] = header + break + use = titles + if len(titles) > len(set(titles)): + use = headers + for title, item in izip(use, self.oeb.spine): + if not 
item.linear: continue + toc.add(title, item.href) + return True + + def _toc_from_opf(self, opf, item): + if self._toc_from_ncx(item): return + if self._toc_from_tour(opf): return + self.logger.warn('No metadata table of contents found') + if self._toc_from_html(opf): return + self._toc_from_spine(opf) + + def _pages_from_ncx(self, opf, item): + if item is None: + return False + ncx = item.data + ptargets = xpath(ncx, 'ncx:pageList/ncx:pageTarget') + if not ptargets: + return False + pages = self.oeb.pages + for ptarget in ptargets: + name = ''.join(xpath(ptarget, 'ncx:navLabel/ncx:text/text()')) + name = COLLAPSE_RE.sub(' ', name.strip()) + href = xpath(ptarget, 'ncx:content/@src') + if not href: + continue + href = item.abshref(urlnormalize(href[0])) + id = ptarget.get('id') + type = ptarget.get('type', 'normal') + klass = ptarget.get('class') + pages.add(name, href, type=type, id=id, klass=klass) + return True + + def _find_page_map(self, opf): + result = xpath(opf, '/o2:package/o2:spine/@page-map') + if result: + id = result[0] + if id not in self.oeb.manifest.ids: + return None + item = self.oeb.manifest.ids[id] + self.oeb.manifest.remove(item) + return item + for item in self.oeb.manifest.values(): + if item.media_type == PAGE_MAP_MIME: + self.oeb.manifest.remove(item) + return item + return None + + def _pages_from_page_map(self, opf): + item = self._find_page_map(opf) + if item is None: + return False + pmap = item.data + pages = self.oeb.pages + for page in xpath(pmap, 'o2:page'): + name = page.get('name', '') + href = page.get('href') + if not href: + continue + name = COLLAPSE_RE.sub(' ', name.strip()) + href = item.abshref(urlnormalize(href)) + type = 'normal' + if not name: + type = 'special' + elif name.lower().strip('ivxlcdm') == '': + type = 'front' + pages.add(name, href, type=type) + return True + + def _pages_from_opf(self, opf, item): + if self._pages_from_ncx(opf, item): return + if self._pages_from_page_map(opf): return + return + + def 
_cover_from_html(self, hcover): + with TemporaryDirectory('_html_cover') as tdir: + writer = OEBWriter() + writer(self.oeb, tdir) + path = os.path.join(tdir, urlunquote(hcover.href)) + renderer = CoverRenderer(path) + data = renderer.image_data + id, href = self.oeb.manifest.generate('cover', 'cover.jpeg') + item = self.oeb.manifest.add(id, href, JPEG_MIME, data=data) + return item + + def _locate_cover_image(self): + if self.oeb.metadata.cover: + id = str(self.oeb.metadata.cover[0]) + item = self.oeb.manifest.ids.get(id, None) + if item is not None and item.media_type in OEB_IMAGES: + return item + else: + self.logger.warn('Invalid cover image @id %r' % id) + hcover = self.oeb.spine[0] + if 'cover' in self.oeb.guide: + href = self.oeb.guide['cover'].href + item = self.oeb.manifest.hrefs[href] + media_type = item.media_type + if media_type in OEB_IMAGES: + return item + elif media_type in OEB_DOCS: + hcover = item + html = hcover.data + if MS_COVER_TYPE in self.oeb.guide: + href = self.oeb.guide[MS_COVER_TYPE].href + item = self.oeb.manifest.hrefs.get(href, None) + if item is not None and item.media_type in OEB_IMAGES: + return item + if self.COVER_SVG_XP(html): + svg = copy.deepcopy(self.COVER_SVG_XP(html)[0]) + href = os.path.splitext(hcover.href)[0] + '.svg' + id, href = self.oeb.manifest.generate(hcover.id, href) + item = self.oeb.manifest.add(id, href, SVG_MIME, data=svg) + return item + if self.COVER_OBJECT_XP(html): + object = self.COVER_OBJECT_XP(html)[0] + href = hcover.abshref(object.get('data')) + item = self.oeb.manifest.hrefs.get(href, None) + if item is not None and item.media_type in OEB_IMAGES: + return item + return self._cover_from_html(hcover) + + def _ensure_cover_image(self): + cover = self._locate_cover_image() + if self.oeb.metadata.cover: + self.oeb.metadata.cover[0].value = cover.id + return + self.oeb.metadata.add('cover', cover.id) + + def _all_from_opf(self, opf): + self.oeb.version = opf.get('version', '1.2') + 
self._metadata_from_opf(opf) + self._manifest_from_opf(opf) + self._spine_from_opf(opf) + self._guide_from_opf(opf) + item = self._find_ncx(opf) + self._toc_from_opf(opf, item) + self._pages_from_opf(opf, item) + self._ensure_cover_image() + + +def main(argv=sys.argv): + reader = OEBReader() + for arg in argv[1:]: + oeb = reader(OEBBook(), arg) + for name, doc in oeb.to_opf1().values(): + print etree.tostring(doc, pretty_print=True) + for name, doc in oeb.to_opf2(page_map=True).values(): + print etree.tostring(doc, pretty_print=True) + return 0 + +if __name__ == '__main__': + sys.exit(main()) diff --git a/src/calibre/ebooks/oeb/writer.py b/src/calibre/ebooks/oeb/writer.py new file mode 100644 index 0000000000..e55db670d6 --- /dev/null +++ b/src/calibre/ebooks/oeb/writer.py @@ -0,0 +1,107 @@ +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2008, Marshall T. Vandegrift ' + +import sys, os, logging +from calibre.ebooks.oeb.base import OPF_MIME, xml2str +from calibre.ebooks.oeb.base import Logger, DirContainer, OEBBook +from calibre.utils.config import Config + +__all__ = ['OEBWriter'] + +class OEBWriter(object): + DEFAULT_PROFILE = 'PRS505' + + def __init__(self, version='2.0', page_map=False, pretty_print=False): + self.version = version + self.page_map = page_map + self.pretty_print = pretty_print + + @classmethod + def config(cls, cfg): + oeb = cfg.add_group('oeb', _('OPF/NCX/etc. generation options.')) + versions = ['1.2', '2.0'] + oeb('opf_version', ['--opf-version'], default='2.0', choices=versions, + help=_('OPF version to generate. 
Default is %default.')) + oeb('adobe_page_map', ['--adobe-page-map'], default=False, + help=_('Generate an Adobe "page-map" file if pagination ' + 'information is avaliable.')) + return cfg + + @classmethod + def generate(cls, opts): + version = opts.opf_version + page_map = opts.adobe_page_map + pretty_print = opts.pretty_print + return cls(version=version, page_map=page_map, + pretty_print=pretty_print) + + def __call__(self, oeb, path): + version = int(self.version[0]) + opfname = None + if os.path.splitext(path)[1].lower() == '.opf': + opfname = os.path.basename(path) + path = os.path.dirname(path) + if not os.path.isdir(path): + os.mkdir(path) + output = DirContainer(path) + for item in oeb.manifest.values(): + output.write(item.href, str(item)) + if version == 1: + metadata = oeb.to_opf1() + elif version == 2: + metadata = oeb.to_opf2(page_map=self.page_map) + else: + raise OEBError("Unrecognized OPF version %r" % self.version) + pretty_print = self.pretty_print + for mime, (href, data) in metadata.items(): + if opfname and mime == OPF_MIME: + href = opfname + output.write(href, xml2str(data, pretty_print=pretty_print)) + return + + +def option_parser(): + cfg = Config('oeb', _('Options to control OEB conversion.')) + OEBWriter.config(cfg) + parser = cfg.option_parser() + parser.add_option('--encoding', default=None, + help=_('Character encoding for files. Default is to auto detect.')) + parser.add_option('-o', '--output', default=None, + help=_('Output file. 
Default is derived from input filename.')) + parser.add_option('-p', '--pretty-print', action='store_true', + default=False, help=_('Produce more human-readable XML output.')) + parser.add_option('-v', '--verbose', default=0, action='count', + help=_('Useful for debugging.')) + return parser + +def any2oeb(opts, inpath): + from calibre.ebooks.oeb.factory import ReaderFactory + logger = Logger(logging.getLogger('any2oeb')) + logger.setup_cli_handler(opts.verbose) + outpath = opts.output + if outpath is None: + outpath = os.path.basename(inpath) + outpath = os.path.splitext(outpath)[0] + encoding = opts.encoding + pretty_print = opts.pretty_print + oeb = OEBBook(encoding=encoding, pretty_print=pretty_print, logger=logger) + reader = ReaderFactory(inpath) + reader(oeb, inpath) + writer = OEBWriter.generate(opts) + writer(oeb, outpath) + return 0 + +def main(argv=sys.argv): + parser = option_parser() + opts, args = parser.parse_args(argv[1:]) + if len(args) != 1: + parser.print_help() + return 1 + inpath = args[0] + retval = any2oeb(opts, inpath) + return retval + +if __name__ == '__main__': + sys.exit(main()) From e5984c02c7bc7ded3b2afd7aa4ff5e85a167dd03 Mon Sep 17 00:00:00 2001 From: "Marshall T. Vandegrift" Date: Tue, 10 Feb 2009 23:50:35 -0500 Subject: [PATCH 5/7] Document OEBBook. 
--- src/calibre/ebooks/lit/writer.py | 2 +- src/calibre/ebooks/oeb/base.py | 308 +++++++++++++++++++++++++++---- 2 files changed, 269 insertions(+), 41 deletions(-) diff --git a/src/calibre/ebooks/lit/writer.py b/src/calibre/ebooks/lit/writer.py index 4a059b6433..bebba8938b 100644 --- a/src/calibre/ebooks/lit/writer.py +++ b/src/calibre/ebooks/lit/writer.py @@ -312,7 +312,7 @@ class LitWriter(object): cover = None if oeb.metadata.cover: id = str(oeb.metadata.cover[0]) - cover = oeb.manifest[id] + cover = oeb.manifest.ids[id] for type, title in ALL_MS_COVER_TYPES: if type not in oeb.guide: oeb.guide.add(type, title, cover.href) diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index 771a27a81a..ce16fa76e5 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -5,6 +5,7 @@ from __future__ import with_statement __license__ = 'GPL v3' __copyright__ = '2008, Marshall T. Vandegrift ' +__docformat__ = 'restructuredtext en' import os, sys, re, uuid from mimetypes import types_map @@ -175,6 +176,7 @@ URL_SAFE = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ' URL_UNSAFE = [ASCII_CHARS - URL_SAFE, UNIBYTE_CHARS - URL_SAFE] def urlquote(href): + """Quote URL-unsafe characters, allowing IRI-safe characters.""" result = [] unsafe = 0 if isinstance(href, unicode) else 1 unsafe = URL_UNSAFE[unsafe] @@ -185,6 +187,9 @@ def urlquote(href): return ''.join(result) def urlnormalize(href): + """Convert a URL into normalized form, with all and only URL-unsafe + characters URL quoted. 
+ """ parts = urlparse(href) if not parts.scheme: path, frag = urldefrag(href) @@ -196,21 +201,30 @@ def urlnormalize(href): class OEBError(Exception): + """Generic OEB-processing error.""" pass class FauxLogger(object): + """Fake logging interface.""" def __getattr__(self, name): return self def __call__(self, message): print message class Logger(LoggingInterface, object): + """A logging object which provides both the standard `logging.Logger` and + calibre-specific interfaces. + """ def __getattr__(self, name): return object.__getattribute__(self, 'log_' + name) class NullContainer(object): + """An empty container. + + For use with book formats which do not support container-like access. + """ def read(self, path): raise OEBError('Attempt to read from NullContainer') @@ -224,6 +238,8 @@ class NullContainer(object): return [] class DirContainer(object): + """Filesystem directory container.""" + def __init__(self, path): path = unicode(path) ext = os.path.splitext(path)[1].lower() @@ -269,20 +285,38 @@ class DirContainer(object): class Metadata(object): - DC_TERMS = set([ - 'contributor', 'coverage', 'creator', 'date', - 'description', 'format', 'identifier', 'language', - 'publisher', 'relation', 'rights', 'source', 'subject', - 'title', 'type' - ]) + """A collection of OEB data model metadata. + + Provides access to the list of items associated with a particular metadata + term via the term's local name using either Python container or attribute + syntax. Return an empty list for any terms with no currently associated + metadata items. 
+ """ + + DC_TERMS = set(['contributor', 'coverage', 'creator', 'date', + 'description', 'format', 'identifier', 'language', + 'publisher', 'relation', 'rights', 'source', + 'subject', 'title', 'type']) CALIBRE_TERMS = set(['series', 'series_index', 'rating']) OPF_ATTRS = {'role': OPF('role'), 'file-as': OPF('file-as'), 'scheme': OPF('scheme'), 'event': OPF('event'), 'type': XSI('type'), 'lang': XML('lang'), 'id': 'id'} class Item(object): - + """An item of OEB data model metadata. + + The metadata term or name may be accessed via the :attr:`term` or + :attr:`name` attributes. The metadata value or content may be accessed + via the :attr:`value` or :attr:`content` attributes, or via Unicode or + string representations of the object. + + OEB data model metadata attributes may be accessed either via their + fully-qualified names using the Python container access syntax, or via + their local names using Python attribute syntax. Only attributes + allowed by the OPF 2.0 specification are supported. 
+ """ class Attribute(object): + """Smart accessor for allowed OEB metadata item attributes.""" def __init__(self, attr, allowed=None): if not callable(attr): @@ -333,10 +367,24 @@ class Metadata(object): nsattr = 'scheme' if attr != nsattr: attrib[nsattr] = attrib.pop(attr) - + + @dynamic_property + def name(self): + def fget(self): + return self.term + return property(fget=fget) + + @dynamic_property + def content(self): + def fget(self): + return self.value + def fset(self, value): + self.value = value + return property(fget=fget, fset=fset) + scheme = Attribute(lambda term: 'scheme' if \ term == OPF('meta') else OPF('scheme'), - [DC('identifier'), OPF('meta')]) + [DC('identifier'), OPF('meta')]) file_as = Attribute(OPF('file-as'), [DC('creator'), DC('contributor')]) role = Attribute(OPF('role'), [DC('creator'), DC('contributor')]) event = Attribute(OPF('event'), [DC('date')]) @@ -405,6 +453,7 @@ class Metadata(object): self.items = defaultdict(list) def add(self, term, value, attrib={}, nsmap={}, **kwargs): + """Add a new metadata item.""" item = self.Item(term, value, attrib, nsmap, **kwargs) items = self.items[barename(item.term)] items.append(item) @@ -477,8 +526,40 @@ class Metadata(object): class Manifest(object): + """Collection of files composing an OEB data model book. + + Provides access to the content of the files composing the book and + attributes associated with those files, including their internal paths, + unique identifiers, and MIME types. + + Itself acts as a :class:`set` of manifest items, and provides the following + instance data member for dictionary-like access: + + :attr:`ids`: A dictionary in which the keys are the unique identifiers of + the manifest items and the values are the items themselves. + :attr:`hrefs`: A dictionary in which the keys are the internal paths of the + manifest items and the values are the items themselves. + """ class Item(object): + """An OEB data model book content file. 
+ + Provides the following data members for accessing the file content and + metadata associated with this particular file. + + :attr:`id`: Unique identifier. + :attr:`href`: Book-internal path. + :attr:`media_type`: MIME type of the file content. + :attr:`fallback`: Unique id of any fallback manifest item associated + with this manifest item. + :attr:`spine_position`: Display/reading order index for book textual + content. `None` for manifest items which are not part of the + book's textual content. + :attr:`linear`: `True` for textual content items which are part of the + primary linear reading order and `False` for textual content items + which are not (such as footnotes). Meaningless for items which + have a :attr:`spine_position` of `None`. + """ NUM_RE = re.compile('^(.*)([0-9][0-9.]*)(?=[.]|$)') META_XP = XPath('/h:html/h:head/h:meta[@http-equiv="Content-Type"]') @@ -584,6 +665,18 @@ class Manifest(object): @dynamic_property def data(self): + doc = """Provides MIME type sensitive access to the manifest + entry's associated content. + + - XHTML, HTML, and variant content is parsed as necessary to + convert and and return as an lxml.etree element in the XHTML + namespace. + - XML content is parsed and returned as an lxml.etree element. + - CSS and CSS-variant content is parsed and returned as a cssutils + CSS DOM stylesheet. + - All other content is returned as a :class:`str` object with no + special parsing. + """ def fget(self): if self._data is not None: return self._data @@ -600,7 +693,7 @@ class Manifest(object): self._data = value def fdel(self): self._data = None - return property(fget, fset, fdel) + return property(fget, fset, fdel, doc=doc) def __str__(self): data = self.data @@ -631,6 +724,9 @@ class Manifest(object): return cmp(skey, okey) def relhref(self, href): + """Convert the URL provided in :param:`href` from a book-absolute + reference to a reference relative to this manifest item. 
+ """ if urlparse(href).scheme: return href if '/' not in self.href: @@ -649,6 +745,9 @@ class Manifest(object): return relhref def abshref(self, href): + """Convert the URL provided in :param:`href` from a reference + relative to this manifest item to a book-absolute reference. + """ if urlparse(href).scheme: return href path, frag = urldefrag(href) @@ -663,25 +762,46 @@ class Manifest(object): def __init__(self, oeb): self.oeb = oeb + self.items = set() self.ids = {} self.hrefs = {} def add(self, id, href, media_type, fallback=None, loader=None, data=None): + """Add a new item to the book manifest. + + The item's :param:`id`, :param:`href`, and :param:`media_type` are all + required. A :param:`fallback` item-id is required for any items with a + MIME type which is not one of the OPS core media types. Either the + item's data itself may be provided with :param:`data`, or a loader + function for the data may be provided with :param:`loader`, or the + item's data may latter be set manually via the :attr:`data` attribute. + """ item = self.Item( self.oeb, id, href, media_type, fallback, loader, data) + self.items.add(item) self.ids[item.id] = item self.hrefs[item.href] = item return item def remove(self, item): + """Removes :param:`item` from the manifest.""" if item in self.ids: item = self.ids[item] del self.ids[item.id] del self.hrefs[item.href] + self.items.remove(item) if item in self.oeb.spine: self.oeb.spine.remove(item) def generate(self, id=None, href=None): + """Generate a new unique identifier and/or internal path for use in + creating a new manifest item, using the provided :param:`id` and/or + :param:`href` as bases. + + Returns an two-tuple of the new id and path. If either :param:`id` or + :param:`href` are `None` then the corresponding item in the return + tuple will also be `None`. 
+ """ if id is not None: base = id index = 1 @@ -698,26 +818,16 @@ class Manifest(object): return id, href def __iter__(self): - for id in self.ids: - yield id - - def __getitem__(self, id): - return self.ids[id] - - def values(self): - for item in self.ids.values(): + for item in self.items: yield item + values = __iter__ - def items(self): - for id, item in self.ids.items(): - yield id, item - - def __contains__(self, key): - return key in self.ids + def __contains__(self, item): + return item in self.items def to_opf1(self, parent=None): elem = element(parent, 'manifest') - for item in self.ids.values(): + for item in self.items: media_type = item.media_type if media_type in OEB_DOCS: media_type = OEB_DOC_MIME @@ -732,7 +842,7 @@ class Manifest(object): def to_opf2(self, parent=None): elem = element(parent, OPF('manifest')) - for item in self.ids.values(): + for item in self.items: media_type = item.media_type if media_type in OEB_DOCS: media_type = XHTML_MIME @@ -747,7 +857,13 @@ class Manifest(object): class Spine(object): - + """Collection of manifest items composing an OEB data model book's main + textual content. + + The spine manages which manifest items compose the book's main textual + content and the sequence in which they appear. Provides Python container + access as a list-like object. 
+ """ def __init__(self, oeb): self.oeb = oeb self.items = [] @@ -762,12 +878,14 @@ class Spine(object): return linear def add(self, item, linear=None): + """Append :param:`item` to the end of the `Spine`.""" item.linear = self._linear(linear) item.spine_position = len(self.items) self.items.append(item) return item def insert(self, index, item, linear): + """Insert :param:`item` at position :param:`index` in the `Spine`.""" item.linear = self._linear(linear) item.spine_position = index self.items.insert(index, item) @@ -776,6 +894,7 @@ class Spine(object): return item def remove(self, item): + """Remove :param:`item` from the `Spine`.""" index = item.spine_position self.items.pop(index) for i in xrange(index, len(self.items)): @@ -813,9 +932,24 @@ class Spine(object): class Guide(object): + """Collection of references to standard frequently-occurring sections + within an OEB data model book. + + Provides dictionary-like access, in which the keys are the OEB reference + type identifiers and the values are `Reference` objects. + """ class Reference(object): - + """Reference to a standard book section. + + Provides the following instance data members: + + :attr:`type`: Reference type identifier, as chosen from the list + allowed in the OPF 2.0 specification. + :attr:`title`: Human-readable section title. + :attr:`href`: Book-internal URL of the referenced section. May include + a fragment identifier. 
+ """ _TYPES_TITLES = [('cover', __('Cover')), ('title-page', __('Title Page')), ('toc', __('Table of Contents')), @@ -867,17 +1001,19 @@ class Guide(object): @dynamic_property def item(self): + doc = """The manifest item associated with this reference.""" def fget(self): path = urldefrag(self.href)[0] hrefs = self.oeb.manifest.hrefs return hrefs.get(path, None) - return property(fget=fget) + return property(fget=fget, doc=doc) def __init__(self, oeb): self.oeb = oeb self.refs = {} def add(self, type, title, href): + """Add a new reference to the `Guide`.""" ref = self.Reference(self.oeb, type, title, href) self.refs[type] = ref return ref @@ -925,8 +1061,19 @@ class Guide(object): return elem +# TODO: This needs beefing up to support the interface of toc.TOC class TOC(object): - # This needs beefing up to support the interface of toc.TOC + """Represents a hierarchical table of contents or navigation tree for + accessing arbitrary semantic sections within an OEB data model book. + + Acts as a node within the navigation tree. Provides list-like access to + sub-nodes. Provides the follow node instance data attributes: + + :attr:`title`: The title of this navigation node. + :attr:`href`: Book-internal URL referenced by this node. + :attr:`klass`: Optional semantic class referenced by this node. + :attr:`id`: Option unique identifier for this node. 
+ """ def __init__(self, title=None, href=None, klass=None, id=None): self.title = title self.href = urlnormalize(href) if href else href @@ -935,17 +1082,26 @@ class TOC(object): self.nodes = [] def add(self, title, href, klass=None, id=None): + """Create and return a new sub-node of this node.""" node = TOC(title, href, klass, id) self.nodes.append(node) return node + def iter(self): + """Iterate over this node and all descendants in depth-first order.""" + yield self + for child in self.nodes: + for node in child.iter(): + yield node + def iterdescendants(self): - for node in self.nodes: - yield node - for child in node.iterdescendants(): - yield child + """Iterate over all descendant nodes in depth-first order.""" + for child in self.nodes: + for node in child.iter(): + yield node def __iter__(self): + """Iterate over all immediate child nodes.""" for node in self.nodes: yield node @@ -953,6 +1109,9 @@ class TOC(object): return self.nodes[index] def autolayer(self): + """Make sequences of children pointing to the same content file into + children of the first node referencing that file. + """ prev = None for node in list(self.nodes): if prev and urldefrag(prev.href)[0] == urldefrag(node.href)[0]: @@ -961,10 +1120,12 @@ class TOC(object): else: prev = node - def depth(self, level=0): - if self.nodes: - return self.nodes[0].depth(level+1) - return level + def depth(self): + """The maximum depth of the navigation tree rooted at this node.""" + try: + return max(node.depth() for node in self.nodes) + 1 + except ValueError: + return 1 def to_opf1(self, tour): for node in self.nodes: @@ -989,12 +1150,34 @@ class TOC(object): class PageList(object): + """Collection of named "pages" to mapped positions within an OEB data model + book's textual content. + + Provides list-like access to the pages. + """ class Page(object): + """Represents a mapping between a page name and a position within + the book content. 
+ + Provides the following instance data attributes: + + :attr:`name`: The name of this page. Generally a number. + :attr:`href`: Book-internal URL at which point this page begins. + :attr:`type`: Must be one of 'front' (for prefatory pages, as commonly + labeled in print with small-case Roman numerals), 'normal' (for + standard pages, as commonly labeled in print with Arabic numerals), + or 'special' (for other pages, as commonly not labeled in any + fashion in print, such as the cover and title pages). + :attr:`klass`: Optional semantic class of this page. + :attr:`id`: Optional unique identifier for this page. + """ + TYPES = set(['front', 'normal', 'special']) + def __init__(self, name, href, type='normal', klass=None, id=None): - self.name = name + self.name = unicode(name) self.href = urlnormalize(href) - self.type = type + self.type = type if type in self.TYPES else 'normal' self.id = id self.klass = klass @@ -1002,6 +1185,7 @@ class PageList(object): self.pages = [] def add(self, name, href, type='normal', klass=None, id=None): + """Create a new page and add it to the `PageList`.""" page = self.Page(name, href, type, klass, id) self.pages.append(page) return page @@ -1015,6 +1199,12 @@ class PageList(object): def __getitem__(self, index): return self.pages[index] + + def pop(self, index=-1): + return self.pages.pop(index) + + def remove(self, page): + return self.pages.remove(page) def to_ncx(self, parent=None): plist = element(parent, NCX('pageList'), id=str(uuid.uuid4())) @@ -1040,8 +1230,33 @@ class PageList(object): class OEBBook(object): + """Representation of a book in the IDPF OEB data model.""" def __init__(self, encoding=None, pretty_print=False, logger=FauxLogger()): + """Create empty book. Optional arguments: + + :param:`encoding`: Default encoding for textual content read + from an external container. + :param:`pretty_print`: Whether or not the canonical string form + of XML markup is pretty-printed. 
+ :prama:`logger`: A Logger object to use for logging all messages + related to the processing of this book. It is accessible + via the instance data member :attr:`logger`. + + It provides the following public instance data members for + accessing various parts of the OEB data model: + + :attr:`metadata`: Metadata such as title, author name(s), etc. + :attr:`manifest`: Manifest of all files included in the book, + including MIME types and fallback information. + :attr:`spine`: In-order list of manifest items which compose + the textual content of the book. + :attr:`guide`: Collection of references to standard positions + within the text, such as the cover, preface, etc. + :attr:`toc`: Hierarchical table of contents. + :attr:`pages`: List of "pages," such as indexed to a print edition of + the same text. + """ self.encoding = encoding self.pretty_print = pretty_print self.logger = logger @@ -1057,16 +1272,19 @@ class OEBBook(object): @classmethod def generate(cls, opts): + """Generate an OEBBook instance from command-line options.""" encoding = opts.encoding pretty_print = opts.pretty_print return cls(encoding=encoding, pretty_print=pretty_print) def translate(self, text): + """Translate :param:`text` into the book's primary language.""" lang = str(self.metadata.language[0]) lang = lang.split('-', 1)[0].lower() return translate(lang, text) def decode(self, data): + """Automatically decode :param:`data` into a `unicode` object.""" if isinstance(data, unicode): return data if data[:2] in ('\xff\xfe', '\xfe\xff'): @@ -1089,6 +1307,11 @@ class OEBBook(object): return data def to_opf1(self): + """Produce OPF 1.2 representing the book's metadata and structure. + + Returns a dictionary in which the keys are MIME types and the values + are tuples of (default) filenames and lxml.etree element structures. 
+ """ package = etree.Element('package', attrib={'unique-identifier': self.uid.id}) self.metadata.to_opf1(package) @@ -1160,6 +1383,11 @@ class OEBBook(object): return ncx def to_opf2(self, page_map=False): + """Produce OPF 2.0 representing the book's metadata and structure. + + Returns a dictionary in which the keys are MIME types and the values + are tuples of (default) filenames and lxml.etree element structures. + """ results = {} package = etree.Element(OPF('package'), attrib={'version': '2.0', 'unique-identifier': self.uid.id}, From 5dca63111427af5a8caddbff0d96a63b1bc9f5fe Mon Sep 17 00:00:00 2001 From: "Marshall T. Vandegrift" Date: Wed, 11 Feb 2009 10:00:54 -0500 Subject: [PATCH 6/7] Demonstrable modularization of e-book conversion. --- src/calibre/ebooks/lit/reader.py | 1 + src/calibre/ebooks/mobi/mobiml.py | 10 ++- src/calibre/ebooks/mobi/writer.py | 40 +++++++-- src/calibre/ebooks/oeb/base.py | 11 +-- src/calibre/ebooks/oeb/factory.py | 87 +++++++++++++++++-- src/calibre/ebooks/oeb/reader.py | 24 +++++ src/calibre/ebooks/oeb/transforms/flatcss.py | 10 ++- src/calibre/ebooks/oeb/transforms/htmltoc.py | 13 ++- .../ebooks/oeb/transforms/manglecase.py | 10 ++- .../ebooks/oeb/transforms/rasterize.py | 10 ++- .../ebooks/oeb/transforms/trimmanifest.py | 10 ++- src/calibre/ebooks/oeb/writer.py | 57 +++--------- 12 files changed, 210 insertions(+), 73 deletions(-) diff --git a/src/calibre/ebooks/lit/reader.py b/src/calibre/ebooks/lit/reader.py index dd42434101..8cbb9514a8 100644 --- a/src/calibre/ebooks/lit/reader.py +++ b/src/calibre/ebooks/lit/reader.py @@ -802,6 +802,7 @@ class LitFile(object): class LitContainer(object): + """Simple Container-interface, read-only accessor for LIT files.""" def __init__(self, filename_or_stream): self._litfile = LitFile(filename_or_stream) diff --git a/src/calibre/ebooks/mobi/mobiml.py b/src/calibre/ebooks/mobi/mobiml.py index 7ecd127452..b7418a5d19 100644 --- a/src/calibre/ebooks/mobi/mobiml.py +++ 
b/src/calibre/ebooks/mobi/mobiml.py @@ -82,7 +82,15 @@ class MobiMLizer(object): def __init__(self, ignore_tables=False): self.ignore_tables = ignore_tables - def transform(self, oeb, context): + @classmethod + def config(cls, cfg): + return cfg + + @classmethod + def generate(cls, opts): + return cls() + + def __call__(self, oeb, context): oeb.logger.info('Converting XHTML to Mobipocket markup...') self.oeb = oeb self.profile = profile = context.dest diff --git a/src/calibre/ebooks/mobi/writer.py b/src/calibre/ebooks/mobi/writer.py index 380bdbf518..1b5d3ae652 100644 --- a/src/calibre/ebooks/mobi/writer.py +++ b/src/calibre/ebooks/mobi/writer.py @@ -295,6 +295,11 @@ class Serializer(object): class MobiWriter(object): COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+') + + DEFAULT_PROFILE = 'CybookG3' + + TRANSFORMS = [HTMLTOCAdder, CaseMangler, CSSFlattener, SVGRasterizer, + ManifestTrimmer, MobiMLizer] def __init__(self, compression=None, imagemax=None, prefer_author_sort=False): @@ -302,7 +307,32 @@ class MobiWriter(object): self._imagemax = imagemax or OTHER_MAX_IMAGE_SIZE self._prefer_author_sort = prefer_author_sort - def dump(self, oeb, path): + @classmethod + def config(cls, cfg): + """Add any book-writing options to the :class:`Config` object + :param:`cfg`. + """ + mobi = cfg.add_group('mobipocket', _('Mobipocket-specific options.')) + mobi('compress', ['--compress'], default=False, + help=_('Compress file text using PalmDOC compression. 
' + 'Results in smaller files, but takes a long time to run.')) + mobi('rescale_images', ['--rescale-images'], default=False, + help=_('Modify images to meet Palm device size limitations.')) + mobi('prefer_author_sort', ['--prefer-author-sort'], default=False, + help=_('When present, use the author sorting information for ' + 'generating the Mobipocket author metadata.')) + return cfg + + @classmethod + def generate(cls, opts): + """Generate a Writer instance from command-line options.""" + compression = PALMDOC if opts.compress else UNCOMPRESSED + imagemax = PALM_MAX_IMAGE_SIZE if opts.rescale_images else None + prefer_author_sort = opts.prefer_author_sort + return cls(compression=compression, imagemax=imagemax, + prefer_author_sort=prefer_author_sort) + + def __call__(self, oeb, path): if hasattr(path, 'write'): return self._dump_stream(oeb, path) with open(path, 'w+b') as stream: @@ -533,20 +563,12 @@ def config(defaults=None): c = StringConfig(defaults, desc) mobi = c.add_group('mobipocket', _('Mobipocket-specific options.')) - mobi('compress', ['--compress'], default=False, - help=_('Compress file text using PalmDOC compression. ' - 'Results in smaller files, but takes a long time to run.')) - mobi('rescale_images', ['--rescale-images'], default=False, - help=_('Modify images to meet Palm device size limitations.')) mobi('toc_title', ['--toc-title'], default=None, help=_('Title for any generated in-line table of contents.')) mobi('ignore_tables', ['--ignore-tables'], default=False, help=_('Render HTML tables as blocks of text instead of actual ' 'tables. This is neccessary if the HTML contains very large ' 'or complex tables.')) - mobi('prefer_author_sort', ['--prefer-author-sort'], default=False, - help=_('When present, use the author sorting information for ' - 'generating the Mobipocket author metadata.')) profiles = c.add_group('profiles', _('Device renderer profiles. ' 'Affects conversion of font sizes, image rescaling and rasterization ' 'of tables. 
Valid profiles are: %s.') % ', '.join(_profiles)) diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index ce16fa76e5..c9d01b03fe 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -820,8 +820,10 @@ class Manifest(object): def __iter__(self): for item in self.items: yield item - values = __iter__ + def values(self): + return list(self.items) + def __contains__(self, item): return item in self.items @@ -1134,7 +1136,7 @@ class TOC(object): node.to_opf1(tour) return tour - def to_ncx(self, parent, depth=1): + def to_ncx(self, parent): for node in self.nodes: id = node.id or unicode(uuid.uuid4()) attrib = {'id': id, 'playOrder': '0'} @@ -1143,9 +1145,8 @@ class TOC(object): point = element(parent, NCX('navPoint'), attrib=attrib) label = etree.SubElement(point, NCX('navLabel')) element(label, NCX('text')).text = node.title - href = node.href if depth > 1 else urldefrag(node.href)[0] - element(point, NCX('content'), src=href) - node.to_ncx(point, depth+1) + element(point, NCX('content'), src=node.href) + node.to_ncx(point) return parent diff --git a/src/calibre/ebooks/oeb/factory.py b/src/calibre/ebooks/oeb/factory.py index 1ce33a4f00..684451044b 100644 --- a/src/calibre/ebooks/oeb/factory.py +++ b/src/calibre/ebooks/oeb/factory.py @@ -6,20 +6,93 @@ from __future__ import with_statement __license__ = 'GPL v3' __copyright__ = '2008, Marshall T. 
Vandegrift ' -import os +import sys, os, logging +from itertools import chain from calibre.ebooks.oeb.base import OEBError from calibre.ebooks.oeb.reader import OEBReader +from calibre.ebooks.oeb.writer import OEBWriter from calibre.ebooks.lit.reader import LitReader +from calibre.ebooks.lit.writer import LitWriter +from calibre.ebooks.mobi.reader import MobiReader +from calibre.ebooks.mobi.writer import MobiWriter +from calibre.ebooks.oeb.base import Logger, OEBBook +from calibre.ebooks.oeb.profile import Context +from calibre.utils.config import Config __all__ = ['get_reader'] -READER_REGISTRY = { - '.opf': OEBReader, - '.lit': LitReader, +REGISTRY = { + '.opf': (OEBReader, None), + '.lit': (LitReader, LitWriter), + '.mobi': (MobiReader, MobiWriter), } def ReaderFactory(path): - ext = os.path.splitext(path)[1].lower() - if not ext: + if os.path.isdir(path): return OEBReader - return READER_REGISTRY[ext]() + ext = os.path.splitext(path)[1].lower() + Reader = REGISTRY.get(ext, (None, None))[0] + if Reader is None: + raise OEBError('Unknown e-book file extension %r' % ext) + return Reader + +def WriterFactory(path): + if os.path.isdir(path): + return OEBWriter + ext = os.path.splitext(path)[1].lower() + if not os.path.exists(path) and not ext: + return OEBWriter + Writer = REGISTRY.get(ext, (None, None))[1] + if Writer is None: + raise OEBError('Unknown e-book file extension %r' % ext) + return Writer + + +def option_parser(Reader, Writer): + cfg = Config('ebook-convert', _('Options to control e-book conversion.')) + Reader.config(cfg) + for Transform in chain(Reader.TRANSFORMS, Writer.TRANSFORMS): + Transform.config(cfg) + Writer.config(cfg) + parser = cfg.option_parser() + parser.add_option('--encoding', default=None, + help=_('Character encoding for input. Default is to auto detect.')) + parser.add_option('-o', '--output', default=None, + help=_('Output file. 
Default is derived from input filename.')) + parser.add_option('-p', '--pretty-print', action='store_true', + default=False, help=_('Produce more human-readable XML output.')) + parser.add_option('-v', '--verbose', default=0, action='count', + help=_('Useful for debugging.')) + return parser + +def main(argv=sys.argv): + if len(argv) < 3: + print _("Usage: ebook-convert INFILE OUTFILE [OPTIONS..]") + return 1 + inpath, outpath = argv[1], argv[2] + Reader = ReaderFactory(inpath) + Writer = WriterFactory(outpath) + parser = option_parser(Reader, Writer) + opts, args = parser.parse_args(argv[3:]) + if len(args) != 0: + parser.print_help() + return 1 + logger = Logger(logging.getLogger('ebook-convert')) + logger.setup_cli_handler(opts.verbose) + encoding = opts.encoding + pretty_print = opts.pretty_print + oeb = OEBBook(encoding=encoding, pretty_print=pretty_print, logger=logger) + context = Context(Reader.DEFAULT_PROFILE, Writer.DEFAULT_PROFILE) + reader = Reader.generate(opts) + writer = Writer.generate(opts) + transforms = [] + for Transform in chain(Reader.TRANSFORMS, Writer.TRANSFORMS): + transforms.append(Transform.generate(opts)) + reader(oeb, inpath) + for transform in transforms: + transform(oeb, context) + writer(oeb, outpath) + return 0 + +if __name__ == '__main__': + sys.exit(main()) diff --git a/src/calibre/ebooks/oeb/reader.py b/src/calibre/ebooks/oeb/reader.py index aa23ce1e96..0fce1c2b0d 100644 --- a/src/calibre/ebooks/oeb/reader.py +++ b/src/calibre/ebooks/oeb/reader.py @@ -31,15 +31,39 @@ from calibre.ptempfile import TemporaryDirectory __all__ = ['OEBReader'] class OEBReader(object): + """Read an OEBPS 1.x or OPF/OPS 2.0 file collection.""" COVER_SVG_XP = XPath('h:body//svg:svg[position() = 1]') COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]') Container = DirContainer + """Container type used to access book files. 
Override in sub-classes.""" DEFAULT_PROFILE = 'PRS505' + """Default renderer profile for content read with this Reader.""" + + TRANSFORMS = [] + """List of transforms to apply to content read with this Reader.""" + + def __init__(self): + return + @classmethod + def config(cls, cfg): + """Add any book-reading options to the :class:`Config` object + :param:`cfg`. + """ + return + + @classmethod + def generate(cls, opts): + """Generate a Reader instance from command-line options.""" + return cls() + def __call__(self, oeb, path): + """Read the book at :param:`path` into the :class:`OEBBook` object + :param:`oeb`. + """ self.oeb = oeb self.logger = oeb.logger oeb.container = self.Container(path) diff --git a/src/calibre/ebooks/oeb/transforms/flatcss.py b/src/calibre/ebooks/oeb/transforms/flatcss.py index 01afcb08e2..ac9684a624 100644 --- a/src/calibre/ebooks/oeb/transforms/flatcss.py +++ b/src/calibre/ebooks/oeb/transforms/flatcss.py @@ -94,7 +94,15 @@ class CSSFlattener(object): self.unfloat = unfloat self.untable = untable - def transform(self, oeb, context): + @classmethod + def config(cls, cfg): + return cfg + + @classmethod + def generate(cls, opts): + return cls() + + def __call__(self, oeb, context): oeb.logger.info('Flattening CSS and remapping font sizes...') self.oeb = oeb self.context = context diff --git a/src/calibre/ebooks/oeb/transforms/htmltoc.py b/src/calibre/ebooks/oeb/transforms/htmltoc.py index 5508b58ec3..0040f39c14 100644 --- a/src/calibre/ebooks/oeb/transforms/htmltoc.py +++ b/src/calibre/ebooks/oeb/transforms/htmltoc.py @@ -52,7 +52,18 @@ class HTMLTOCAdder(object): self.title = title self.style = style - def transform(self, oeb, context): + @classmethod + def config(cls, cfg): + group = cfg.add_group('htmltoc', _('HTML TOC generation options.')) + group('toc_title', ['--toc-title'], default=None, + help=_('Title for any generated in-line table of contents.')) + return cfg + + @classmethod + def generate(cls, opts): + return 
cls(title=opts.toc_title) + + def __call__(self, oeb, context): if 'toc' in oeb.guide: return oeb.logger.info('Generating in-line TOC...') diff --git a/src/calibre/ebooks/oeb/transforms/manglecase.py b/src/calibre/ebooks/oeb/transforms/manglecase.py index 3a3d91364f..c819475a4d 100644 --- a/src/calibre/ebooks/oeb/transforms/manglecase.py +++ b/src/calibre/ebooks/oeb/transforms/manglecase.py @@ -29,7 +29,15 @@ CASE_MANGLER_CSS = """ TEXT_TRANSFORMS = set(['capitalize', 'uppercase', 'lowercase']) class CaseMangler(object): - def transform(self, oeb, context): + @classmethod + def config(cls, cfg): + return cfg + + @classmethod + def generate(cls, opts): + return cls() + + def __call__(self, oeb, context): oeb.logger.info('Applying case-transforming CSS...') self.oeb = oeb self.profile = context.source diff --git a/src/calibre/ebooks/oeb/transforms/rasterize.py b/src/calibre/ebooks/oeb/transforms/rasterize.py index 12a2812898..aef5c2c98b 100644 --- a/src/calibre/ebooks/oeb/transforms/rasterize.py +++ b/src/calibre/ebooks/oeb/transforms/rasterize.py @@ -34,7 +34,15 @@ class SVGRasterizer(object): if QApplication.instance() is None: QApplication([]) - def transform(self, oeb, context): + @classmethod + def config(cls, cfg): + return cfg + + @classmethod + def generate(cls, opts): + return cls() + + def __call__(self, oeb, context): oeb.logger.info('Rasterizing SVG images...') self.oeb = oeb self.profile = context.dest diff --git a/src/calibre/ebooks/oeb/transforms/trimmanifest.py b/src/calibre/ebooks/oeb/transforms/trimmanifest.py index a1d28e5a99..a5e7042617 100644 --- a/src/calibre/ebooks/oeb/transforms/trimmanifest.py +++ b/src/calibre/ebooks/oeb/transforms/trimmanifest.py @@ -17,7 +17,15 @@ from calibre.ebooks.oeb.base import LINK_SELECTORS, CSSURL_RE from calibre.ebooks.oeb.base import urlnormalize class ManifestTrimmer(object): - def transform(self, oeb, context): + @classmethod + def config(cls, cfg): + return cfg + + @classmethod + def generate(cls, opts): + 
return cls() + + def __call__(self, oeb, context): oeb.logger.info('Trimming unused files from manifest...') used = set() hrefs = oeb.manifest.hrefs diff --git a/src/calibre/ebooks/oeb/writer.py b/src/calibre/ebooks/oeb/writer.py index c84db30c98..235965b50f 100644 --- a/src/calibre/ebooks/oeb/writer.py +++ b/src/calibre/ebooks/oeb/writer.py @@ -9,13 +9,16 @@ __copyright__ = '2008, Marshall T. Vandegrift ' import sys, os, logging from calibre.ebooks.oeb.base import OPF_MIME, xml2str from calibre.ebooks.oeb.base import Logger, DirContainer, OEBBook -from calibre.utils.config import Config __all__ = ['OEBWriter'] class OEBWriter(object): DEFAULT_PROFILE = 'PRS505' + """Default renderer profile for content written with this Writer.""" + TRANSFORMS = [] + """List of transforms to apply to content written with this Writer.""" + def __init__(self, version='2.0', page_map=False, pretty_print=False): self.version = version self.page_map = page_map @@ -23,6 +26,9 @@ class OEBWriter(object): @classmethod def config(cls, cfg): + """Add any book-writing options to the :class:`Config` object + :param:`cfg`. + """ oeb = cfg.add_group('oeb', _('OPF/NCX/etc. generation options.')) versions = ['1.2', '2.0'] oeb('opf_version', ['--opf-version'], default='2.0', choices=versions, @@ -34,6 +40,7 @@ class OEBWriter(object): @classmethod def generate(cls, opts): + """Generate a Writer instance from command-line options.""" version = opts.opf_version page_map = opts.adobe_page_map pretty_print = opts.pretty_print @@ -41,6 +48,9 @@ class OEBWriter(object): pretty_print=pretty_print) def __call__(self, oeb, path): + """Read the book in the :class:`OEBBook` object :param:`oeb` to a file + at :param:`path`. 
+ """ version = int(self.version[0]) opfname = None if os.path.splitext(path)[1].lower() == '.opf': @@ -63,48 +73,3 @@ class OEBWriter(object): href = opfname output.write(href, xml2str(data, pretty_print=pretty_print)) return - - -def option_parser(): - cfg = Config('oeb', _('Options to control OEB conversion.')) - OEBWriter.config(cfg) - parser = cfg.option_parser() - parser.add_option('--encoding', default=None, - help=_('Character encoding for files. Default is to auto detect.')) - parser.add_option('-o', '--output', default=None, - help=_('Output file. Default is derived from input filename.')) - parser.add_option('-p', '--pretty-print', action='store_true', - default=False, help=_('Produce more human-readable XML output.')) - parser.add_option('-v', '--verbose', default=0, action='count', - help=_('Useful for debugging.')) - return parser - -def any2oeb(opts, inpath): - from calibre.ebooks.oeb.factory import ReaderFactory - logger = Logger(logging.getLogger('any2oeb')) - logger.setup_cli_handler(opts.verbose) - outpath = opts.output - if outpath is None: - outpath = os.path.basename(inpath) - outpath = os.path.splitext(outpath)[0] - encoding = opts.encoding - pretty_print = opts.pretty_print - oeb = OEBBook(encoding=encoding, pretty_print=pretty_print, logger=logger) - reader = ReaderFactory(inpath) - reader(oeb, inpath) - writer = OEBWriter.generate(opts) - writer(oeb, outpath) - return 0 - -def main(argv=sys.argv): - parser = option_parser() - opts, args = parser.parse_args(argv[1:]) - if len(args) != 1: - parser.print_help() - return 1 - inpath = args[0] - retval = any2oeb(opts, inpath) - return retval - -if __name__ == '__main__': - sys.exit(main()) From 459d350af3634a8ca1fbf1498f985c5a96ec325a Mon Sep 17 00:00:00 2001 From: "Marshall T. Vandegrift" Date: Wed, 11 Feb 2009 10:32:08 -0500 Subject: [PATCH 7/7] Pretty much full utility for LIT->MOBI direct conversion pipeline. 
--- src/calibre/ebooks/mobi/mobiml.py | 7 ++++++- src/calibre/ebooks/mobi/writer.py | 26 +++++++++++++++++--------- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/src/calibre/ebooks/mobi/mobiml.py b/src/calibre/ebooks/mobi/mobiml.py index b7418a5d19..534366da7d 100644 --- a/src/calibre/ebooks/mobi/mobiml.py +++ b/src/calibre/ebooks/mobi/mobiml.py @@ -84,11 +84,16 @@ class MobiMLizer(object): @classmethod def config(cls, cfg): + group = cfg.add_group('mobiml', _('Mobipocket markup options.')) + group('ignore_tables', ['--ignore-tables'], default=False, + help=_('Render HTML tables as blocks of text instead of actual ' + 'tables. This is neccessary if the HTML contains very ' + 'large or complex tables.')) return cfg @classmethod def generate(cls, opts): - return cls() + return cls(ignore_tables=opts.ignore_tables) def __call__(self, oeb, context): oeb.logger.info('Converting XHTML to Mobipocket markup...') diff --git a/src/calibre/ebooks/mobi/writer.py b/src/calibre/ebooks/mobi/writer.py index 1b5d3ae652..86ac6f6dc9 100644 --- a/src/calibre/ebooks/mobi/writer.py +++ b/src/calibre/ebooks/mobi/writer.py @@ -292,13 +292,28 @@ class Serializer(object): buffer.seek(hoff) buffer.write('%010d' % ioff) - + +class MobiFlattener(object): + def config(self, cfg): + return cfg + + def generate(self, opts): + return self + + def __call__(self, oeb, context): + fbase = context.dest.fbase + fkey = context.dest.fnums.values() + flattener = CSSFlattener( + fbase=fbase, fkey=fkey, unfloat=True, untable=True) + return flattener(oeb, context) + + class MobiWriter(object): COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+') DEFAULT_PROFILE = 'CybookG3' - TRANSFORMS = [HTMLTOCAdder, CaseMangler, CSSFlattener, SVGRasterizer, + TRANSFORMS = [HTMLTOCAdder, CaseMangler, MobiFlattener(), SVGRasterizer, ManifestTrimmer, MobiMLizer] def __init__(self, compression=None, imagemax=None, @@ -562,13 +577,6 @@ def config(defaults=None): else: c = StringConfig(defaults, desc) - mobi = 
c.add_group('mobipocket', _('Mobipocket-specific options.')) - mobi('toc_title', ['--toc-title'], default=None, - help=_('Title for any generated in-line table of contents.')) - mobi('ignore_tables', ['--ignore-tables'], default=False, - help=_('Render HTML tables as blocks of text instead of actual ' - 'tables. This is neccessary if the HTML contains very large ' - 'or complex tables.')) profiles = c.add_group('profiles', _('Device renderer profiles. ' 'Affects conversion of font sizes, image rescaling and rasterization ' 'of tables. Valid profiles are: %s.') % ', '.join(_profiles))