From bd296fa43c8d7338b65af1c5ca7cfb02fc9c6daf Mon Sep 17 00:00:00 2001 From: "Marshall T. Vandegrift" Date: Sat, 17 Jan 2009 11:18:14 -0500 Subject: [PATCH 01/13] Restore LitReader refactoring (again) --- src/calibre/ebooks/lit/reader.py | 363 +++++++++++++++++-------------- 1 file changed, 201 insertions(+), 162 deletions(-) diff --git a/src/calibre/ebooks/lit/reader.py b/src/calibre/ebooks/lit/reader.py index 461c067382..0e7f9a1ccf 100644 --- a/src/calibre/ebooks/lit/reader.py +++ b/src/calibre/ebooks/lit/reader.py @@ -7,20 +7,24 @@ __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ' \ 'and Marshall T. Vandegrift ' -import sys, struct, cStringIO, os +import sys, struct, os import functools import re from urlparse import urldefrag +from cStringIO import StringIO +from urllib import unquote as urlunquote from lxml import etree from calibre.ebooks.lit import LitError from calibre.ebooks.lit.maps import OPF_MAP, HTML_MAP import calibre.ebooks.lit.mssha1 as mssha1 -from calibre.ebooks.oeb.base import urlnormalize +from calibre.ebooks.oeb.base import XML_PARSER, urlnormalize from calibre.ebooks import DRMError from calibre import plugins lzx, lxzerror = plugins['lzx'] msdes, msdeserror = plugins['msdes'] +__all__ = ["LitReader"] + XML_DECL = """ """ OPF_DECL = """ @@ -108,6 +112,9 @@ def consume_sized_utf8_string(bytes, zpad=False): pos += 1 return u''.join(result), bytes[pos:] +def encode(string): + return unicode(string).encode('ascii', 'xmlcharrefreplace') + class UnBinary(object): AMPERSAND_RE = re.compile( r'&(?!(?:#[0-9]+|#x[0-9a-fA-F]+|[a-zA-Z_:][a-zA-Z0-9.-_:]+);)') @@ -118,13 +125,13 @@ class UnBinary(object): def __init__(self, bin, path, manifest={}, map=HTML_MAP): self.manifest = manifest self.tag_map, self.attr_map, self.tag_to_attr_map = map - self.opf = map is OPF_MAP - self.bin = bin + self.is_html = map is HTML_MAP self.dir = os.path.dirname(path) - self.buf = cStringIO.StringIO() - self.binary_to_text() - self.raw = 
self.buf.getvalue().lstrip().decode('utf-8') + buf = StringIO() + self.binary_to_text(bin, buf) + self.raw = buf.getvalue().lstrip() self.escape_reserved() + self._tree = None def escape_reserved(self): raw = self.raw @@ -151,18 +158,28 @@ class UnBinary(object): return '/'.join(relpath) def __unicode__(self): + return self.raw.decode('utf-8') + + def __str__(self): return self.raw + + def tree(): + def fget(self): + if not self._tree: + self._tree = etree.fromstring(self.raw, parser=XML_PARSER) + return self._tree + return property(fget=fget) + tree = tree() - def binary_to_text(self, base=0, depth=0): + def binary_to_text(self, bin, buf, index=0, depth=0): tag_name = current_map = None dynamic_tag = errors = 0 in_censorship = is_goingdown = False state = 'text' - index = base flags = 0 - while index < len(self.bin): - c, index = read_utf8_char(self.bin, index) + while index < len(bin): + c, index = read_utf8_char(bin, index) oc = ord(c) if state == 'text': @@ -175,7 +192,7 @@ class UnBinary(object): c = '>>' elif c == '<': c = '<<' - self.buf.write(c.encode('ascii', 'xmlcharrefreplace')) + buf.write(encode(c)) elif state == 'get flags': if oc == 0: @@ -188,7 +205,7 @@ class UnBinary(object): state = 'text' if oc == 0 else 'get attr' if flags & FLAG_OPENING: tag = oc - self.buf.write('<') + buf.write('<') if not (flags & FLAG_CLOSING): is_goingdown = True if tag == 0x8000: @@ -205,7 +222,7 @@ class UnBinary(object): tag_name = '?'+unichr(tag)+'?' 
current_map = self.tag_to_attr_map[tag] print 'WARNING: tag %s unknown' % unichr(tag) - self.buf.write(unicode(tag_name).encode('utf-8')) + buf.write(encode(tag_name)) elif flags & FLAG_CLOSING: if depth == 0: raise LitError('Extra closing tag') @@ -217,15 +234,14 @@ class UnBinary(object): if not is_goingdown: tag_name = None dynamic_tag = 0 - self.buf.write(' />') + buf.write(' />') else: - self.buf.write('>') - index = self.binary_to_text(base=index, depth=depth+1) + buf.write('>') + index = self.binary_to_text(bin, buf, index, depth+1) is_goingdown = False if not tag_name: raise LitError('Tag ends before it begins.') - self.buf.write(u''.join( - ('')).encode('utf-8')) + buf.write(encode(u''.join(('')))) dynamic_tag = 0 tag_name = None state = 'text' @@ -245,7 +261,7 @@ class UnBinary(object): in_censorship = True state = 'get value length' continue - self.buf.write(' ' + unicode(attr).encode('utf-8') + '=') + buf.write(' ' + encode(attr) + '=') if attr in ['href', 'src']: state = 'get href length' else: @@ -253,40 +269,39 @@ class UnBinary(object): elif state == 'get value length': if not in_censorship: - self.buf.write('"') + buf.write('"') count = oc - 1 if count == 0: if not in_censorship: - self.buf.write('"') + buf.write('"') in_censorship = False state = 'get attr' continue state = 'get value' if oc == 0xffff: continue - if count < 0 or count > (len(self.bin) - index): + if count < 0 or count > (len(bin) - index): raise LitError('Invalid character count %d' % count) elif state == 'get value': if count == 0xfffe: if not in_censorship: - self.buf.write('%s"' % (oc - 1)) + buf.write('%s"' % (oc - 1)) in_censorship = False state = 'get attr' elif count > 0: if not in_censorship: - self.buf.write(c.encode( - 'ascii', 'xmlcharrefreplace')) + buf.write(encode(c)) count -= 1 if count == 0: if not in_censorship: - self.buf.write('"') + buf.write('"') in_censorship = False state = 'get attr' elif state == 'get custom length': count = oc - 1 - if count <= 0 or count 
> len(self.bin)-index: + if count <= 0 or count > len(bin)-index: raise LitError('Invalid character count %d' % count) dynamic_tag += 1 state = 'get custom' @@ -296,26 +311,26 @@ class UnBinary(object): tag_name += c count -= 1 if count == 0: - self.buf.write(unicode(tag_name).encode('utf-8')) + buf.write(encode(tag_name)) state = 'get attr' elif state == 'get attr length': count = oc - 1 - if count <= 0 or count > (len(self.bin) - index): + if count <= 0 or count > (len(bin) - index): raise LitError('Invalid character count %d' % count) - self.buf.write(' ') + buf.write(' ') state = 'get custom attr' elif state == 'get custom attr': - self.buf.write(unicode(c).encode('utf-8')) + buf.write(encode(c)) count -= 1 if count == 0: - self.buf.write('=') + buf.write('=') state = 'get value length' elif state == 'get href length': count = oc - 1 - if count <= 0 or count > (len(self.bin) - index): + if count <= 0 or count > (len(bin) - index): raise LitError('Invalid character count %d' % count) href = '' state = 'get href' @@ -329,10 +344,11 @@ class UnBinary(object): if frag: path = '#'.join((path, frag)) path = urlnormalize(path) - self.buf.write((u'"%s"' % path).encode('utf-8')) + buf.write(encode(u'"%s"' % path)) state = 'get attr' return index + class DirectoryEntry(object): def __init__(self, name, section, offset, size): self.name = name @@ -347,6 +363,7 @@ class DirectoryEntry(object): def __str__(self): return repr(self) + class ManifestItem(object): def __init__(self, original, internal, mime_type, offset, root, state): self.original = original @@ -374,65 +391,87 @@ class ManifestItem(object): % (self.internal, self.path, self.mime_type, self.offset, self.root, self.state) + def preserve(function): def wrapper(self, *args, **kwargs): - opos = self._stream.tell() + opos = self.stream.tell() try: return function(self, *args, **kwargs) finally: - self._stream.seek(opos) + self.stream.seek(opos) functools.update_wrapper(wrapper, function) return wrapper -class 
LitReader(object): +class LitFile(object): PIECE_SIZE = 16 - XML_PARSER = etree.XMLParser( - recover=True, resolve_entities=False) + + def __init__(self, filename_or_stream): + if hasattr(filename_or_stream, 'read'): + self.stream = filename_or_stream + else: + self.stream = open(filename_or_stream, 'rb') + try: + self.opf_path = os.path.splitext( + os.path.basename(self.stream.name))[0] + '.opf' + except AttributeError: + self.opf_path = 'content.opf' + if self.magic != 'ITOLITLS': + raise LitError('Not a valid LIT file') + if self.version != 1: + raise LitError('Unknown LIT version %d' % (self.version,)) + self.read_secondary_header() + self.read_header_pieces() + self.read_section_names() + self.read_manifest() + self.read_drm() + + def warn(self, msg): + print "WARNING: %s" % (msg,) def magic(): @preserve def fget(self): - self._stream.seek(0) - return self._stream.read(8) + self.stream.seek(0) + return self.stream.read(8) return property(fget=fget) magic = magic() def version(): def fget(self): - self._stream.seek(8) - return u32(self._stream.read(4)) + self.stream.seek(8) + return u32(self.stream.read(4)) return property(fget=fget) version = version() def hdr_len(): @preserve def fget(self): - self._stream.seek(12) - return int32(self._stream.read(4)) + self.stream.seek(12) + return int32(self.stream.read(4)) return property(fget=fget) hdr_len = hdr_len() def num_pieces(): @preserve def fget(self): - self._stream.seek(16) - return int32(self._stream.read(4)) + self.stream.seek(16) + return int32(self.stream.read(4)) return property(fget=fget) num_pieces = num_pieces() def sec_hdr_len(): @preserve def fget(self): - self._stream.seek(20) - return int32(self._stream.read(4)) + self.stream.seek(20) + return int32(self.stream.read(4)) return property(fget=fget) sec_hdr_len = sec_hdr_len() def guid(): @preserve def fget(self): - self._stream.seek(24) - return self._stream.read(16) + self.stream.seek(24) + return self.stream.read(16) return property(fget=fget) guid 
= guid() @@ -442,44 +481,27 @@ class LitReader(object): size = self.hdr_len \ + (self.num_pieces * self.PIECE_SIZE) \ + self.sec_hdr_len - self._stream.seek(0) - return self._stream.read(size) + self.stream.seek(0) + return self.stream.read(size) return property(fget=fget) header = header() - def __init__(self, filename_or_stream): - if hasattr(filename_or_stream, 'read'): - self._stream = filename_or_stream - else: - self._stream = open(filename_or_stream, 'rb') - if self.magic != 'ITOLITLS': - raise LitError('Not a valid LIT file') - if self.version != 1: - raise LitError('Unknown LIT version %d' % (self.version,)) - self.entries = {} - self._read_secondary_header() - self._read_header_pieces() - self._read_section_names() - self._read_manifest() - self._read_meta() - self._read_drm() - @preserve def __len__(self): - self._stream.seek(0, 2) - return self._stream.tell() + self.stream.seek(0, 2) + return self.stream.tell() @preserve - def _read_raw(self, offset, size): - self._stream.seek(offset) - return self._stream.read(size) + def read_raw(self, offset, size): + self.stream.seek(offset) + return self.stream.read(size) - def _read_content(self, offset, size): - return self._read_raw(self.content_offset + offset, size) + def read_content(self, offset, size): + return self.read_raw(self.content_offset + offset, size) - def _read_secondary_header(self): + def read_secondary_header(self): offset = self.hdr_len + (self.num_pieces * self.PIECE_SIZE) - bytes = self._read_raw(offset, self.sec_hdr_len) + bytes = self.read_raw(offset, self.sec_hdr_len) offset = int32(bytes[4:]) while offset < len(bytes): blocktype = bytes[offset:offset+4] @@ -507,21 +529,21 @@ class LitReader(object): if not hasattr(self, 'content_offset'): raise LitError('Could not figure out the content offset') - def _read_header_pieces(self): + def read_header_pieces(self): src = self.header[self.hdr_len:] for i in xrange(self.num_pieces): piece = src[i * self.PIECE_SIZE:(i + 1) * self.PIECE_SIZE] if 
u32(piece[4:]) != 0 or u32(piece[12:]) != 0: raise LitError('Piece %s has 64bit value' % repr(piece)) offset, size = u32(piece), int32(piece[8:]) - piece = self._read_raw(offset, size) + piece = self.read_raw(offset, size) if i == 0: continue # Dont need this piece elif i == 1: if u32(piece[8:]) != self.entry_chunklen or \ u32(piece[12:]) != self.entry_unknown: raise LitError('Secondary header does not match piece') - self._read_directory(piece) + self.read_directory(piece) elif i == 2: if u32(piece[8:]) != self.count_chunklen or \ u32(piece[12:]) != self.count_unknown: @@ -532,12 +554,13 @@ class LitReader(object): elif i == 4: self.piece4_guid = piece - def _read_directory(self, piece): + def read_directory(self, piece): if not piece.startswith('IFCM'): raise LitError('Header piece #1 is not main directory.') chunk_size, num_chunks = int32(piece[8:12]), int32(piece[24:28]) if (32 + (num_chunks * chunk_size)) != len(piece): - raise LitError('IFCM HEADER has incorrect length') + raise LitError('IFCM header has incorrect length') + self.entries = {} for i in xrange(num_chunks): offset = 32 + (i * chunk_size) chunk = piece[offset:offset + chunk_size] @@ -571,17 +594,17 @@ class LitReader(object): entry = DirectoryEntry(name, section, offset, size) self.entries[name] = entry - def _read_section_names(self): + def read_section_names(self): if '::DataSpace/NameList' not in self.entries: raise LitError('Lit file does not have a valid NameList') raw = self.get_file('::DataSpace/NameList') if len(raw) < 4: raise LitError('Invalid Namelist section') pos = 4 - self.num_sections = u16(raw[2:pos]) - self.section_names = [""]*self.num_sections - self.section_data = [None]*self.num_sections - for section in xrange(self.num_sections): + num_sections = u16(raw[2:pos]) + self.section_names = [""] * num_sections + self.section_data = [None] * num_sections + for section in xrange(num_sections): size = u16(raw[pos:pos+2]) pos += 2 size = size*2 + 2 @@ -591,11 +614,12 @@ class 
LitReader(object): raw[pos:pos+size].decode('utf-16-le').rstrip('\000') pos += size - def _read_manifest(self): + def read_manifest(self): if '/manifest' not in self.entries: raise LitError('Lit file does not have a valid manifest') raw = self.get_file('/manifest') self.manifest = {} + self.paths = {self.opf_path: None} while raw: slen, raw = ord(raw[0]), raw[1:] if slen == 0: break @@ -634,28 +658,9 @@ class LitReader(object): for item in mlist: if item.path[0] == '/': item.path = os.path.basename(item.path) + self.paths[item.path] = item - def _pretty_print(self, xml): - f = cStringIO.StringIO(xml.encode('utf-8')) - doc = etree.parse(f, parser=self.XML_PARSER) - pretty = etree.tostring(doc, encoding='ascii', pretty_print=True) - return XML_DECL + unicode(pretty) - - def _read_meta(self): - path = 'content.opf' - raw = self.get_file('/meta') - xml = OPF_DECL - try: - xml += unicode(UnBinary(raw, path, self.manifest, OPF_MAP)) - except LitError: - if 'PENGUIN group' not in raw: raise - print "WARNING: attempting PENGUIN malformed OPF fix" - raw = raw.replace( - 'PENGUIN group', '\x00\x01\x18\x00PENGUIN group', 1) - xml += unicode(UnBinary(raw, path, self.manifest, OPF_MAP)) - self.meta = xml - - def _read_drm(self): + def read_drm(self): self.drmlevel = 0 if '/DRMStorage/Licenses/EUL' in self.entries: self.drmlevel = 5 @@ -666,7 +671,7 @@ class LitReader(object): else: return if self.drmlevel < 5: - msdes.deskey(self._calculate_deskey(), msdes.DE1) + msdes.deskey(self.calculate_deskey(), msdes.DE1) bookkey = msdes.des(self.get_file('/DRMStorage/DRMSealed')) if bookkey[0] != '\000': raise LitError('Unable to decrypt title key!') @@ -674,7 +679,7 @@ class LitReader(object): else: raise DRMError("Cannot access DRM-protected book") - def _calculate_deskey(self): + def calculate_deskey(self): hashfiles = ['/meta', '/DRMStorage/DRMSource'] if self.drmlevel == 3: hashfiles.append('/DRMStorage/DRMBookplate') @@ -698,18 +703,18 @@ class LitReader(object): def get_file(self, 
name): entry = self.entries[name] if entry.section == 0: - return self._read_content(entry.offset, entry.size) + return self.read_content(entry.offset, entry.size) section = self.get_section(entry.section) return section[entry.offset:entry.offset+entry.size] def get_section(self, section): data = self.section_data[section] if not data: - data = self._get_section(section) + data = self.get_section_uncached(section) self.section_data[section] = data return data - def _get_section(self, section): + def get_section_uncached(self, section): name = self.section_names[section] path = '::DataSpace/Storage/' + name transform = self.get_file(path + '/Transform/List') @@ -721,29 +726,29 @@ class LitReader(object): raise LitError("ControlData is too short") guid = msguid(transform) if guid == DESENCRYPT_GUID: - content = self._decrypt(content) + content = self.decrypt(content) control = control[csize:] elif guid == LZXCOMPRESS_GUID: reset_table = self.get_file( '/'.join(('::DataSpace/Storage', name, 'Transform', LZXCOMPRESS_GUID, 'InstanceData/ResetTable'))) - content = self._decompress(content, control, reset_table) + content = self.decompress(content, control, reset_table) control = control[csize:] else: raise LitError("Unrecognized transform: %s." 
% repr(guid)) transform = transform[16:] return content - def _decrypt(self, content): + def decrypt(self, content): length = len(content) extra = length & 0x7 if extra > 0: - self._warn("content length not a multiple of block size") + self.warn("content length not a multiple of block size") content += "\0" * (8 - extra) msdes.deskey(self.bookkey, msdes.DE1) return msdes.des(content) - def _decompress(self, content, control, reset_table): + def decompress(self, content, control, reset_table): if len(control) < 32 or control[CONTROL_TAG:CONTROL_TAG+4] != "LZXC": raise LitError("Invalid ControlData tag value") if len(reset_table) < (RESET_INTERVAL + 8): @@ -784,7 +789,7 @@ class LitReader(object): result.append( lzx.decompress(content[base:size], window_bytes)) except lzx.LZXError: - self._warn("LZX decompression error; skipping chunk") + self.warn("LZX decompression error; skipping chunk") bytes_remaining -= window_bytes base = size accum += int32(reset_table[RESET_INTERVAL:]) @@ -794,55 +799,88 @@ class LitReader(object): try: result.append(lzx.decompress(content[base:], bytes_remaining)) except lzx.LZXError: - self._warn("LZX decompression error; skipping chunk") + self.warn("LZX decompression error; skipping chunk") bytes_remaining = 0 if bytes_remaining > 0: raise LitError("Failed to completely decompress section") return ''.join(result) - def get_entry_content(self, entry, pretty_print=False): - if 'spine' in entry.state: - name = '/'.join(('/data', entry.internal, 'content')) - path = entry.path - raw = self.get_file(name) - decl, map = (OPF_DECL, OPF_MAP) \ - if name == '/meta' else (HTML_DECL, HTML_MAP) - content = decl + unicode(UnBinary(raw, path, self.manifest, map)) - if pretty_print: - content = self._pretty_print(content) - content = content.encode('utf-8') - else: - name = '/'.join(('/data', entry.internal)) - content = self.get_file(name) - return content - - def extract_content(self, output_dir=os.getcwdu(), pretty_print=False): - output_dir = 
os.path.abspath(output_dir) - try: - opf_path = os.path.splitext( - os.path.basename(self._stream.name))[0] + '.opf' - except AttributeError: - opf_path = 'content.opf' - opf_path = os.path.join(output_dir, opf_path) - self._ensure_dir(opf_path) - with open(opf_path, 'wb') as f: - xml = self.meta - if pretty_print: - xml = self._pretty_print(xml) - f.write(xml.encode('utf-8')) - for entry in self.manifest.values(): - path = os.path.join(output_dir, entry.path) - self._ensure_dir(path) - with open(path, 'wb') as f: - f.write(self.get_entry_content(entry, pretty_print)) +class LitReader(object): + def __init__(self, filename_or_stream): + self._litfile = LitFile(filename_or_stream) + + def namelist(self): + return self._litfile.paths.keys() + + def exists(self, name): + return urlunquote(name) in self._litfile.paths + + def read_xml(self, name): + entry = self._litfile.paths[urlunquote(name)] if name else None + if entry is None: + content = self._read_meta() + elif 'spine' in entry.state: + internal = '/'.join(('/data', entry.internal, 'content')) + raw = self._litfile.get_file(internal) + unbin = UnBinary(raw, name, self._litfile.manifest, HTML_MAP) + content = unbin.tree + else: + raise LitError('Requested non-XML content as XML') + return content + + def read(self, name, pretty_print=False): + entry = self._litfile.paths[urlunquote(name)] if name else None + if entry is None: + meta = self._read_meta() + content = OPF_DECL + etree.tostring( + meta, encoding='ascii', pretty_print=pretty_print) + elif 'spine' in entry.state: + internal = '/'.join(('/data', entry.internal, 'content')) + raw = self._litfile.get_file(internal) + unbin = UnBinary(raw, name, self._litfile.manifest, HTML_MAP) + content = HTML_DECL + if pretty_print: + content += etree.tostring(unbin.tree, + encoding='ascii', pretty_print=True) + else: + content += str(unbin) + else: + internal = '/'.join(('/data', entry.internal)) + content = self._litfile.get_file(internal) + return content + + def 
meta(): + def fget(self): + return self.read(self._litfile.opf_path) + return property(fget=fget) + meta = meta() + def _ensure_dir(self, path): dir = os.path.dirname(path) if not os.path.isdir(dir): os.makedirs(dir) + + def extract_content(self, output_dir=os.getcwdu(), pretty_print=False): + for name in self.namelist(): + path = os.path.join(output_dir, name) + self._ensure_dir(path) + with open(path, 'wb') as f: + f.write(self.read(name, pretty_print=pretty_print)) + + def _read_meta(self): + path = 'content.opf' + raw = self._litfile.get_file('/meta') + try: + unbin = UnBinary(raw, path, self._litfile.manifest, OPF_MAP) + except LitError: + if 'PENGUIN group' not in raw: raise + print "WARNING: attempting PENGUIN malformed OPF fix" + raw = raw.replace( + 'PENGUIN group', '\x00\x01\x18\x00PENGUIN group', 1) + unbin = UnBinary(raw, path, self._litfile.manifest, OPF_MAP) + return unbin.tree - def _warn(self, msg): - print "WARNING: %s" % (msg,) def option_parser(): from calibre.utils.config import OptionParser @@ -852,7 +890,8 @@ def option_parser(): help=_('Output directory. Defaults to current directory.')) parser.add_option( '-p', '--pretty-print', default=False, action='store_true', - help=_('Legibly format extracted markup. May modify meaningful whitespace.')) + help=_('Legibly format extracted markup.' \ + ' May modify meaningful whitespace.')) parser.add_option( '--verbose', default=False, action='store_true', help=_('Useful for debugging.')) From cba3bb55e4108842d9e10ff5d9cc75e2f15b0361 Mon Sep 17 00:00:00 2001 From: "Marshall T. 
Vandegrift" Date: Sat, 17 Jan 2009 14:43:16 -0500 Subject: [PATCH 02/13] Minor clean-ups to CSS flattening --- src/calibre/ebooks/oeb/transforms/flatcss.py | 31 ++++++++------------ 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/src/calibre/ebooks/oeb/transforms/flatcss.py b/src/calibre/ebooks/oeb/transforms/flatcss.py index 7110c2db2d..375003c1a5 100644 --- a/src/calibre/ebooks/oeb/transforms/flatcss.py +++ b/src/calibre/ebooks/oeb/transforms/flatcss.py @@ -144,7 +144,8 @@ class CSSFlattener(object): value = round(value / slineh) * dlineh cssdict[property] = "%0.5fem" % (value / fsize) - def flatten_node(self, node, stylizer, names, styles, psize, left=0): + def flatten_node(self, node, stylizer, names, styles, psize, left=0, + valigned=False): if not isinstance(node.tag, basestring) \ or namespace(node.tag) != XHTML_NS: return @@ -154,18 +155,6 @@ class CSSFlattener(object): if 'align' in node.attrib: cssdict['text-align'] = node.attrib['align'] del node.attrib['align'] - if node.tag == XHTML('font'): - node.tag = XHTML('span') - if 'size' in node.attrib: - size = node.attrib['size'].strip() - if size: - fnums = self.context.source.fnums - if size[0] in ('+', '-'): - # Oh, the warcrimes - cssdict['font-size'] = fnums[3+int(size)] - else: - cssdict['font-size'] = fnums[int(size)] - del node.attrib['size'] if 'color' in node.attrib: cssdict['color'] = node.attrib['color'] del node.attrib['color'] @@ -173,7 +162,7 @@ class CSSFlattener(object): cssdict['background-color'] = node.attrib['bgcolor'] del node.attrib['bgcolor'] if cssdict: - if 'font-size' in cssdict: + if 'font-size' in cssdict or tag == 'body': fsize = self.fmap[style['font-size']] cssdict['font-size'] = "%0.5fem" % (fsize / psize) psize = fsize @@ -197,10 +186,13 @@ class CSSFlattener(object): cssdict['display'] = 'inline' else: cssdict['display'] = 'block' - if 'vertical-align' in cssdict \ - and cssdict['vertical-align'] == 'sup': - cssdict['vertical-align'] = 'super' - if self.lineh 
and 'line-height' not in cssdict: + if 'vertical-align' in cssdict: + if cssdict['vertical-align'] == 'sup': + cssdict['vertical-align'] = 'text-top' + if style['vertical-align'] != 'baseline': + cssdict['line-height'] = '0' + valigned = True + if self.lineh and 'line-height' not in cssdict and not valigned: lineh = self.lineh / psize cssdict['line-height'] = "%0.5fem" % lineh if cssdict: @@ -220,7 +212,8 @@ class CSSFlattener(object): if 'style' in node.attrib: del node.attrib['style'] for child in node: - self.flatten_node(child, stylizer, names, styles, psize, left) + self.flatten_node(child, stylizer, names, styles, psize, left, + valigned) def flatten_head(self, item, stylizer, href): html = item.data From 76de6aef24f99929957676fde5e98f86f209345b Mon Sep 17 00:00:00 2001 From: "Marshall T. Vandegrift" Date: Sun, 18 Jan 2009 21:44:43 -0500 Subject: [PATCH 03/13] Use etree.html to handle HTML entities and not UTF-8 encodings --- src/calibre/ebooks/oeb/base.py | 20 +++++++------------- src/calibre/ebooks/oeb/transforms/flatcss.py | 8 ++++---- 2 files changed, 11 insertions(+), 17 deletions(-) diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index 4248657e23..a903136610 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -15,10 +15,10 @@ from urlparse import urldefrag, urlparse, urlunparse from urllib import unquote as urlunquote import logging import re -import htmlentitydefs import uuid import copy from lxml import etree +from lxml import html from calibre import LoggingInterface XML_PARSER = etree.XMLParser(recover=True) @@ -67,14 +67,6 @@ OEB_IMAGES = set([GIF_MIME, JPEG_MIME, PNG_MIME, SVG_MIME]) MS_COVER_TYPE = 'other.ms-coverimage-standard' -recode = lambda s: s.decode('iso-8859-1').encode('ascii', 'xmlcharrefreplace') -ENTITYDEFS = dict((k, recode(v)) for k, v in htmlentitydefs.entitydefs.items()) -del ENTITYDEFS['lt'] -del ENTITYDEFS['gt'] -del ENTITYDEFS['quot'] -del ENTITYDEFS['amp'] -del 
recode - def element(parent, *args, **kwargs): if parent is not None: @@ -298,7 +290,6 @@ class Metadata(object): class Manifest(object): class Item(object): - ENTITY_RE = re.compile(r'&([a-zA-Z_:][a-zA-Z0-9.-_:]+);') NUM_RE = re.compile('^(.*)([0-9][0-9.]*)(?=[.]|$)') def __init__(self, id, href, media_type, @@ -317,9 +308,12 @@ class Manifest(object): % (self.id, self.href, self.media_type) def _force_xhtml(self, data): - repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0)) - data = self.ENTITY_RE.sub(repl, data) - data = etree.fromstring(data, parser=XML_PARSER) + try: + data = etree.fromstring(data, parser=XML_PARSER) + except etree.XMLSyntaxError: + data = html.fromstring(data, parser=XML_PARSER) + data = etree.tostring(data, encoding=unicode) + data = etree.fromstring(data, parser=XML_PARSER) if namespace(data.tag) != XHTML_NS: data.attrib['xmlns'] = XHTML_NS data = etree.tostring(data) diff --git a/src/calibre/ebooks/oeb/transforms/flatcss.py b/src/calibre/ebooks/oeb/transforms/flatcss.py index 375003c1a5..4877b28f51 100644 --- a/src/calibre/ebooks/oeb/transforms/flatcss.py +++ b/src/calibre/ebooks/oeb/transforms/flatcss.py @@ -161,11 +161,11 @@ class CSSFlattener(object): if 'bgcolor' in node.attrib: cssdict['background-color'] = node.attrib['bgcolor'] del node.attrib['bgcolor'] + if 'font-size' in cssdict or tag == 'body': + fsize = self.fmap[style['font-size']] + cssdict['font-size'] = "%0.5fem" % (fsize / psize) + psize = fsize if cssdict: - if 'font-size' in cssdict or tag == 'body': - fsize = self.fmap[style['font-size']] - cssdict['font-size'] = "%0.5fem" % (fsize / psize) - psize = fsize if self.lineh and self.fbase and tag != 'body': self.clean_edges(cssdict, style, psize) margin = style['margin-left'] From a9f4ab2346c78c63d60478036c4ddec0ececdf46 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 5 Feb 2009 17:00:04 -0800 Subject: [PATCH 04/13] Minor fixes --- src/calibre/ebooks/lrf/meta.py | 7 ++++++- src/calibre/ebooks/metadata/__init__.py 
| 13 ++++++++----- src/calibre/ebooks/metadata/cli.py | 21 +++++++++++++++++---- src/calibre/ebooks/metadata/opf.xml | 6 +++--- src/calibre/ebooks/metadata/opf2.py | 23 ++++++++++++++++++++++- 5 files changed, 56 insertions(+), 14 deletions(-) diff --git a/src/calibre/ebooks/lrf/meta.py b/src/calibre/ebooks/lrf/meta.py index 331e101ddd..322835f470 100644 --- a/src/calibre/ebooks/lrf/meta.py +++ b/src/calibre/ebooks/lrf/meta.py @@ -229,6 +229,9 @@ def get_metadata(stream): mi.author = lrf.author.strip() mi.comments = lrf.free_text.strip() mi.category = lrf.category.strip()+', '+lrf.classification.strip() + tags = [x.strip() for x in mi.category.split(',') if x.strip()] + if tags: + mi.tags = tags mi.publisher = lrf.publisher.strip() mi.cover_data = lrf.get_cover() try: @@ -624,7 +627,9 @@ def set_metadata(stream, mi): lrf.title = mi.title if mi.authors: lrf.author = ', '.join(mi.authors) - if mi.category: + if mi.tags: + lrf.category = mi.tags[0] + if getattr(mi, 'category', False): lrf.category = mi.category if mi.comments: lrf.free_text = mi.comments diff --git a/src/calibre/ebooks/metadata/__init__.py b/src/calibre/ebooks/metadata/__init__.py index 063e56190b..e3c434342a 100644 --- a/src/calibre/ebooks/metadata/__init__.py +++ b/src/calibre/ebooks/metadata/__init__.py @@ -185,7 +185,7 @@ class MetaInformation(object): @staticmethod def copy(mi): ans = MetaInformation(mi.title, mi.authors) - for attr in ('author_sort', 'title_sort', 'comments', + for attr in ('author_sort', 'title_sort', 'comments', 'category', 'publisher', 'series', 'series_index', 'rating', 'isbn', 'tags', 'cover_data', 'application_id', 'guide', 'manifest', 'spine', 'toc', 'cover', 'language', 'book_producer'): @@ -210,7 +210,7 @@ class MetaInformation(object): #: mi.cover_data = (ext, data) self.cover_data = getattr(mi, 'cover_data', (None, None)) - for x in ('author_sort', 'title_sort', 'comments', 'publisher', + for x in ('author_sort', 'title_sort', 'comments', 'category', 'publisher', 
'series', 'series_index', 'rating', 'isbn', 'language', 'application_id', 'manifest', 'toc', 'spine', 'guide', 'cover', 'book_producer', @@ -228,7 +228,7 @@ class MetaInformation(object): if mi.authors and mi.authors[0] != _('Unknown'): self.authors = mi.authors - for attr in ('author_sort', 'title_sort', 'comments', + for attr in ('author_sort', 'title_sort', 'comments', 'category', 'publisher', 'series', 'series_index', 'rating', 'isbn', 'application_id', 'manifest', 'spine', 'toc', 'cover', 'language', 'guide', 'book_producer'): @@ -251,10 +251,11 @@ class MetaInformation(object): return '%d'%x if int(x) == x else '%.2f'%x def __unicode__(self): - ans = [ fmt('Title', self.title) ] + ans = [] def fmt(x, y): ans.append(u'%-20s: %s'%(unicode(x), unicode(y))) - + + fmt('Title', self.title) if self.title_sort: fmt('Title sort', self.title_sort) if self.authors: @@ -264,6 +265,8 @@ class MetaInformation(object): fmt('Publisher', self.publisher) if getattr(self, 'book_producer', False): fmt('Book Producer', self.book_producer) + if self.category: + ans += u'Category : ' + unicode(self.category) + u'\n' if self.comments: fmt('Comments', self.comments) if self.isbn: diff --git a/src/calibre/ebooks/metadata/cli.py b/src/calibre/ebooks/metadata/cli.py index 75b541d9c9..4101f34047 100644 --- a/src/calibre/ebooks/metadata/cli.py +++ b/src/calibre/ebooks/metadata/cli.py @@ -26,6 +26,7 @@ from calibre.customize.ui import metadata_readers, metadata_writers from calibre.ebooks.metadata.meta import get_metadata, set_metadata from calibre.ebooks.metadata import string_to_authors, authors_to_sort_string, \ title_sort, MetaInformation +from calibre.ebooks.lrf.meta import LRFMetaFile from calibre import prints def config(): @@ -50,6 +51,8 @@ def config(): help=_('Set the ebook description.')) c.add_opt('publisher', ['-p', '--publisher'], help=_('Set the ebook publisher.')) + c.add_opt('category', ['--category'], + help=_('Set the book category.')) c.add_opt('series', ['-s', 
'--series'], help=_('Set the series this ebook belongs to.')) c.add_opt('series_index', ['-i', '--index'], @@ -75,6 +78,9 @@ def config(): help=_('Read metadata from the specified OPF file and use it to ' 'set metadata in the ebook. Metadata specified on the' 'command line will override metadata read from the OPF file')) + + c.add_opt('lrf_bookid', ['--lrf-bookid'], + help=_('Set the BookID in LRF files')) return c def filetypes(): @@ -102,12 +108,12 @@ def do_set_metadata(opts, mi, stream, stream_type): for pref in config().option_set.preferences: if pref.name in ('to_opf', 'from_opf', 'authors', 'title_sort', - 'author_sort', 'get_cover', 'cover', 'tags'): + 'author_sort', 'get_cover', 'cover', 'tags', + 'lrf_bookid'): continue val = getattr(opts, pref.name, None) if val is not None: - setattr(mi, pref.name, getattr()) - + setattr(mi, pref.name, val) if getattr(opts, 'authors', None) is not None: mi.authors = string_to_authors(opts.authors) mi.author_sort = authors_to_sort_string(mi.authors) @@ -158,11 +164,18 @@ def main(args=sys.argv): do_set_metadata(opts, mi, stream, stream_type) stream.seek(0) stream.flush() + lrf = None + if stream_type == 'lrf': + if opts.lrf_bookid is not None: + lrf = LRFMetaFile(stream) + lrf.book_id = opts.lrf_bookid mi = get_metadata(stream, stream_type) - prints(_('Changed metadata')+'::') + prints('\n' + _('Changed metadata') + '::') metadata = unicode(mi) metadata = '\t'+'\n\t'.join(metadata.split('\n')) prints(metadata) + if lrf is not None: + prints('\tBookID:', lrf.book_id) if opts.to_opf is not None: from calibre.ebooks.metadata.opf2 import OPFCreator diff --git a/src/calibre/ebooks/metadata/opf.xml b/src/calibre/ebooks/metadata/opf.xml index d95268f306..703e82b5c1 100644 --- a/src/calibre/ebooks/metadata/opf.xml +++ b/src/calibre/ebooks/metadata/opf.xml @@ -6,13 +6,13 @@ > - ${mi.title} + ${mi.title} ${author} - ${'%s (%s)'%(__appname__, __version__)} [http://${__appname__}.kovidgoyal.net] + ${'%s (%s)'%(__appname__, 
__version__)} [http://${__appname__}.kovidgoyal.net] ${mi.application_id} ${mi.language if mi.language else 'UND'} - ${mi.category} + ${mi.category} ${mi.comments} ${mi.publisher} ${mi.isbn} diff --git a/src/calibre/ebooks/metadata/opf2.py b/src/calibre/ebooks/metadata/opf2.py index 718d615e71..f051ad8568 100644 --- a/src/calibre/ebooks/metadata/opf2.py +++ b/src/calibre/ebooks/metadata/opf2.py @@ -414,6 +414,7 @@ class OPF(object): metadata_path = XPath('descendant::*[re:match(name(), "metadata", "i")]') metadata_elem_path = XPath('descendant::*[re:match(name(), concat($name, "$"), "i") or (re:match(name(), "meta$", "i") and re:match(@name, concat("^calibre:", $name, "$"), "i"))]') + title_path = XPath('descendant::*[re:match(name(), "title", "i")]') authors_path = XPath('descendant::*[re:match(name(), "creator", "i") and (@role="aut" or @opf:role="aut" or (not(@role) and not(@opf:role)))]') bkp_path = XPath('descendant::*[re:match(name(), "contributor", "i") and (@role="bkp" or @opf:role="bkp")]') tags_path = XPath('descendant::*[re:match(name(), "subject", "i")]') @@ -503,7 +504,7 @@ class OPF(object): def set_text(self, elem, content): if elem.tag == self.META: - elem.attib['content'] = content + elem.attrib['content'] = content else: elem.text = content @@ -645,6 +646,26 @@ class OPF(object): return property(fget=fget, fset=fset) + @apply + def title_sort(): + + def fget(self): + matches = self.title_path(self.metadata) + if matches: + for match in matches: + ans = match.get('{%s}file-as'%self.NAMESPACES['opf'], None) + if not ans: + ans = match.get('file-as', None) + if ans: + return ans + + def fset(self, val): + matches = self.title_path(self.metadata) + if matches: + matches[0].set('file-as', unicode(val)) + + return property(fget=fget, fset=fset) + @apply def tags(): From 972e2161c71e3308ed0b2dd1d277c21ab0a2af12 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 6 Feb 2009 23:35:51 -0800 Subject: [PATCH 05/13] Remove use of the apply builtin as it is 
deprecated --- src/calibre/__init__.py | 4 +- src/calibre/devices/libusb.py | 10 ++--- src/calibre/devices/prs500/books.py | 18 ++++---- src/calibre/devices/prs500/cli/main.py | 26 +++++------ src/calibre/devices/prs500/prstypes.py | 57 ++++++++++++------------- src/calibre/devices/prs505/books.py | 18 ++++---- src/calibre/devices/usbms/books.py | 9 ++-- src/calibre/ebooks/html.py | 10 ++--- src/calibre/ebooks/lrf/tags.py | 16 +++---- src/calibre/ebooks/metadata/opf.py | 14 +++--- src/calibre/ebooks/metadata/opf2.py | 42 +++++++++--------- src/calibre/ebooks/metadata/toc.py | 6 +-- src/calibre/ebooks/oeb/base.py | 26 +++++------ src/calibre/gui2/viewer/documentview.py | 52 +++++++++++----------- src/calibre/library/database.py | 6 +-- src/calibre/library/database2.py | 6 +-- 16 files changed, 160 insertions(+), 160 deletions(-) diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py index fe140df032..cb3c05c7b9 100644 --- a/src/calibre/__init__.py +++ b/src/calibre/__init__.py @@ -2,7 +2,9 @@ __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ' __docformat__ = 'restructuredtext en' -import sys, os, re, logging, time, subprocess, atexit, mimetypes +import sys, os, re, logging, time, subprocess, atexit, mimetypes, \ + __builtin__ +__builtin__.__dict__['dynamic_property'] = lambda(func): func(None) from htmlentitydefs import name2codepoint from math import floor from logging import Formatter diff --git a/src/calibre/devices/libusb.py b/src/calibre/devices/libusb.py index 226a99f239..09261e10c5 100644 --- a/src/calibre/devices/libusb.py +++ b/src/calibre/devices/libusb.py @@ -116,8 +116,8 @@ class Device(Structure): raise Error("Cannot open device") return handle.contents - @apply - def configurations(): + @dynamic_property + def configurations(self): doc = """ List of device configurations. 
See L{ConfigDescriptor} """ def fget(self): ans = [] @@ -127,8 +127,8 @@ class Device(Structure): return property(doc=doc, fget=fget) class Bus(Structure): - @apply - def device_list(): + @dynamic_property + def device_list(self): doc = \ """ Flat list of devices on this bus. @@ -360,4 +360,4 @@ def get_devices(): for dev in devices: device = (dev.device_descriptor.idVendor, dev.device_descriptor.idProduct, dev.device_descriptor.bcdDevice) ans.append(device) - return ans + return ans \ No newline at end of file diff --git a/src/calibre/devices/prs500/books.py b/src/calibre/devices/prs500/books.py index 6c57920487..d567511ec6 100644 --- a/src/calibre/devices/prs500/books.py +++ b/src/calibre/devices/prs500/books.py @@ -55,8 +55,8 @@ class Book(object): size = book_metadata_field("size", formatter=int) # When setting this attribute you must use an epoch datetime = book_metadata_field("date", formatter=strptime, setter=strftime) - @apply - def title_sorter(): + @dynamic_property + def title_sorter(self): doc = '''String to sort the title. If absent, title is returned''' def fget(self): src = self.elem.getAttribute('titleSorter').strip() @@ -67,8 +67,8 @@ class Book(object): self.elem.setAttribute('titleSorter', sortable_title(unicode(val))) return property(doc=doc, fget=fget, fset=fset) - @apply - def thumbnail(): + @dynamic_property + def thumbnail(self): doc = \ """ The thumbnail. Should be a height 68 image. @@ -88,15 +88,15 @@ class Book(object): return decode(rc) return property(fget=fget, doc=doc) - @apply - def path(): + @dynamic_property + def path(self): doc = """ Absolute path to book on device. Setting not supported. 
""" def fget(self): return self.root + self.rpath return property(fget=fget, doc=doc) - @apply - def db_id(): + @dynamic_property + def db_id(self): doc = '''The database id in the application database that this file corresponds to''' def fget(self): match = re.search(r'_(\d+)$', self.rpath.rpartition('.')[0]) @@ -378,4 +378,4 @@ class BookList(_BookList): def write(self, stream): """ Write XML representation of DOM tree to C{stream} """ - stream.write(self.document.toxml('utf-8')) + stream.write(self.document.toxml('utf-8')) \ No newline at end of file diff --git a/src/calibre/devices/prs500/cli/main.py b/src/calibre/devices/prs500/cli/main.py index dfd3eb1ed6..4a94bf41af 100755 --- a/src/calibre/devices/prs500/cli/main.py +++ b/src/calibre/devices/prs500/cli/main.py @@ -39,8 +39,8 @@ class FileFormatter(object): self.name = file.name self.path = file.path - @apply - def mode_string(): + @dynamic_property + def mode_string(self): doc=""" The mode string for this file. There are only two modes read-only and read-write """ def fget(self): mode, x = "-", "-" @@ -50,8 +50,8 @@ class FileFormatter(object): return mode return property(doc=doc, fget=fget) - @apply - def isdir_name(): + @dynamic_property + def isdir_name(self): doc='''Return self.name + '/' if self is a directory''' def fget(self): name = self.name @@ -61,8 +61,8 @@ class FileFormatter(object): return property(doc=doc, fget=fget) - @apply - def name_in_color(): + @dynamic_property + def name_in_color(self): doc=""" The name in ANSI text. 
Directories are blue, ebooks are green """ def fget(self): cname = self.name @@ -75,22 +75,22 @@ class FileFormatter(object): return cname return property(doc=doc, fget=fget) - @apply - def human_readable_size(): + @dynamic_property + def human_readable_size(self): doc=""" File size in human readable form """ def fget(self): return human_readable(self.size) return property(doc=doc, fget=fget) - @apply - def modification_time(): + @dynamic_property + def modification_time(self): doc=""" Last modified time in the Linux ls -l format """ def fget(self): return time.strftime("%Y-%m-%d %H:%M", time.localtime(self.wtime)) return property(doc=doc, fget=fget) - @apply - def creation_time(): + @dynamic_property + def creation_time(self): doc=""" Last modified time in the Linux ls -l format """ def fget(self): return time.strftime("%Y-%m-%d %H:%M", time.localtime(self.ctime)) @@ -334,4 +334,4 @@ def main(): return 0 if __name__ == '__main__': - main() + main() \ No newline at end of file diff --git a/src/calibre/devices/prs500/prstypes.py b/src/calibre/devices/prs500/prstypes.py index 4e1294fc1c..3efbfcab31 100755 --- a/src/calibre/devices/prs500/prstypes.py +++ b/src/calibre/devices/prs500/prstypes.py @@ -284,8 +284,8 @@ class Command(TransferBuffer): # Length of the data part of this packet length = field(start=12, fmt=DWORD) - @apply - def data(): + @dynamic_property + def data(self): doc = \ """ The data part of this command. Returned/set as/by a TransferBuffer. @@ -447,8 +447,8 @@ class LongCommand(Command): self.length = 16 self.command = command - @apply - def command(): + @dynamic_property + def command(self): doc = \ """ Usually carries extra information needed for the command @@ -568,8 +568,8 @@ class FileOpen(PathCommand): PathCommand.__init__(self, path, FileOpen.NUMBER, path_len_at_byte=20) self.mode = mode - @apply - def mode(): + @dynamic_property + def mode(self): doc = \ """ The file open mode. 
Is either L{FileOpen.READ} @@ -651,8 +651,8 @@ class Response(Command): raise PacketError("Response packets must have their number set to " \ + hex(0x00001000)) - @apply - def data(): + @dynamic_property + def data(self): doc = \ """ The last 3 DWORDs (12 bytes) of data in this @@ -681,43 +681,43 @@ class ListResponse(Response): PATH_NOT_FOUND = 0xffffffd7 #: Queried path is not found PERMISSION_DENIED = 0xffffffd6 #: Permission denied - @apply - def is_file(): + @dynamic_property + def is_file(self): doc = """ True iff queried path is a file """ def fget(self): return self.code == ListResponse.IS_FILE return property(doc=doc, fget=fget) - @apply - def is_invalid(): + @dynamic_property + def is_invalid(self): doc = """ True iff queried path is invalid """ def fget(self): return self.code == ListResponse.IS_INVALID return property(doc=doc, fget=fget) - @apply - def path_not_found(): + @dynamic_property + def path_not_found(self): doc = """ True iff queried path is not found """ def fget(self): return self.code == ListResponse.PATH_NOT_FOUND return property(doc=doc, fget=fget) - @apply - def permission_denied(): + @dynamic_property + def permission_denied(self): doc = """ True iff permission is denied for path operations """ def fget(self): return self.code == ListResponse.PERMISSION_DENIED return property(doc=doc, fget=fget) - @apply - def is_unmounted(): + @dynamic_property + def is_unmounted(self): doc = """ True iff queried path is unmounted (i.e. 
removed storage card) """ def fget(self): return self.code == ListResponse.IS_UNMOUNTED return property(doc=doc, fget=fget) - @apply - def is_eol(): + @dynamic_property + def is_eol(self): doc = """ True iff there are no more items in the list """ def fget(self): return self.code == ListResponse.IS_EOL @@ -759,8 +759,8 @@ class FileProperties(Answer): # 0 = default permissions, 4 = read only permissions = field(start=36, fmt=DWORD) - @apply - def is_dir(): + @dynamic_property + def is_dir(self): doc = """True if path points to a directory, False if it points to a file.""" def fget(self): @@ -776,8 +776,8 @@ class FileProperties(Answer): return property(doc=doc, fget=fget, fset=fset) - @apply - def is_readonly(): + @dynamic_property + def is_readonly(self): doc = """ Whether this file is readonly.""" def fget(self): @@ -801,8 +801,8 @@ class IdAnswer(Answer): """ Defines the structure of packets that contain identifiers for queries. """ - @apply - def id(): + @dynamic_property + def id(self): doc = \ """ The identifier. C{unsigned int} stored in 4 bytes @@ -841,8 +841,8 @@ class ListAnswer(Answer): name_length = field(start=20, fmt=DWORD) name = stringfield(name_length, start=24) - @apply - def is_dir(): + @dynamic_property + def is_dir(self): doc = \ """ True if list item points to a directory, False if it points to a file. @@ -859,4 +859,3 @@ class ListAnswer(Answer): return property(doc=doc, fget=fget, fset=fset) - diff --git a/src/calibre/devices/prs505/books.py b/src/calibre/devices/prs505/books.py index b63b089fdd..53ab374613 100644 --- a/src/calibre/devices/prs505/books.py +++ b/src/calibre/devices/prs505/books.py @@ -64,8 +64,8 @@ class Book(object): # When setting this attribute you must use an epoch datetime = book_metadata_field("date", formatter=strptime, setter=strftime) - @apply - def title_sorter(): + @dynamic_property + def title_sorter(self): doc = '''String to sort the title. 
If absent, title is returned''' def fget(self): src = self.elem.getAttribute('titleSorter').strip() @@ -76,8 +76,8 @@ class Book(object): self.elem.setAttribute('titleSorter', sortable_title(unicode(val))) return property(doc=doc, fget=fget, fset=fset) - @apply - def thumbnail(): + @dynamic_property + def thumbnail(self): doc = \ """ The thumbnail. Should be a height 68 image. @@ -99,15 +99,15 @@ class Book(object): return decode(rc) return property(fget=fget, doc=doc) - @apply - def path(): + @dynamic_property + def path(self): doc = """ Absolute path to book on device. Setting not supported. """ def fget(self): return self.mountpath + self.rpath return property(fget=fget, doc=doc) - @apply - def db_id(): + @dynamic_property + def db_id(self): doc = '''The database id in the application database that this file corresponds to''' def fget(self): match = re.search(r'_(\d+)$', self.rpath.rpartition('.')[0]) @@ -412,4 +412,4 @@ def fix_ids(main, card): regen_ids(main) regen_ids(card) - main.set_next_id(str(main.max_id()+1)) + main.set_next_id(str(main.max_id()+1)) \ No newline at end of file diff --git a/src/calibre/devices/usbms/books.py b/src/calibre/devices/usbms/books.py index fffed41549..2875c04b88 100644 --- a/src/calibre/devices/usbms/books.py +++ b/src/calibre/devices/usbms/books.py @@ -21,15 +21,15 @@ class Book(object): def __eq__(self, other): return self.path == other.path - @apply - def title_sorter(): + @dynamic_property + def title_sorter(self): doc = '''String to sort the title. 
If absent, title is returned''' def fget(self): return re.sub('^\s*A\s+|^\s*The\s+|^\s*An\s+', '', self.title).rstrip() return property(doc=doc, fget=fget) - @apply - def thumbnail(): + @dynamic_property + def thumbnail(self): return None def __str__(self): @@ -44,4 +44,3 @@ class BookList(_BookList): def set_tags(self, book, tags): pass - diff --git a/src/calibre/ebooks/html.py b/src/calibre/ebooks/html.py index 9a273c42ce..5e87351375 100644 --- a/src/calibre/ebooks/html.py +++ b/src/calibre/ebooks/html.py @@ -31,8 +31,8 @@ from cssutils import CSSParser class HTMLElement(HtmlElement): - @apply - def specified_font_size(): + @dynamic_property + def specified_font_size(self): def fget(self): ans = self.get('specified_font_size', '') @@ -47,8 +47,8 @@ class HTMLElement(HtmlElement): return property(fget=fget, fset=fset) - @apply - def computed_font_size(): + @dynamic_property + def computed_font_size(self): def fget(self): ans = self.get('computed_font_size', '') if ans == '': @@ -1148,4 +1148,4 @@ output = %s if __name__ == '__main__': - sys.exit(main()) + sys.exit(main()) \ No newline at end of file diff --git a/src/calibre/ebooks/lrf/tags.py b/src/calibre/ebooks/lrf/tags.py index c8ef312ae3..17db193e1a 100644 --- a/src/calibre/ebooks/lrf/tags.py +++ b/src/calibre/ebooks/lrf/tags.py @@ -207,32 +207,32 @@ class Tag(object): s += " at %08X, contents: %s" % (self.offset, repr(self.contents)) return s - @apply - def byte(): + @dynamic_property + def byte(self): def fget(self): if len(self.contents) != 1: raise LRFParseError("Bad parameter for tag ID: %04X" % self.id) return struct.unpack(" Date: Sat, 7 Feb 2009 10:03:00 -0500 Subject: [PATCH 06/13] Refactor OPF de-serialization into OEBReader. 
--- src/calibre/ebooks/oeb/base.py | 644 ++++-------------------------- src/calibre/ebooks/oeb/factory.py | 20 + src/calibre/ebooks/oeb/reader.py | 535 +++++++++++++++++++++++++ src/calibre/ebooks/oeb/writer.py | 107 +++++ 4 files changed, 742 insertions(+), 564 deletions(-) create mode 100644 src/calibre/ebooks/oeb/factory.py create mode 100644 src/calibre/ebooks/oeb/reader.py create mode 100644 src/calibre/ebooks/oeb/writer.py diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index 80d4797905..8eb73935a5 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -6,22 +6,18 @@ from __future__ import with_statement __license__ = 'GPL v3' __copyright__ = '2008, Marshall T. Vandegrift ' -import os, sys, re, uuid, copy -from mimetypes import types_map, guess_type +import os, sys, re, uuid +from mimetypes import types_map from collections import defaultdict -from types import StringTypes -from itertools import izip, count, chain +from itertools import count from urlparse import urldefrag, urlparse, urlunparse from urllib import unquote as urlunquote from lxml import etree, html import calibre from calibre import LoggingInterface from calibre.translations.dynamic import translate -from calibre.startup import get_lang from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.oeb.entitydefs import ENTITYDEFS -from calibre.ebooks.metadata.epub import CoverRenderer -from calibre.ptempfile import TemporaryDirectory XML_NS = 'http://www.w3.org/XML/1998/namespace' XHTML_NS = 'http://www.w3.org/1999/xhtml' @@ -39,14 +35,13 @@ NCX_NS = 'http://www.daisy.org/z3986/2005/ncx/' SVG_NS = 'http://www.w3.org/2000/svg' XLINK_NS = 'http://www.w3.org/1999/xlink' CALIBRE_NS = 'http://calibre.kovidgoyal.net/2009/metadata' -XPNSMAP = { - 'h' : XHTML_NS, 'o1' : OPF1_NS, 'o2' : OPF2_NS, - 'd09': DC09_NS, 'd10': DC10_NS, 'd11': DC11_NS, - 'xsi': XSI_NS, 'dt' : DCTERMS_NS, 'ncx': NCX_NS, - 'svg': SVG_NS, 'xl' : XLINK_NS - } 
-DC_PREFIXES = ('d11', 'd10', 'd09') - +XPNSMAP = {'h' : XHTML_NS, 'o1' : OPF1_NS, 'o2' : OPF2_NS, + 'd09': DC09_NS, 'd10': DC10_NS, 'd11': DC11_NS, + 'xsi': XSI_NS, 'dt' : DCTERMS_NS, 'ncx': NCX_NS, + 'svg': SVG_NS, 'xl' : XLINK_NS} +OPF1_NSMAP = {'dc': DC11_NS, 'oebpackage': OPF1_NS} +OPF2_NSMAP = {'opf': OPF2_NS, 'dc': DC11_NS, 'dcterms': DCTERMS_NS, + 'xsi': XSI_NS, 'calibre': CALIBRE_NS} def XML(name): return '{%s}%s' % (XML_NS, name) @@ -105,7 +100,8 @@ SVG_MIME = types_map['.svg'] BINARY_MIME = 'application/octet-stream' OEB_STYLES = set([CSS_MIME, OEB_CSS_MIME, 'text/x-oeb-css']) -OEB_DOCS = set([XHTML_MIME, 'text/html', OEB_DOC_MIME, 'text/x-oeb-document']) +OEB_DOCS = set([XHTML_MIME, 'text/html', OEB_DOC_MIME, + 'text/x-oeb-document']) OEB_RASTER_IMAGES = set([GIF_MIME, JPEG_MIME, PNG_MIME]) OEB_IMAGES = set([GIF_MIME, JPEG_MIME, PNG_MIME, SVG_MIME]) @@ -167,8 +163,9 @@ def XPath(expr): def xpath(elem, expr): return elem.xpath(expr, namespaces=XPNSMAP) -def xml2str(root): - return etree.tostring(root, encoding='utf-8', xml_declaration=True) +def xml2str(root, pretty_print=False): + return etree.tostring(root, encoding='utf-8', xml_declaration=True, + pretty_print=pretty_print) ASCII_CHARS = set(chr(x) for x in xrange(128)) UNIBYTE_CHARS = set(chr(x) for x in xrange(256)) @@ -213,16 +210,38 @@ class Logger(LoggingInterface, object): return object.__getattribute__(self, 'log_' + name) -class AbstractContainer(object): - def read_xml(self, path): - return etree.fromstring( - self.read(path), base_url=os.path.dirname(path)) +class NullContainer(object): + def read(self, path): + raise OEBError('Attempt to read from NullContainer') -class DirContainer(AbstractContainer): - def __init__(self, rootdir): - self.rootdir = unicode(rootdir) + def write(self, path): + raise OEBError('Attempt to write to NullContainer') + + def exists(self, path): + return False + + def namelist(self): + return [] + +class DirContainer(object): + def __init__(self, path): + path = 
unicode(path) + ext = os.path.splitext(path)[1].lower() + if ext == '.opf': + self.opfname = os.path.basename(path) + self.rootdir = os.path.dirname(path) + return + self.rootdir = path + for path in self.namelist(): + ext = os.path.splitext(path)[1].lower() + if ext == '.opf': + self.opfname = fname + return + self.opfname = None def read(self, path): + if path is None: + path = self.opfname path = os.path.join(self.rootdir, path) with open(urlunquote(path), 'rb') as f: return f.read() @@ -239,33 +258,14 @@ class DirContainer(AbstractContainer): path = os.path.join(self.rootdir, path) return os.path.isfile(urlunquote(path)) -class DirWriter(object): - def __init__(self, version='2.0', page_map=False): - self.version = version - self.page_map = page_map - - def dump(self, oeb, path): - version = int(self.version[0]) - opfname = None - if os.path.splitext(path)[1].lower() == '.opf': - opfname = os.path.basename(path) - path = os.path.dirname(path) - if not os.path.isdir(path): - os.mkdir(path) - output = DirContainer(path) - for item in oeb.manifest.values(): - output.write(item.href, str(item)) - if version == 1: - metadata = oeb.to_opf1() - elif version == 2: - metadata = oeb.to_opf2(page_map=self.page_map) - else: - raise OEBError("Unrecognized OPF version %r" % self.version) - for mime, (href, data) in metadata.items(): - if opfname and mime == OPF_MIME: - href = opfname - output.write(href, xml2str(data)) - return + def namelist(self): + names = [] + for root, dirs, files in os.walk(self.rootdir): + for fname in files: + fname = os.path.join(root, fname) + fname = fname.replace('\\', '/') + names.append(fname) + return names class Metadata(object): @@ -279,9 +279,6 @@ class Metadata(object): OPF_ATTRS = {'role': OPF('role'), 'file-as': OPF('file-as'), 'scheme': OPF('scheme'), 'event': OPF('event'), 'type': XSI('type'), 'lang': XML('lang'), 'id': 'id'} - OPF1_NSMAP = {'dc': DC11_NS, 'oebpackage': OPF1_NS} - OPF2_NSMAP = {'opf': OPF2_NS, 'dc': DC11_NS, 'dcterms': 
DCTERMS_NS, - 'xsi': XSI_NS, 'calibre': CALIBRE_NS} class Item(object): @@ -337,18 +334,20 @@ class Metadata(object): if attr != nsattr: attrib[nsattr] = attrib.pop(attr) - scheme = Attribute(lambda term : 'scheme' if term == OPF('meta') else OPF('scheme'), + scheme = Attribute(lambda term: 'scheme' if \ + term == OPF('meta') else OPF('scheme'), [DC('identifier'), OPF('meta')]) file_as = Attribute(OPF('file-as'), [DC('creator'), DC('contributor')]) role = Attribute(OPF('role'), [DC('creator'), DC('contributor')]) event = Attribute(OPF('event'), [DC('date')]) id = Attribute('id') - type = Attribute(XSI('type'), [DC('date'), DC('format'), DC('type')]) + type = Attribute(XSI('type'), [DC('date'), DC('format'), + DC('type')]) lang = Attribute(XML('lang'), [DC('contributor'), DC('coverage'), - DC('creator'), DC('publisher'), - DC('relation'), DC('rights'), - DC('source'), DC('subject'), - OPF('meta')]) + DC('creator'), DC('publisher'), + DC('relation'), DC('rights'), + DC('source'), DC('subject'), + OPF('meta')]) def __getitem__(self, key): return self.attrib[key] @@ -445,21 +444,19 @@ class Metadata(object): return nsmap return property(fget=fget) - @apply def _opf2_nsmap(): def fget(self): nsmap = self._nsmap - nsmap.update(self.OPF2_NSMAP) + nsmap.update(OPF2_NSMAP) return nsmap return property(fget=fget) - def to_opf1(self, parent=None): nsmap = self._opf1_nsmap nsrmap = dict((value, key) for key, value in nsmap.items()) elem = element(parent, 'metadata', nsmap=nsmap) - dcmeta = element(elem, 'dc-metadata', nsmap=self.OPF1_NSMAP) + dcmeta = element(elem, 'dc-metadata', nsmap=OPF1_NSMAP) xmeta = element(elem, 'x-metadata') for term in self.items: for item in self.items[term]: @@ -608,7 +605,7 @@ class Manifest(object): def __str__(self): data = self.data if isinstance(data, etree._Element): - return xml2str(data) + return xml2str(data, pretty_print=self.oeb.pretty_print) if isinstance(data, unicode): return data.encode('utf-8') return str(data) @@ -756,7 +753,7 @@ 
class Spine(object): self.items = [] def _linear(self, linear): - if isinstance(linear, StringTypes): + if isinstance(linear, basestring): linear = linear.lower() if linear is None or linear in ('yes', 'true'): linear = True @@ -838,7 +835,7 @@ class Guide(object): ('text', __('Main Text'))] TYPES = set(t for t, _ in _TYPES_TITLES) TITLES = dict(_TYPES_TITLES) - ORDER = dict((t, i) for (t, _), i in izip(_TYPES_TITLES, count(0))) + ORDER = dict((t, i) for i, (t, _) in enumerate(_TYPES_TITLES)) def __init__(self, oeb, type, title, href): self.oeb = oeb @@ -1044,493 +1041,25 @@ class PageList(object): class OEBBook(object): - COVER_SVG_XP = XPath('h:body//svg:svg[position() = 1]') - COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]') - - def __init__(self, opfpath=None, container=None, encoding=None, - logger=FauxLogger()): - if opfpath and not container: - container = DirContainer(os.path.dirname(opfpath)) - opfpath = os.path.basename(opfpath) - self.container = container + def __init__(self, encoding=None, pretty_print=False, logger=FauxLogger()): self.encoding = encoding + self.pretty_print = pretty_print self.logger = logger - if opfpath or container: - opf = self._read_opf(opfpath) - self._all_from_opf(opf) - - def _clean_opf(self, opf): - nsmap = {} - for elem in opf.iter(tag=etree.Element): - nsmap.update(elem.nsmap) - for elem in opf.iter(tag=etree.Element): - if namespace(elem.tag) in ('', OPF1_NS): - elem.tag = OPF(barename(elem.tag)) - nsmap.update(Metadata.OPF2_NSMAP) - attrib = dict(opf.attrib) - nroot = etree.Element(OPF('package'), - nsmap={None: OPF2_NS}, attrib=attrib) - metadata = etree.SubElement(nroot, OPF('metadata'), nsmap=nsmap) - ignored = (OPF('dc-metadata'), OPF('x-metadata')) - for elem in xpath(opf, 'o2:metadata//*'): - if elem.tag in ignored: - continue - if namespace(elem.tag) in DC_NSES: - tag = barename(elem.tag).lower() - elem.tag = '{%s}%s' % (DC11_NS, tag) - metadata.append(elem) - for element in xpath(opf, 
'o2:metadata//o2:meta'): - metadata.append(element) - for tag in ('o2:manifest', 'o2:spine', 'o2:tours', 'o2:guide'): - for element in xpath(opf, tag): - nroot.append(element) - return nroot - - def _read_opf(self, opfpath): - data = self.container.read(opfpath) - data = self.decode(data) - data = XMLDECL_RE.sub('', data) - try: - opf = etree.fromstring(data) - except etree.XMLSyntaxError: - repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0)) - data = ENTITY_RE.sub(repl, data) - opf = etree.fromstring(data) - self.logger.warn('OPF contains invalid HTML named entities') - ns = namespace(opf.tag) - if ns not in ('', OPF1_NS, OPF2_NS): - raise OEBError('Invalid namespace %r for OPF document' % ns) - opf = self._clean_opf(opf) - return opf - - def _metadata_from_opf(self, opf): - uid = opf.get('unique-identifier', None) + self.version = '2.0' + self.container = NullContainer() + self.metadata = Metadata(self) self.uid = None - self.metadata = metadata = Metadata(self) - for elem in xpath(opf, '/o2:package/o2:metadata//*'): - term = elem.tag - value = elem.text - attrib = dict(elem.attrib) - nsmap = elem.nsmap - if term == OPF('meta'): - term = qname(attrib.pop('name', None), nsmap) - value = attrib.pop('content', None) - if value: - value = COLLAPSE_RE.sub(' ', value.strip()) - if term and (value or attrib): - metadata.add(term, value, attrib, nsmap=nsmap) - haveuuid = haveid = False - for ident in metadata.identifier: - if unicode(ident).startswith('urn:uuid:'): - haveuuid = True - if 'id' in ident.attrib: - haveid = True - if not (haveuuid and haveid): - bookid = "urn:uuid:%s" % str(uuid.uuid4()) - metadata.add('identifier', bookid, id='calibre-uuid') - if uid is None: - self.logger.warn(u'Unique-identifier not specified') - for item in metadata.identifier: - if not item.id: - continue - if uid is None or item.id == uid: - self.uid = item - break - else: - self.logger.warn(u'Unique-identifier %r not found' % uid) - for ident in metadata.identifier: - if 'id' in 
ident.attrib: - self.uid = metadata.identifier[0] - break - if not metadata.language: - self.logger.warn(u'Language not specified') - metadata.add('language', get_lang()) - if not metadata.creator: - self.logger.warn('Creator not specified') - metadata.add('creator', self.translate(__('Unknown'))) - if not metadata.title: - self.logger.warn('Title not specified') - metadata.add('title', self.translate(__('Unknown'))) - - def _manifest_add_missing(self): - manifest = self.manifest - known = set(manifest.hrefs) - unchecked = set(manifest.values()) - while unchecked: - new = set() - for item in unchecked: - if (item.media_type in OEB_DOCS or - item.media_type[-4:] in ('/xml', '+xml')) and \ - item.data is not None: - hrefs = [sel(item.data) for sel in LINK_SELECTORS] - for href in chain(*hrefs): - href, _ = urldefrag(href) - if not href: - continue - href = item.abshref(urlnormalize(href)) - scheme = urlparse(href).scheme - if not scheme and href not in known: - new.add(href) - elif item.media_type in OEB_STYLES: - for match in CSSURL_RE.finditer(item.data): - href, _ = urldefrag(match.group('url')) - href = item.abshref(urlnormalize(href)) - scheme = urlparse(href).scheme - if not scheme and href not in known: - new.add(href) - unchecked.clear() - for href in new: - known.add(href) - if not self.container.exists(href): - self.logger.warn('Referenced file %r not found' % href) - continue - self.logger.warn('Referenced file %r not in manifest' % href) - id, _ = manifest.generate(id='added') - guessed = guess_type(href)[0] - media_type = guessed or BINARY_MIME - added = manifest.add(id, href, media_type) - unchecked.add(added) - - def _manifest_from_opf(self, opf): - self.manifest = manifest = Manifest(self) - for elem in xpath(opf, '/o2:package/o2:manifest/o2:item'): - id = elem.get('id') - href = elem.get('href') - media_type = elem.get('media-type', None) - if media_type is None: - media_type = elem.get('mediatype', None) - if media_type is None or media_type == 
'text/xml': - guessed = guess_type(href)[0] - media_type = guessed or media_type or BINARY_MIME - fallback = elem.get('fallback') - if href in manifest.hrefs: - self.logger.warn(u'Duplicate manifest entry for %r' % href) - continue - if not self.container.exists(href): - self.logger.warn(u'Manifest item %r not found' % href) - continue - if id in manifest.ids: - self.logger.warn(u'Duplicate manifest id %r' % id) - id, href = manifest.generate(id, href) - manifest.add(id, href, media_type, fallback) - self._manifest_add_missing() - - def _spine_add_extra(self): - manifest = self.manifest - spine = self.spine - unchecked = set(spine) - selector = XPath('h:body//h:a/@href') - extras = set() - while unchecked: - new = set() - for item in unchecked: - if item.media_type not in OEB_DOCS: - # TODO: handle fallback chains - continue - for href in selector(item.data): - href, _ = urldefrag(href) - if not href: - continue - href = item.abshref(urlnormalize(href)) - if href not in manifest.hrefs: - continue - found = manifest.hrefs[href] - if found.media_type not in OEB_DOCS or \ - found in spine or found in extras: - continue - new.add(found) - extras.update(new) - unchecked = new - version = int(self.version[0]) - for item in sorted(extras): - if version >= 2: - self.logger.warn( - 'Spine-referenced file %r not in spine' % item.href) - spine.add(item, linear=False) - - def _spine_from_opf(self, opf): - self.spine = spine = Spine(self) - for elem in xpath(opf, '/o2:package/o2:spine/o2:itemref'): - idref = elem.get('idref') - if idref not in self.manifest: - self.logger.warn(u'Spine item %r not found' % idref) - continue - item = self.manifest[idref] - spine.add(item, elem.get('linear')) - if len(spine) == 0: - raise OEBError("Spine is empty") - self._spine_add_extra() - - def _guide_from_opf(self, opf): - self.guide = guide = Guide(self) - for elem in xpath(opf, '/o2:package/o2:guide/o2:reference'): - href = elem.get('href') - path = urldefrag(href)[0] - if path not in 
self.manifest.hrefs: - self.logger.warn(u'Guide reference %r not found' % href) - continue - guide.add(elem.get('type'), elem.get('title'), href) - - def _find_ncx(self, opf): - result = xpath(opf, '/o2:package/o2:spine/@toc') - if result: - id = result[0] - if id not in self.manifest.ids: - return None - item = self.manifest.ids[id] - self.manifest.remove(item) - return item - for item in self.manifest.values(): - if item.media_type == NCX_MIME: - self.manifest.remove(item) - return item - return None - - def _toc_from_navpoint(self, item, toc, navpoint): - children = xpath(navpoint, 'ncx:navPoint') - for child in children: - title = ''.join(xpath(child, 'ncx:navLabel/ncx:text/text()')) - title = COLLAPSE_RE.sub(' ', title.strip()) - href = xpath(child, 'ncx:content/@src') - if not title or not href: - continue - href = item.abshref(urlnormalize(href[0])) - path, _ = urldefrag(href) - if path not in self.manifest.hrefs: - self.logger.warn('TOC reference %r not found' % href) - continue - id = child.get('id') - klass = child.get('class') - node = toc.add(title, href, id=id, klass=klass) - self._toc_from_navpoint(item, node, child) - - def _toc_from_ncx(self, item): - if item is None: - return False - ncx = item.data - title = ''.join(xpath(ncx, 'ncx:docTitle/ncx:text/text()')) - title = COLLAPSE_RE.sub(' ', title.strip()) - title = title or unicode(self.metadata.title[0]) - self.toc = toc = TOC(title) - navmaps = xpath(ncx, 'ncx:navMap') - for navmap in navmaps: - self._toc_from_navpoint(item, toc, navmap) - return True - - def _toc_from_tour(self, opf): - result = xpath(opf, 'o2:tours/o2:tour') - if not result: - return False - tour = result[0] - self.toc = toc = TOC(tour.get('title')) - sites = xpath(tour, 'o2:site') - for site in sites: - title = site.get('title') - href = site.get('href') - if not title or not href: - continue - path, _ = urldefrag(urlnormalize(href)) - if path not in self.manifest.hrefs: - self.logger.warn('TOC reference %r not found' % href) 
- continue - id = site.get('id') - toc.add(title, href, id=id) - return True - - def _toc_from_html(self, opf): - if 'toc' not in self.guide: - return False - self.toc = toc = TOC() - itempath, frag = urldefrag(self.guide['toc'].href) - item = self.manifest.hrefs[itempath] - html = item.data - if frag: - elems = xpath(html, './/*[@id="%s"]' % frag) - if not elems: - elems = xpath(html, './/*[@name="%s"]' % frag) - elem = elems[0] if elems else html - while elem != html and not xpath(elem, './/h:a[@href]'): - elem = elem.getparent() - html = elem - titles = defaultdict(list) - order = [] - for anchor in xpath(html, './/h:a[@href]'): - href = anchor.attrib['href'] - href = item.abshref(urlnormalize(href)) - path, frag = urldefrag(href) - if path not in self.manifest.hrefs: - continue - title = ' '.join(xpath(anchor, './/text()')) - title = COLLAPSE_RE.sub(' ', title.strip()) - if href not in titles: - order.append(href) - titles[href].append(title) - for href in order: - toc.add(' '.join(titles[href]), href) - return True - - def _toc_from_spine(self, opf): - self.toc = toc = TOC() - titles = [] - headers = [] - for item in self.spine: - if not item.linear: continue - html = item.data - title = ''.join(xpath(html, '/h:html/h:head/h:title/text()')) - title = COLLAPSE_RE.sub(' ', title.strip()) - if title: - titles.append(title) - headers.append('(unlabled)') - for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'): - expr = '/h:html/h:body//h:%s[position()=1]/text()' - header = ''.join(xpath(html, expr % tag)) - header = COLLAPSE_RE.sub(' ', header.strip()) - if header: - headers[-1] = header - break - use = titles - if len(titles) > len(set(titles)): - use = headers - for title, item in izip(use, self.spine): - if not item.linear: continue - toc.add(title, item.href) - return True - - def _toc_from_opf(self, opf, item): - if self._toc_from_ncx(item): return - if self._toc_from_tour(opf): return - self.logger.warn('No metadata table of contents found') - if 
self._toc_from_html(opf): return - self._toc_from_spine(opf) - - def _pages_from_ncx(self, opf, item): - if item is None: - return False - ncx = item.data - ptargets = xpath(ncx, 'ncx:pageList/ncx:pageTarget') - if not ptargets: - return False - pages = self.pages = PageList() - for ptarget in ptargets: - name = ''.join(xpath(ptarget, 'ncx:navLabel/ncx:text/text()')) - name = COLLAPSE_RE.sub(' ', name.strip()) - href = xpath(ptarget, 'ncx:content/@src') - if not href: - continue - href = item.abshref(urlnormalize(href[0])) - id = ptarget.get('id') - type = ptarget.get('type', 'normal') - klass = ptarget.get('class') - pages.add(name, href, type=type, id=id, klass=klass) - return True - - def _find_page_map(self, opf): - result = xpath(opf, '/o2:package/o2:spine/@page-map') - if result: - id = result[0] - if id not in self.manifest.ids: - return None - item = self.manifest.ids[id] - self.manifest.remove(item) - return item - for item in self.manifest.values(): - if item.media_type == PAGE_MAP_MIME: - self.manifest.remove(item) - return item - return None - - def _pages_from_page_map(self, opf): - item = self._find_page_map(opf) - if item is None: - return False - pmap = item.data - pages = self.pages = PageList() - for page in xpath(pmap, 'o2:page'): - name = page.get('name', '') - href = page.get('href') - if not href: - continue - name = COLLAPSE_RE.sub(' ', name.strip()) - href = item.abshref(urlnormalize(href)) - type = 'normal' - if not name: - type = 'special' - elif name.lower().strip('ivxlcdm') == '': - type = 'front' - pages.add(name, href, type=type) - return True - - def _pages_from_opf(self, opf, item): - if self._pages_from_ncx(opf, item): return - if self._pages_from_page_map(opf): return + self.manifest = Manifest(self) + self.spine = Spine(self) + self.guide = Guide(self) + self.toc = TOC() self.pages = PageList() - return - - def _cover_from_html(self, hcover): - with TemporaryDirectory('_html_cover') as tdir: - writer = DirWriter() - 
writer.dump(self, tdir) - path = os.path.join(tdir, urlunquote(hcover.href)) - renderer = CoverRenderer(path) - data = renderer.image_data - id, href = self.manifest.generate('cover', 'cover.jpeg') - item = self.manifest.add(id, href, JPEG_MIME, data=data) - return item - - def _locate_cover_image(self): - if self.metadata.cover: - id = str(self.metadata.cover[0]) - item = self.manifest.ids.get(id, None) - if item is not None and item.media_type in OEB_IMAGES: - return item - else: - self.logger.warn('Invalid cover image @id %r' % id) - hcover = self.spine[0] - if 'cover' in self.guide: - href = self.guide['cover'].href - item = self.manifest.hrefs[href] - media_type = item.media_type - if media_type in OEB_IMAGES: - return item - elif media_type in OEB_DOCS: - hcover = item - html = hcover.data - if MS_COVER_TYPE in self.guide: - href = self.guide[MS_COVER_TYPE].href - item = self.manifest.hrefs.get(href, None) - if item is not None and item.media_type in OEB_IMAGES: - return item - if self.COVER_SVG_XP(html): - svg = copy.deepcopy(self.COVER_SVG_XP(html)[0]) - href = os.path.splitext(hcover.href)[0] + '.svg' - id, href = self.manifest.generate(hcover.id, href) - item = self.manifest.add(id, href, SVG_MIME, data=svg) - return item - if self.COVER_OBJECT_XP(html): - object = self.COVER_OBJECT_XP(html)[0] - href = hcover.abshref(object.get('data')) - item = self.manifest.hrefs.get(href, None) - if item is not None and item.media_type in OEB_IMAGES: - return item - return self._cover_from_html(hcover) - - def _ensure_cover_image(self): - cover = self._locate_cover_image() - if self.metadata.cover: - self.metadata.cover[0].value = cover.id - return - self.metadata.add('cover', cover.id) - - def _all_from_opf(self, opf): - self.version = opf.get('version', '1.2') - self._metadata_from_opf(opf) - self._manifest_from_opf(opf) - self._spine_from_opf(opf) - self._guide_from_opf(opf) - item = self._find_ncx(opf) - self._toc_from_opf(opf, item) - self._pages_from_opf(opf, 
item) - self._ensure_cover_image() + + @classmethod + def generate(cls, opts): + encoding = opts.encoding + pretty_print = opts.pretty_print + return cls(encoding=encoding, pretty_print=pretty_print) def translate(self, text): lang = str(self.metadata.language[0]) @@ -1652,16 +1181,3 @@ class OEBBook(object): spine.attrib['page-map'] = id results[PAGE_MAP_MIME] = (href, self.pages.to_page_map()) return results - - -def main(argv=sys.argv): - for arg in argv[1:]: - oeb = OEBBook(arg) - for name, doc in oeb.to_opf1().values(): - print etree.tostring(doc, pretty_print=True) - for name, doc in oeb.to_opf2(page_map=True).values(): - print etree.tostring(doc, pretty_print=True) - return 0 - -if __name__ == '__main__': - sys.exit(main()) diff --git a/src/calibre/ebooks/oeb/factory.py b/src/calibre/ebooks/oeb/factory.py new file mode 100644 index 0000000000..dcb0942e85 --- /dev/null +++ b/src/calibre/ebooks/oeb/factory.py @@ -0,0 +1,20 @@ +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2008, Marshall T. Vandegrift ' + +import os +from calibre.ebooks.oeb.base import OEBError +from calibre.ebooks.oeb.reader import OEBReader + +__all__ = ['get_reader'] + +READER_REGISTRY = { + '.opf': OEBReader, + } + +def ReaderFactory(path): + ext = os.path.splitext(path)[1].lower() + if not ext: + return OEBReader + return READER_REGISTRY[ext]() diff --git a/src/calibre/ebooks/oeb/reader.py b/src/calibre/ebooks/oeb/reader.py new file mode 100644 index 0000000000..2d22ff0cd2 --- /dev/null +++ b/src/calibre/ebooks/oeb/reader.py @@ -0,0 +1,535 @@ +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2008, Marshall T. 
Vandegrift ' + +import sys, os, uuid, copy +from itertools import izip, chain +from urlparse import urldefrag, urlparse +from urllib import unquote as urlunquote +from mimetypes import guess_type +from collections import defaultdict +from lxml import etree +from calibre.ebooks.oeb.base import OPF1_NS, OPF2_NS, OPF2_NSMAP, DC11_NS, \ + DC_NSES, OPF +from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES, OEB_IMAGES, \ + PAGE_MAP_MIME, JPEG_MIME, NCX_MIME, SVG_MIME +from calibre.ebooks.oeb.base import XMLDECL_RE, COLLAPSE_RE, CSSURL_RE, \ + ENTITY_RE, LINK_SELECTORS, MS_COVER_TYPE +from calibre.ebooks.oeb.base import namespace, barename, qname, XPath, xpath +from calibre.ebooks.oeb.base import urlnormalize, xml2str +from calibre.ebooks.oeb.base import OEBError, OEBBook, DirContainer +from calibre.ebooks.oeb.writer import OEBWriter +from calibre.ebooks.oeb.entitydefs import ENTITYDEFS +from calibre.ebooks.metadata.epub import CoverRenderer +from calibre.startup import get_lang +from calibre.ptempfile import TemporaryDirectory + +__all__ = ['OEBReader'] + +class OEBReader(object): + + COVER_SVG_XP = XPath('h:body//svg:svg[position() = 1]') + COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]') + + Container = DirContainer + + DEFAULT_PROFILE = 'PRS505' + + def __call__(self, oeb, path): + self.oeb = oeb + self.logger = oeb.logger + oeb.container = self.Container(path) + opf = self._read_opf() + self._all_from_opf(opf) + return oeb + + def _clean_opf(self, opf): + nsmap = {} + for elem in opf.iter(tag=etree.Element): + nsmap.update(elem.nsmap) + for elem in opf.iter(tag=etree.Element): + if namespace(elem.tag) in ('', OPF1_NS): + elem.tag = OPF(barename(elem.tag)) + nsmap.update(OPF2_NSMAP) + attrib = dict(opf.attrib) + nroot = etree.Element(OPF('package'), + nsmap={None: OPF2_NS}, attrib=attrib) + metadata = etree.SubElement(nroot, OPF('metadata'), nsmap=nsmap) + ignored = (OPF('dc-metadata'), OPF('x-metadata')) + for elem in xpath(opf, 
'o2:metadata//*'): + if elem.tag in ignored: + continue + if namespace(elem.tag) in DC_NSES: + tag = barename(elem.tag).lower() + elem.tag = '{%s}%s' % (DC11_NS, tag) + metadata.append(elem) + for element in xpath(opf, 'o2:metadata//o2:meta'): + metadata.append(element) + for tag in ('o2:manifest', 'o2:spine', 'o2:tours', 'o2:guide'): + for element in xpath(opf, tag): + nroot.append(element) + return nroot + + def _read_opf(self): + data = self.oeb.container.read(None) + data = self.oeb.decode(data) + data = XMLDECL_RE.sub('', data) + try: + opf = etree.fromstring(data) + except etree.XMLSyntaxError: + repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0)) + data = ENTITY_RE.sub(repl, data) + opf = etree.fromstring(data) + self.logger.warn('OPF contains invalid HTML named entities') + ns = namespace(opf.tag) + if ns not in ('', OPF1_NS, OPF2_NS): + raise OEBError('Invalid namespace %r for OPF document' % ns) + opf = self._clean_opf(opf) + return opf + + def _metadata_from_opf(self, opf): + uid = opf.get('unique-identifier', None) + self.oeb.uid = None + metadata = self.oeb.metadata + for elem in xpath(opf, '/o2:package/o2:metadata//*'): + term = elem.tag + value = elem.text + attrib = dict(elem.attrib) + nsmap = elem.nsmap + if term == OPF('meta'): + term = qname(attrib.pop('name', None), nsmap) + value = attrib.pop('content', None) + if value: + value = COLLAPSE_RE.sub(' ', value.strip()) + if term and (value or attrib): + metadata.add(term, value, attrib, nsmap=nsmap) + haveuuid = haveid = False + for ident in metadata.identifier: + if unicode(ident).startswith('urn:uuid:'): + haveuuid = True + if 'id' in ident.attrib: + haveid = True + if not (haveuuid and haveid): + bookid = "urn:uuid:%s" % str(uuid.uuid4()) + metadata.add('identifier', bookid, id='calibre-uuid') + if uid is None: + self.logger.warn(u'Unique-identifier not specified') + for item in metadata.identifier: + if not item.id: + continue + if uid is None or item.id == uid: + self.oeb.uid = item + 
break + else: + self.logger.warn(u'Unique-identifier %r not found' % uid) + for ident in metadata.identifier: + if 'id' in ident.attrib: + self.oeb.uid = metadata.identifier[0] + break + if not metadata.language: + self.logger.warn(u'Language not specified') + metadata.add('language', get_lang()) + if not metadata.creator: + self.logger.warn('Creator not specified') + metadata.add('creator', self.oeb.translate(__('Unknown'))) + if not metadata.title: + self.logger.warn('Title not specified') + metadata.add('title', self.oeb.translate(__('Unknown'))) + + def _manifest_add_missing(self): + manifest = self.oeb.manifest + known = set(manifest.hrefs) + unchecked = set(manifest.values()) + while unchecked: + new = set() + for item in unchecked: + if (item.media_type in OEB_DOCS or + item.media_type[-4:] in ('/xml', '+xml')) and \ + item.data is not None: + hrefs = [sel(item.data) for sel in LINK_SELECTORS] + for href in chain(*hrefs): + href, _ = urldefrag(href) + if not href: + continue + href = item.abshref(urlnormalize(href)) + scheme = urlparse(href).scheme + if not scheme and href not in known: + new.add(href) + elif item.media_type in OEB_STYLES: + for match in CSSURL_RE.finditer(item.data): + href, _ = urldefrag(match.group('url')) + href = item.abshref(urlnormalize(href)) + scheme = urlparse(href).scheme + if not scheme and href not in known: + new.add(href) + unchecked.clear() + for href in new: + known.add(href) + if not self.oeb.container.exists(href): + self.logger.warn('Referenced file %r not found' % href) + continue + self.logger.warn('Referenced file %r not in manifest' % href) + id, _ = manifest.generate(id='added') + guessed = guess_type(href)[0] + media_type = guessed or BINARY_MIME + added = manifest.add(id, href, media_type) + unchecked.add(added) + + def _manifest_from_opf(self, opf): + manifest = self.oeb.manifest + for elem in xpath(opf, '/o2:package/o2:manifest/o2:item'): + id = elem.get('id') + href = elem.get('href') + media_type = 
elem.get('media-type', None) + if media_type is None: + media_type = elem.get('mediatype', None) + if media_type is None or media_type == 'text/xml': + guessed = guess_type(href)[0] + media_type = guessed or media_type or BINARY_MIME + fallback = elem.get('fallback') + if href in manifest.hrefs: + self.logger.warn(u'Duplicate manifest entry for %r' % href) + continue + if not self.oeb.container.exists(href): + self.logger.warn(u'Manifest item %r not found' % href) + continue + if id in manifest.ids: + self.logger.warn(u'Duplicate manifest id %r' % id) + id, href = manifest.generate(id, href) + manifest.add(id, href, media_type, fallback) + self._manifest_add_missing() + + def _spine_add_extra(self): + manifest = self.oeb.manifest + spine = self.oeb.spine + unchecked = set(spine) + selector = XPath('h:body//h:a/@href') + extras = set() + while unchecked: + new = set() + for item in unchecked: + if item.media_type not in OEB_DOCS: + # TODO: handle fallback chains + continue + for href in selector(item.data): + href, _ = urldefrag(href) + if not href: + continue + href = item.abshref(urlnormalize(href)) + if href not in manifest.hrefs: + continue + found = manifest.hrefs[href] + if found.media_type not in OEB_DOCS or \ + found in spine or found in extras: + continue + new.add(found) + extras.update(new) + unchecked = new + version = int(self.oeb.version[0]) + for item in sorted(extras): + if version >= 2: + self.logger.warn( + 'Spine-referenced file %r not in spine' % item.href) + spine.add(item, linear=False) + + def _spine_from_opf(self, opf): + spine = self.oeb.spine + manifest = self.oeb.manifest + for elem in xpath(opf, '/o2:package/o2:spine/o2:itemref'): + idref = elem.get('idref') + if idref not in manifest.ids: + self.logger.warn(u'Spine item %r not found' % idref) + continue + item = manifest.ids[idref] + spine.add(item, elem.get('linear')) + if len(spine) == 0: + raise OEBError("Spine is empty") + self._spine_add_extra() + + def _guide_from_opf(self, opf): + 
guide = self.oeb.guide + manifest = self.oeb.manifest + for elem in xpath(opf, '/o2:package/o2:guide/o2:reference'): + href = elem.get('href') + path = urldefrag(href)[0] + if path not in manifest.hrefs: + self.logger.warn(u'Guide reference %r not found' % href) + continue + guide.add(elem.get('type'), elem.get('title'), href) + + def _find_ncx(self, opf): + result = xpath(opf, '/o2:package/o2:spine/@toc') + if result: + id = result[0] + if id not in self.oeb.manifest.ids: + return None + item = self.oeb.manifest.ids[id] + self.oeb.manifest.remove(item) + return item + for item in self.oeb.manifest.values(): + if item.media_type == NCX_MIME: + self.oeb.manifest.remove(item) + return item + return None + + def _toc_from_navpoint(self, item, toc, navpoint): + children = xpath(navpoint, 'ncx:navPoint') + for child in children: + title = ''.join(xpath(child, 'ncx:navLabel/ncx:text/text()')) + title = COLLAPSE_RE.sub(' ', title.strip()) + href = xpath(child, 'ncx:content/@src') + if not title or not href: + continue + href = item.abshref(urlnormalize(href[0])) + path, _ = urldefrag(href) + if path not in self.oeb.manifest.hrefs: + self.logger.warn('TOC reference %r not found' % href) + continue + id = child.get('id') + klass = child.get('class') + node = toc.add(title, href, id=id, klass=klass) + self._toc_from_navpoint(item, node, child) + + def _toc_from_ncx(self, item): + if item is None: + return False + ncx = item.data + title = ''.join(xpath(ncx, 'ncx:docTitle/ncx:text/text()')) + title = COLLAPSE_RE.sub(' ', title.strip()) + title = title or unicode(self.oeb.metadata.title[0]) + toc = self.oeb.toc + toc.title = title + navmaps = xpath(ncx, 'ncx:navMap') + for navmap in navmaps: + self._toc_from_navpoint(item, toc, navmap) + return True + + def _toc_from_tour(self, opf): + result = xpath(opf, 'o2:tours/o2:tour') + if not result: + return False + tour = result[0] + toc = self.oeb.toc + toc.title = tour.get('title') + sites = xpath(tour, 'o2:site') + for site in 
sites: + title = site.get('title') + href = site.get('href') + if not title or not href: + continue + path, _ = urldefrag(urlnormalize(href)) + if path not in self.oeb.manifest.hrefs: + self.logger.warn('TOC reference %r not found' % href) + continue + id = site.get('id') + toc.add(title, href, id=id) + return True + + def _toc_from_html(self, opf): + if 'toc' not in self.oeb.guide: + return False + itempath, frag = urldefrag(self.oeb.guide['toc'].href) + item = self.oeb.manifest.hrefs[itempath] + html = item.data + if frag: + elems = xpath(html, './/*[@id="%s"]' % frag) + if not elems: + elems = xpath(html, './/*[@name="%s"]' % frag) + elem = elems[0] if elems else html + while elem != html and not xpath(elem, './/h:a[@href]'): + elem = elem.getparent() + html = elem + titles = defaultdict(list) + order = [] + for anchor in xpath(html, './/h:a[@href]'): + href = anchor.attrib['href'] + href = item.abshref(urlnormalize(href)) + path, frag = urldefrag(href) + if path not in self.oeb.manifest.hrefs: + continue + title = ' '.join(xpath(anchor, './/text()')) + title = COLLAPSE_RE.sub(' ', title.strip()) + if href not in titles: + order.append(href) + titles[href].append(title) + toc = self.oeb.toc + for href in order: + toc.add(' '.join(titles[href]), href) + return True + + def _toc_from_spine(self, opf): + toc = self.oeb.toc + titles = [] + headers = [] + for item in self.oeb.spine: + if not item.linear: continue + html = item.data + title = ''.join(xpath(html, '/h:html/h:head/h:title/text()')) + title = COLLAPSE_RE.sub(' ', title.strip()) + if title: + titles.append(title) + headers.append('(unlabled)') + for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'): + expr = '/h:html/h:body//h:%s[position()=1]/text()' + header = ''.join(xpath(html, expr % tag)) + header = COLLAPSE_RE.sub(' ', header.strip()) + if header: + headers[-1] = header + break + use = titles + if len(titles) > len(set(titles)): + use = headers + for title, item in izip(use, self.oeb.spine): + if not 
item.linear: continue + toc.add(title, item.href) + return True + + def _toc_from_opf(self, opf, item): + if self._toc_from_ncx(item): return + if self._toc_from_tour(opf): return + self.logger.warn('No metadata table of contents found') + if self._toc_from_html(opf): return + self._toc_from_spine(opf) + + def _pages_from_ncx(self, opf, item): + if item is None: + return False + ncx = item.data + ptargets = xpath(ncx, 'ncx:pageList/ncx:pageTarget') + if not ptargets: + return False + pages = self.oeb.pages + for ptarget in ptargets: + name = ''.join(xpath(ptarget, 'ncx:navLabel/ncx:text/text()')) + name = COLLAPSE_RE.sub(' ', name.strip()) + href = xpath(ptarget, 'ncx:content/@src') + if not href: + continue + href = item.abshref(urlnormalize(href[0])) + id = ptarget.get('id') + type = ptarget.get('type', 'normal') + klass = ptarget.get('class') + pages.add(name, href, type=type, id=id, klass=klass) + return True + + def _find_page_map(self, opf): + result = xpath(opf, '/o2:package/o2:spine/@page-map') + if result: + id = result[0] + if id not in self.oeb.manifest.ids: + return None + item = self.oeb.manifest.ids[id] + self.oeb.manifest.remove(item) + return item + for item in self.oeb.manifest.values(): + if item.media_type == PAGE_MAP_MIME: + self.oeb.manifest.remove(item) + return item + return None + + def _pages_from_page_map(self, opf): + item = self._find_page_map(opf) + if item is None: + return False + pmap = item.data + pages = self.oeb.pages + for page in xpath(pmap, 'o2:page'): + name = page.get('name', '') + href = page.get('href') + if not href: + continue + name = COLLAPSE_RE.sub(' ', name.strip()) + href = item.abshref(urlnormalize(href)) + type = 'normal' + if not name: + type = 'special' + elif name.lower().strip('ivxlcdm') == '': + type = 'front' + pages.add(name, href, type=type) + return True + + def _pages_from_opf(self, opf, item): + if self._pages_from_ncx(opf, item): return + if self._pages_from_page_map(opf): return + return + + def 
_cover_from_html(self, hcover): + with TemporaryDirectory('_html_cover') as tdir: + writer = OEBWriter() + writer(self.oeb, tdir) + path = os.path.join(tdir, urlunquote(hcover.href)) + renderer = CoverRenderer(path) + data = renderer.image_data + id, href = self.oeb.manifest.generate('cover', 'cover.jpeg') + item = self.oeb.manifest.add(id, href, JPEG_MIME, data=data) + return item + + def _locate_cover_image(self): + if self.oeb.metadata.cover: + id = str(self.oeb.metadata.cover[0]) + item = self.oeb.manifest.ids.get(id, None) + if item is not None and item.media_type in OEB_IMAGES: + return item + else: + self.logger.warn('Invalid cover image @id %r' % id) + hcover = self.oeb.spine[0] + if 'cover' in self.oeb.guide: + href = self.oeb.guide['cover'].href + item = self.oeb.manifest.hrefs[href] + media_type = item.media_type + if media_type in OEB_IMAGES: + return item + elif media_type in OEB_DOCS: + hcover = item + html = hcover.data + if MS_COVER_TYPE in self.oeb.guide: + href = self.oeb.guide[MS_COVER_TYPE].href + item = self.oeb.manifest.hrefs.get(href, None) + if item is not None and item.media_type in OEB_IMAGES: + return item + if self.COVER_SVG_XP(html): + svg = copy.deepcopy(self.COVER_SVG_XP(html)[0]) + href = os.path.splitext(hcover.href)[0] + '.svg' + id, href = self.oeb.manifest.generate(hcover.id, href) + item = self.oeb.manifest.add(id, href, SVG_MIME, data=svg) + return item + if self.COVER_OBJECT_XP(html): + object = self.COVER_OBJECT_XP(html)[0] + href = hcover.abshref(object.get('data')) + item = self.oeb.manifest.hrefs.get(href, None) + if item is not None and item.media_type in OEB_IMAGES: + return item + return self._cover_from_html(hcover) + + def _ensure_cover_image(self): + cover = self._locate_cover_image() + if self.oeb.metadata.cover: + self.oeb.metadata.cover[0].value = cover.id + return + self.oeb.metadata.add('cover', cover.id) + + def _all_from_opf(self, opf): + self.oeb.version = opf.get('version', '1.2') + 
self._metadata_from_opf(opf) + self._manifest_from_opf(opf) + self._spine_from_opf(opf) + self._guide_from_opf(opf) + item = self._find_ncx(opf) + self._toc_from_opf(opf, item) + self._pages_from_opf(opf, item) + self._ensure_cover_image() + + +def main(argv=sys.argv): + reader = OEBReader() + for arg in argv[1:]: + oeb = reader(OEBBook(), arg) + for name, doc in oeb.to_opf1().values(): + print etree.tostring(doc, pretty_print=True) + for name, doc in oeb.to_opf2(page_map=True).values(): + print etree.tostring(doc, pretty_print=True) + return 0 + +if __name__ == '__main__': + sys.exit(main()) diff --git a/src/calibre/ebooks/oeb/writer.py b/src/calibre/ebooks/oeb/writer.py new file mode 100644 index 0000000000..e55db670d6 --- /dev/null +++ b/src/calibre/ebooks/oeb/writer.py @@ -0,0 +1,107 @@ +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2008, Marshall T. Vandegrift ' + +import sys, os, logging +from calibre.ebooks.oeb.base import OPF_MIME, xml2str +from calibre.ebooks.oeb.base import Logger, DirContainer, OEBBook +from calibre.utils.config import Config + +__all__ = ['OEBWriter'] + +class OEBWriter(object): + DEFAULT_PROFILE = 'PRS505' + + def __init__(self, version='2.0', page_map=False, pretty_print=False): + self.version = version + self.page_map = page_map + self.pretty_print = pretty_print + + @classmethod + def config(cls, cfg): + oeb = cfg.add_group('oeb', _('OPF/NCX/etc. generation options.')) + versions = ['1.2', '2.0'] + oeb('opf_version', ['--opf-version'], default='2.0', choices=versions, + help=_('OPF version to generate. 
Default is %default.')) + oeb('adobe_page_map', ['--adobe-page-map'], default=False, + help=_('Generate an Adobe "page-map" file if pagination ' + 'information is avaliable.')) + return cfg + + @classmethod + def generate(cls, opts): + version = opts.opf_version + page_map = opts.adobe_page_map + pretty_print = opts.pretty_print + return cls(version=version, page_map=page_map, + pretty_print=pretty_print) + + def __call__(self, oeb, path): + version = int(self.version[0]) + opfname = None + if os.path.splitext(path)[1].lower() == '.opf': + opfname = os.path.basename(path) + path = os.path.dirname(path) + if not os.path.isdir(path): + os.mkdir(path) + output = DirContainer(path) + for item in oeb.manifest.values(): + output.write(item.href, str(item)) + if version == 1: + metadata = oeb.to_opf1() + elif version == 2: + metadata = oeb.to_opf2(page_map=self.page_map) + else: + raise OEBError("Unrecognized OPF version %r" % self.version) + pretty_print = self.pretty_print + for mime, (href, data) in metadata.items(): + if opfname and mime == OPF_MIME: + href = opfname + output.write(href, xml2str(data, pretty_print=pretty_print)) + return + + +def option_parser(): + cfg = Config('oeb', _('Options to control OEB conversion.')) + OEBWriter.config(cfg) + parser = cfg.option_parser() + parser.add_option('--encoding', default=None, + help=_('Character encoding for files. Default is to auto detect.')) + parser.add_option('-o', '--output', default=None, + help=_('Output file. 
Default is derived from input filename.')) + parser.add_option('-p', '--pretty-print', action='store_true', + default=False, help=_('Produce more human-readable XML output.')) + parser.add_option('-v', '--verbose', default=0, action='count', + help=_('Useful for debugging.')) + return parser + +def any2oeb(opts, inpath): + from calibre.ebooks.oeb.factory import ReaderFactory + logger = Logger(logging.getLogger('any2oeb')) + logger.setup_cli_handler(opts.verbose) + outpath = opts.output + if outpath is None: + outpath = os.path.basename(inpath) + outpath = os.path.splitext(outpath)[0] + encoding = opts.encoding + pretty_print = opts.pretty_print + oeb = OEBBook(encoding=encoding, pretty_print=pretty_print, logger=logger) + reader = ReaderFactory(inpath) + reader(oeb, inpath) + writer = OEBWriter.generate(opts) + writer(oeb, outpath) + return 0 + +def main(argv=sys.argv): + parser = option_parser() + opts, args = parser.parse_args(argv[1:]) + if len(args) != 1: + parser.print_help() + return 1 + inpath = args[0] + retval = any2oeb(opts, inpath) + return retval + +if __name__ == '__main__': + sys.exit(main()) From 9c2a4e36eccbe57528a5167717812be4986e78dc Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 8 Feb 2009 20:26:01 -0800 Subject: [PATCH 07/13] IGN:... --- src/calibre/ebooks/metadata/cli.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/calibre/ebooks/metadata/cli.py b/src/calibre/ebooks/metadata/cli.py index 4101f34047..8053b82e90 100644 --- a/src/calibre/ebooks/metadata/cli.py +++ b/src/calibre/ebooks/metadata/cli.py @@ -31,21 +31,21 @@ from calibre import prints def config(): c = StringConfig('') - c.add_opt('title', ['-t', '--title'], + c.add_opt('title', ['-t', '--title'], help=_('Set the title.')) c.add_opt('authors', ['-a', '--authors'], help=_('Set the authors. Multiple authors should be separated ' 'by the & character. 
Author names should be in the order ' 'Firstname Lastname.')) - c.add_opt('title_sort', ['--title-sort'], + c.add_opt('title_sort', ['--title-sort'], help=_('The version of the title to be used for sorting. ' 'If unspecified, and the title is specified, it will ' 'be auto-generated from the title.')) - c.add_opt('author_sort', ['--author-sort'], + c.add_opt('author_sort', ['--author-sort'], help=_('String to be used when sorting by author. ' 'If unspecified, and the author(s) are specified, it will ' 'be auto-generated from the author(s).')) - c.add_opt('cover', ['--cover'], + c.add_opt('cover', ['--cover'], help=_('Set the cover to the specified file.')) c.add_opt('comments', ['-c', '--comments'], help=_('Set the ebook description.')) @@ -195,4 +195,4 @@ def main(args=sys.argv): return 0 if __name__ == '__main__': - sys.exit(main()) \ No newline at end of file + sys.exit(main()) From e5984c02c7bc7ded3b2afd7aa4ff5e85a167dd03 Mon Sep 17 00:00:00 2001 From: "Marshall T. Vandegrift" Date: Tue, 10 Feb 2009 23:50:35 -0500 Subject: [PATCH 08/13] Document OEBBook. 
--- src/calibre/ebooks/lit/writer.py | 2 +- src/calibre/ebooks/oeb/base.py | 308 +++++++++++++++++++++++++++---- 2 files changed, 269 insertions(+), 41 deletions(-) diff --git a/src/calibre/ebooks/lit/writer.py b/src/calibre/ebooks/lit/writer.py index 4a059b6433..bebba8938b 100644 --- a/src/calibre/ebooks/lit/writer.py +++ b/src/calibre/ebooks/lit/writer.py @@ -312,7 +312,7 @@ class LitWriter(object): cover = None if oeb.metadata.cover: id = str(oeb.metadata.cover[0]) - cover = oeb.manifest[id] + cover = oeb.manifest.ids[id] for type, title in ALL_MS_COVER_TYPES: if type not in oeb.guide: oeb.guide.add(type, title, cover.href) diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index 771a27a81a..ce16fa76e5 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -5,6 +5,7 @@ from __future__ import with_statement __license__ = 'GPL v3' __copyright__ = '2008, Marshall T. Vandegrift ' +__docformat__ = 'restructuredtext en' import os, sys, re, uuid from mimetypes import types_map @@ -175,6 +176,7 @@ URL_SAFE = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ' URL_UNSAFE = [ASCII_CHARS - URL_SAFE, UNIBYTE_CHARS - URL_SAFE] def urlquote(href): + """Quote URL-unsafe characters, allowing IRI-safe characters.""" result = [] unsafe = 0 if isinstance(href, unicode) else 1 unsafe = URL_UNSAFE[unsafe] @@ -185,6 +187,9 @@ def urlquote(href): return ''.join(result) def urlnormalize(href): + """Convert a URL into normalized form, with all and only URL-unsafe + characters URL quoted. 
+ """ parts = urlparse(href) if not parts.scheme: path, frag = urldefrag(href) @@ -196,21 +201,30 @@ def urlnormalize(href): class OEBError(Exception): + """Generic OEB-processing error.""" pass class FauxLogger(object): + """Fake logging interface.""" def __getattr__(self, name): return self def __call__(self, message): print message class Logger(LoggingInterface, object): + """A logging object which provides both the standard `logging.Logger` and + calibre-specific interfaces. + """ def __getattr__(self, name): return object.__getattribute__(self, 'log_' + name) class NullContainer(object): + """An empty container. + + For use with book formats which do not support container-like access. + """ def read(self, path): raise OEBError('Attempt to read from NullContainer') @@ -224,6 +238,8 @@ class NullContainer(object): return [] class DirContainer(object): + """Filesystem directory container.""" + def __init__(self, path): path = unicode(path) ext = os.path.splitext(path)[1].lower() @@ -269,20 +285,38 @@ class DirContainer(object): class Metadata(object): - DC_TERMS = set([ - 'contributor', 'coverage', 'creator', 'date', - 'description', 'format', 'identifier', 'language', - 'publisher', 'relation', 'rights', 'source', 'subject', - 'title', 'type' - ]) + """A collection of OEB data model metadata. + + Provides access to the list of items associated with a particular metadata + term via the term's local name using either Python container or attribute + syntax. Return an empty list for any terms with no currently associated + metadata items. 
+ """ + + DC_TERMS = set(['contributor', 'coverage', 'creator', 'date', + 'description', 'format', 'identifier', 'language', + 'publisher', 'relation', 'rights', 'source', + 'subject', 'title', 'type']) CALIBRE_TERMS = set(['series', 'series_index', 'rating']) OPF_ATTRS = {'role': OPF('role'), 'file-as': OPF('file-as'), 'scheme': OPF('scheme'), 'event': OPF('event'), 'type': XSI('type'), 'lang': XML('lang'), 'id': 'id'} class Item(object): - + """An item of OEB data model metadata. + + The metadata term or name may be accessed via the :attr:`term` or + :attr:`name` attributes. The metadata value or content may be accessed + via the :attr:`value` or :attr:`content` attributes, or via Unicode or + string representations of the object. + + OEB data model metadata attributes may be accessed either via their + fully-qualified names using the Python container access syntax, or via + their local names using Python attribute syntax. Only attributes + allowed by the OPF 2.0 specification are supported. 
+ """ class Attribute(object): + """Smart accessor for allowed OEB metadata item attributes.""" def __init__(self, attr, allowed=None): if not callable(attr): @@ -333,10 +367,24 @@ class Metadata(object): nsattr = 'scheme' if attr != nsattr: attrib[nsattr] = attrib.pop(attr) - + + @dynamic_property + def name(self): + def fget(self): + return self.term + return property(fget=fget) + + @dynamic_property + def content(self): + def fget(self): + return self.value + def fset(self, value): + self.value = value + return property(fget=fget, fset=fset) + scheme = Attribute(lambda term: 'scheme' if \ term == OPF('meta') else OPF('scheme'), - [DC('identifier'), OPF('meta')]) + [DC('identifier'), OPF('meta')]) file_as = Attribute(OPF('file-as'), [DC('creator'), DC('contributor')]) role = Attribute(OPF('role'), [DC('creator'), DC('contributor')]) event = Attribute(OPF('event'), [DC('date')]) @@ -405,6 +453,7 @@ class Metadata(object): self.items = defaultdict(list) def add(self, term, value, attrib={}, nsmap={}, **kwargs): + """Add a new metadata item.""" item = self.Item(term, value, attrib, nsmap, **kwargs) items = self.items[barename(item.term)] items.append(item) @@ -477,8 +526,40 @@ class Metadata(object): class Manifest(object): + """Collection of files composing an OEB data model book. + + Provides access to the content of the files composing the book and + attributes associated with those files, including their internal paths, + unique identifiers, and MIME types. + + Itself acts as a :class:`set` of manifest items, and provides the following + instance data member for dictionary-like access: + + :attr:`ids`: A dictionary in which the keys are the unique identifiers of + the manifest items and the values are the items themselves. + :attr:`hrefs`: A dictionary in which the keys are the internal paths of the + manifest items and the values are the items themselves. + """ class Item(object): + """An OEB data model book content file. 
+ + Provides the following data members for accessing the file content and + metadata associated with this particular file. + + :attr:`id`: Unique identifier. + :attr:`href`: Book-internal path. + :attr:`media_type`: MIME type of the file content. + :attr:`fallback`: Unique id of any fallback manifest item associated + with this manifest item. + :attr:`spine_position`: Display/reading order index for book textual + content. `None` for manifest items which are not part of the + book's textual content. + :attr:`linear`: `True` for textual content items which are part of the + primary linear reading order and `False` for textual content items + which are not (such as footnotes). Meaningless for items which + have a :attr:`spine_position` of `None`. + """ NUM_RE = re.compile('^(.*)([0-9][0-9.]*)(?=[.]|$)') META_XP = XPath('/h:html/h:head/h:meta[@http-equiv="Content-Type"]') @@ -584,6 +665,18 @@ class Manifest(object): @dynamic_property def data(self): + doc = """Provides MIME type sensitive access to the manifest + entry's associated content. + + - XHTML, HTML, and variant content is parsed as necessary to + convert and return as an lxml.etree element in the XHTML + namespace. + - XML content is parsed and returned as an lxml.etree element. + - CSS and CSS-variant content is parsed and returned as a cssutils + CSS DOM stylesheet. + - All other content is returned as a :class:`str` object with no + special parsing. + """ def fget(self): if self._data is not None: return self._data @@ -600,7 +693,7 @@ class Manifest(object): self._data = value def fdel(self): self._data = None - return property(fget, fset, fdel) + return property(fget, fset, fdel, doc=doc) def __str__(self): data = self.data @@ -631,6 +724,9 @@ class Manifest(object): return cmp(skey, okey) def relhref(self, href): + """Convert the URL provided in :param:`href` from a book-absolute + reference to a reference relative to this manifest item. 
+ """ if urlparse(href).scheme: return href if '/' not in self.href: @@ -649,6 +745,9 @@ class Manifest(object): return relhref def abshref(self, href): + """Convert the URL provided in :param:`href` from a reference + relative to this manifest item to a book-absolute reference. + """ if urlparse(href).scheme: return href path, frag = urldefrag(href) @@ -663,25 +762,46 @@ class Manifest(object): def __init__(self, oeb): self.oeb = oeb + self.items = set() self.ids = {} self.hrefs = {} def add(self, id, href, media_type, fallback=None, loader=None, data=None): + """Add a new item to the book manifest. + + The item's :param:`id`, :param:`href`, and :param:`media_type` are all + required. A :param:`fallback` item-id is required for any items with a + MIME type which is not one of the OPS core media types. Either the + item's data itself may be provided with :param:`data`, or a loader + function for the data may be provided with :param:`loader`, or the + item's data may later be set manually via the :attr:`data` attribute. + """ item = self.Item( self.oeb, id, href, media_type, fallback, loader, data) + self.items.add(item) self.ids[item.id] = item self.hrefs[item.href] = item return item def remove(self, item): + """Removes :param:`item` from the manifest.""" if item in self.ids: item = self.ids[item] del self.ids[item.id] del self.hrefs[item.href] + self.items.remove(item) if item in self.oeb.spine: self.oeb.spine.remove(item) def generate(self, id=None, href=None): + """Generate a new unique identifier and/or internal path for use in + creating a new manifest item, using the provided :param:`id` and/or + :param:`href` as bases. + + Returns a two-tuple of the new id and path. If either :param:`id` or + :param:`href` are `None` then the corresponding item in the return + tuple will also be `None`. 
+ """ if id is not None: base = id index = 1 @@ -698,26 +818,16 @@ class Manifest(object): return id, href def __iter__(self): - for id in self.ids: - yield id - - def __getitem__(self, id): - return self.ids[id] - - def values(self): - for item in self.ids.values(): + for item in self.items: yield item + values = __iter__ - def items(self): - for id, item in self.ids.items(): - yield id, item - - def __contains__(self, key): - return key in self.ids + def __contains__(self, item): + return item in self.items def to_opf1(self, parent=None): elem = element(parent, 'manifest') - for item in self.ids.values(): + for item in self.items: media_type = item.media_type if media_type in OEB_DOCS: media_type = OEB_DOC_MIME @@ -732,7 +842,7 @@ class Manifest(object): def to_opf2(self, parent=None): elem = element(parent, OPF('manifest')) - for item in self.ids.values(): + for item in self.items: media_type = item.media_type if media_type in OEB_DOCS: media_type = XHTML_MIME @@ -747,7 +857,13 @@ class Manifest(object): class Spine(object): - + """Collection of manifest items composing an OEB data model book's main + textual content. + + The spine manages which manifest items compose the book's main textual + content and the sequence in which they appear. Provides Python container + access as a list-like object. 
+ """ def __init__(self, oeb): self.oeb = oeb self.items = [] @@ -762,12 +878,14 @@ class Spine(object): return linear def add(self, item, linear=None): + """Append :param:`item` to the end of the `Spine`.""" item.linear = self._linear(linear) item.spine_position = len(self.items) self.items.append(item) return item def insert(self, index, item, linear): + """Insert :param:`item` at position :param:`index` in the `Spine`.""" item.linear = self._linear(linear) item.spine_position = index self.items.insert(index, item) @@ -776,6 +894,7 @@ class Spine(object): return item def remove(self, item): + """Remove :param:`item` from the `Spine`.""" index = item.spine_position self.items.pop(index) for i in xrange(index, len(self.items)): @@ -813,9 +932,24 @@ class Spine(object): class Guide(object): + """Collection of references to standard frequently-occurring sections + within an OEB data model book. + + Provides dictionary-like access, in which the keys are the OEB reference + type identifiers and the values are `Reference` objects. + """ class Reference(object): - + """Reference to a standard book section. + + Provides the following instance data members: + + :attr:`type`: Reference type identifier, as chosen from the list + allowed in the OPF 2.0 specification. + :attr:`title`: Human-readable section title. + :attr:`href`: Book-internal URL of the referenced section. May include + a fragment identifier. 
+ """ _TYPES_TITLES = [('cover', __('Cover')), ('title-page', __('Title Page')), ('toc', __('Table of Contents')), @@ -867,17 +1001,19 @@ class Guide(object): @dynamic_property def item(self): + doc = """The manifest item associated with this reference.""" def fget(self): path = urldefrag(self.href)[0] hrefs = self.oeb.manifest.hrefs return hrefs.get(path, None) - return property(fget=fget) + return property(fget=fget, doc=doc) def __init__(self, oeb): self.oeb = oeb self.refs = {} def add(self, type, title, href): + """Add a new reference to the `Guide`.""" ref = self.Reference(self.oeb, type, title, href) self.refs[type] = ref return ref @@ -925,8 +1061,19 @@ class Guide(object): return elem +# TODO: This needs beefing up to support the interface of toc.TOC class TOC(object): - # This needs beefing up to support the interface of toc.TOC + """Represents a hierarchical table of contents or navigation tree for + accessing arbitrary semantic sections within an OEB data model book. + + Acts as a node within the navigation tree. Provides list-like access to + sub-nodes. Provides the following node instance data attributes: + + :attr:`title`: The title of this navigation node. + :attr:`href`: Book-internal URL referenced by this node. + :attr:`klass`: Optional semantic class referenced by this node. + :attr:`id`: Optional unique identifier for this node. 
+ """ def __init__(self, title=None, href=None, klass=None, id=None): self.title = title self.href = urlnormalize(href) if href else href @@ -935,17 +1082,26 @@ class TOC(object): self.nodes = [] def add(self, title, href, klass=None, id=None): + """Create and return a new sub-node of this node.""" node = TOC(title, href, klass, id) self.nodes.append(node) return node + def iter(self): + """Iterate over this node and all descendants in depth-first order.""" + yield self + for child in self.nodes: + for node in child.iter(): + yield node + def iterdescendants(self): - for node in self.nodes: - yield node - for child in node.iterdescendants(): - yield child + """Iterate over all descendant nodes in depth-first order.""" + for child in self.nodes: + for node in child.iter(): + yield node def __iter__(self): + """Iterate over all immediate child nodes.""" for node in self.nodes: yield node @@ -953,6 +1109,9 @@ class TOC(object): return self.nodes[index] def autolayer(self): + """Make sequences of children pointing to the same content file into + children of the first node referencing that file. + """ prev = None for node in list(self.nodes): if prev and urldefrag(prev.href)[0] == urldefrag(node.href)[0]: @@ -961,10 +1120,12 @@ class TOC(object): else: prev = node - def depth(self, level=0): - if self.nodes: - return self.nodes[0].depth(level+1) - return level + def depth(self): + """The maximum depth of the navigation tree rooted at this node.""" + try: + return max(node.depth() for node in self.nodes) + 1 + except ValueError: + return 1 def to_opf1(self, tour): for node in self.nodes: @@ -989,12 +1150,34 @@ class TOC(object): class PageList(object): + """Collection of named "pages" to mapped positions within an OEB data model + book's textual content. + + Provides list-like access to the pages. + """ class Page(object): + """Represents a mapping between a page name and a position within + the book content. 
+ + Provides the following instance data attributes: + + :attr:`name`: The name of this page. Generally a number. + :attr:`href`: Book-internal URL at which point this page begins. + :attr:`type`: Must be one of 'front' (for prefatory pages, as commonly + labeled in print with small-case Roman numerals), 'normal' (for + standard pages, as commonly labeled in print with Arabic numerals), + or 'special' (for other pages, as commonly not labeled in any + fashion in print, such as the cover and title pages). + :attr:`klass`: Optional semantic class of this page. + :attr:`id`: Optional unique identifier for this page. + """ + TYPES = set(['front', 'normal', 'special']) + def __init__(self, name, href, type='normal', klass=None, id=None): - self.name = name + self.name = unicode(name) self.href = urlnormalize(href) - self.type = type + self.type = type if type in self.TYPES else 'normal' self.id = id self.klass = klass @@ -1002,6 +1185,7 @@ class PageList(object): self.pages = [] def add(self, name, href, type='normal', klass=None, id=None): + """Create a new page and add it to the `PageList`.""" page = self.Page(name, href, type, klass, id) self.pages.append(page) return page @@ -1015,6 +1199,12 @@ class PageList(object): def __getitem__(self, index): return self.pages[index] + + def pop(self, index=-1): + return self.pages.pop(index) + + def remove(self, page): + return self.pages.remove(page) def to_ncx(self, parent=None): plist = element(parent, NCX('pageList'), id=str(uuid.uuid4())) @@ -1040,8 +1230,33 @@ class PageList(object): class OEBBook(object): + """Representation of a book in the IDPF OEB data model.""" def __init__(self, encoding=None, pretty_print=False, logger=FauxLogger()): + """Create empty book. Optional arguments: + + :param:`encoding`: Default encoding for textual content read + from an external container. + :param:`pretty_print`: Whether or not the canonical string form + of XML markup is pretty-printed. 
+ :param:`logger`: A Logger object to use for logging all messages + related to the processing of this book. It is accessible + via the instance data member :attr:`logger`. + + It provides the following public instance data members for + accessing various parts of the OEB data model: + + :attr:`metadata`: Metadata such as title, author name(s), etc. + :attr:`manifest`: Manifest of all files included in the book, + including MIME types and fallback information. + :attr:`spine`: In-order list of manifest items which compose + the textual content of the book. + :attr:`guide`: Collection of references to standard positions + within the text, such as the cover, preface, etc. + :attr:`toc`: Hierarchical table of contents. + :attr:`pages`: List of "pages," such as indexed to a print edition of + the same text. + """ self.encoding = encoding self.pretty_print = pretty_print self.logger = logger @@ -1057,16 +1272,19 @@ class OEBBook(object): @classmethod def generate(cls, opts): + """Generate an OEBBook instance from command-line options.""" encoding = opts.encoding pretty_print = opts.pretty_print return cls(encoding=encoding, pretty_print=pretty_print) def translate(self, text): + """Translate :param:`text` into the book's primary language.""" lang = str(self.metadata.language[0]) lang = lang.split('-', 1)[0].lower() return translate(lang, text) def decode(self, data): + """Automatically decode :param:`data` into a `unicode` object.""" if isinstance(data, unicode): return data if data[:2] in ('\xff\xfe', '\xfe\xff'): @@ -1089,6 +1307,11 @@ class OEBBook(object): return data def to_opf1(self): + """Produce OPF 1.2 representing the book's metadata and structure. + + Returns a dictionary in which the keys are MIME types and the values + are tuples of (default) filenames and lxml.etree element structures. 
+ """ package = etree.Element('package', attrib={'unique-identifier': self.uid.id}) self.metadata.to_opf1(package) @@ -1160,6 +1383,11 @@ class OEBBook(object): return ncx def to_opf2(self, page_map=False): + """Produce OPF 2.0 representing the book's metadata and structure. + + Returns a dictionary in which the keys are MIME types and the values + are tuples of (default) filenames and lxml.etree element structures. + """ results = {} package = etree.Element(OPF('package'), attrib={'version': '2.0', 'unique-identifier': self.uid.id}, From 5dca63111427af5a8caddbff0d96a63b1bc9f5fe Mon Sep 17 00:00:00 2001 From: "Marshall T. Vandegrift" Date: Wed, 11 Feb 2009 10:00:54 -0500 Subject: [PATCH 09/13] Demonstrable modularization of e-book conversion. --- src/calibre/ebooks/lit/reader.py | 1 + src/calibre/ebooks/mobi/mobiml.py | 10 ++- src/calibre/ebooks/mobi/writer.py | 40 +++++++-- src/calibre/ebooks/oeb/base.py | 11 +-- src/calibre/ebooks/oeb/factory.py | 87 +++++++++++++++++-- src/calibre/ebooks/oeb/reader.py | 24 +++++ src/calibre/ebooks/oeb/transforms/flatcss.py | 10 ++- src/calibre/ebooks/oeb/transforms/htmltoc.py | 13 ++- .../ebooks/oeb/transforms/manglecase.py | 10 ++- .../ebooks/oeb/transforms/rasterize.py | 10 ++- .../ebooks/oeb/transforms/trimmanifest.py | 10 ++- src/calibre/ebooks/oeb/writer.py | 57 +++--------- 12 files changed, 210 insertions(+), 73 deletions(-) diff --git a/src/calibre/ebooks/lit/reader.py b/src/calibre/ebooks/lit/reader.py index dd42434101..8cbb9514a8 100644 --- a/src/calibre/ebooks/lit/reader.py +++ b/src/calibre/ebooks/lit/reader.py @@ -802,6 +802,7 @@ class LitFile(object): class LitContainer(object): + """Simple Container-interface, read-only accessor for LIT files.""" def __init__(self, filename_or_stream): self._litfile = LitFile(filename_or_stream) diff --git a/src/calibre/ebooks/mobi/mobiml.py b/src/calibre/ebooks/mobi/mobiml.py index 7ecd127452..b7418a5d19 100644 --- a/src/calibre/ebooks/mobi/mobiml.py +++ 
b/src/calibre/ebooks/mobi/mobiml.py @@ -82,7 +82,15 @@ class MobiMLizer(object): def __init__(self, ignore_tables=False): self.ignore_tables = ignore_tables - def transform(self, oeb, context): + @classmethod + def config(cls, cfg): + return cfg + + @classmethod + def generate(cls, opts): + return cls() + + def __call__(self, oeb, context): oeb.logger.info('Converting XHTML to Mobipocket markup...') self.oeb = oeb self.profile = profile = context.dest diff --git a/src/calibre/ebooks/mobi/writer.py b/src/calibre/ebooks/mobi/writer.py index 380bdbf518..1b5d3ae652 100644 --- a/src/calibre/ebooks/mobi/writer.py +++ b/src/calibre/ebooks/mobi/writer.py @@ -295,6 +295,11 @@ class Serializer(object): class MobiWriter(object): COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+') + + DEFAULT_PROFILE = 'CybookG3' + + TRANSFORMS = [HTMLTOCAdder, CaseMangler, CSSFlattener, SVGRasterizer, + ManifestTrimmer, MobiMLizer] def __init__(self, compression=None, imagemax=None, prefer_author_sort=False): @@ -302,7 +307,32 @@ class MobiWriter(object): self._imagemax = imagemax or OTHER_MAX_IMAGE_SIZE self._prefer_author_sort = prefer_author_sort - def dump(self, oeb, path): + @classmethod + def config(cls, cfg): + """Add any book-writing options to the :class:`Config` object + :param:`cfg`. + """ + mobi = cfg.add_group('mobipocket', _('Mobipocket-specific options.')) + mobi('compress', ['--compress'], default=False, + help=_('Compress file text using PalmDOC compression. 
' + 'Results in smaller files, but takes a long time to run.')) + mobi('rescale_images', ['--rescale-images'], default=False, + help=_('Modify images to meet Palm device size limitations.')) + mobi('prefer_author_sort', ['--prefer-author-sort'], default=False, + help=_('When present, use the author sorting information for ' + 'generating the Mobipocket author metadata.')) + return cfg + + @classmethod + def generate(cls, opts): + """Generate a Writer instance from command-line options.""" + compression = PALMDOC if opts.compress else UNCOMPRESSED + imagemax = PALM_MAX_IMAGE_SIZE if opts.rescale_images else None + prefer_author_sort = opts.prefer_author_sort + return cls(compression=compression, imagemax=imagemax, + prefer_author_sort=prefer_author_sort) + + def __call__(self, oeb, path): if hasattr(path, 'write'): return self._dump_stream(oeb, path) with open(path, 'w+b') as stream: @@ -533,20 +563,12 @@ def config(defaults=None): c = StringConfig(defaults, desc) mobi = c.add_group('mobipocket', _('Mobipocket-specific options.')) - mobi('compress', ['--compress'], default=False, - help=_('Compress file text using PalmDOC compression. ' - 'Results in smaller files, but takes a long time to run.')) - mobi('rescale_images', ['--rescale-images'], default=False, - help=_('Modify images to meet Palm device size limitations.')) mobi('toc_title', ['--toc-title'], default=None, help=_('Title for any generated in-line table of contents.')) mobi('ignore_tables', ['--ignore-tables'], default=False, help=_('Render HTML tables as blocks of text instead of actual ' 'tables. This is neccessary if the HTML contains very large ' 'or complex tables.')) - mobi('prefer_author_sort', ['--prefer-author-sort'], default=False, - help=_('When present, use the author sorting information for ' - 'generating the Mobipocket author metadata.')) profiles = c.add_group('profiles', _('Device renderer profiles. ' 'Affects conversion of font sizes, image rescaling and rasterization ' 'of tables. 
Valid profiles are: %s.') % ', '.join(_profiles)) diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index ce16fa76e5..c9d01b03fe 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -820,8 +820,10 @@ class Manifest(object): def __iter__(self): for item in self.items: yield item - values = __iter__ + def values(self): + return list(self.items) + def __contains__(self, item): return item in self.items @@ -1134,7 +1136,7 @@ class TOC(object): node.to_opf1(tour) return tour - def to_ncx(self, parent, depth=1): + def to_ncx(self, parent): for node in self.nodes: id = node.id or unicode(uuid.uuid4()) attrib = {'id': id, 'playOrder': '0'} @@ -1143,9 +1145,8 @@ class TOC(object): point = element(parent, NCX('navPoint'), attrib=attrib) label = etree.SubElement(point, NCX('navLabel')) element(label, NCX('text')).text = node.title - href = node.href if depth > 1 else urldefrag(node.href)[0] - element(point, NCX('content'), src=href) - node.to_ncx(point, depth+1) + element(point, NCX('content'), src=node.href) + node.to_ncx(point) return parent diff --git a/src/calibre/ebooks/oeb/factory.py b/src/calibre/ebooks/oeb/factory.py index 1ce33a4f00..684451044b 100644 --- a/src/calibre/ebooks/oeb/factory.py +++ b/src/calibre/ebooks/oeb/factory.py @@ -6,20 +6,93 @@ from __future__ import with_statement __license__ = 'GPL v3' __copyright__ = '2008, Marshall T. 
Vandegrift ' -import os +import sys, os, logging +from itertools import chain from calibre.ebooks.oeb.base import OEBError from calibre.ebooks.oeb.reader import OEBReader +from calibre.ebooks.oeb.writer import OEBWriter from calibre.ebooks.lit.reader import LitReader +from calibre.ebooks.lit.writer import LitWriter +from calibre.ebooks.mobi.reader import MobiReader +from calibre.ebooks.mobi.writer import MobiWriter +from calibre.ebooks.oeb.base import Logger, OEBBook +from calibre.ebooks.oeb.profile import Context +from calibre.utils.config import Config __all__ = ['get_reader'] -READER_REGISTRY = { - '.opf': OEBReader, - '.lit': LitReader, +REGISTRY = { + '.opf': (OEBReader, None), + '.lit': (LitReader, LitWriter), + '.mobi': (MobiReader, MobiWriter), } def ReaderFactory(path): - ext = os.path.splitext(path)[1].lower() - if not ext: + if os.path.isdir(path): return OEBReader - return READER_REGISTRY[ext]() + ext = os.path.splitext(path)[1].lower() + Reader = REGISTRY.get(ext, (None, None))[0] + if Reader is None: + raise OEBError('Unknown e-book file extension %r' % ext) + return Reader + +def WriterFactory(path): + if os.path.isdir(path): + return OEBWriter + ext = os.path.splitext(path)[1].lower() + if not os.path.exists(path) and not ext: + return OEBWriter + Writer = REGISTRY.get(ext, (None, None))[1] + if Writer is None: + raise OEBError('Unknown e-book file extension %r' % ext) + return Writer + + +def option_parser(Reader, Writer): + cfg = Config('ebook-convert', _('Options to control e-book conversion.')) + Reader.config(cfg) + for Transform in chain(Reader.TRANSFORMS, Writer.TRANSFORMS): + Transform.config(cfg) + Writer.config(cfg) + parser = cfg.option_parser() + parser.add_option('--encoding', default=None, + help=_('Character encoding for input. Default is to auto detect.')) + parser.add_option('-o', '--output', default=None, + help=_('Output file. 
Default is derived from input filename.')) + parser.add_option('-p', '--pretty-print', action='store_true', + default=False, help=_('Produce more human-readable XML output.')) + parser.add_option('-v', '--verbose', default=0, action='count', + help=_('Useful for debugging.')) + return parser + +def main(argv=sys.argv): + if len(argv) < 3: + print _("Usage: ebook-convert INFILE OUTFILE [OPTIONS..]") + return 1 + inpath, outpath = argv[1], argv[2] + Reader = ReaderFactory(inpath) + Writer = WriterFactory(outpath) + parser = option_parser(Reader, Writer) + opts, args = parser.parse_args(argv[3:]) + if len(args) != 0: + parser.print_help() + return 1 + logger = Logger(logging.getLogger('ebook-convert')) + logger.setup_cli_handler(opts.verbose) + encoding = opts.encoding + pretty_print = opts.pretty_print + oeb = OEBBook(encoding=encoding, pretty_print=pretty_print, logger=logger) + context = Context(Reader.DEFAULT_PROFILE, Writer.DEFAULT_PROFILE) + reader = Reader.generate(opts) + writer = Writer.generate(opts) + transforms = [] + for Transform in chain(Reader.TRANSFORMS, Writer.TRANSFORMS): + transforms.append(Transform.generate(opts)) + reader(oeb, inpath) + for transform in transforms: + transform(oeb, context) + writer(oeb, outpath) + return 0 + +if __name__ == '__main__': + sys.exit(main()) diff --git a/src/calibre/ebooks/oeb/reader.py b/src/calibre/ebooks/oeb/reader.py index aa23ce1e96..0fce1c2b0d 100644 --- a/src/calibre/ebooks/oeb/reader.py +++ b/src/calibre/ebooks/oeb/reader.py @@ -31,15 +31,39 @@ from calibre.ptempfile import TemporaryDirectory __all__ = ['OEBReader'] class OEBReader(object): + """Read an OEBPS 1.x or OPF/OPS 2.0 file collection.""" COVER_SVG_XP = XPath('h:body//svg:svg[position() = 1]') COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]') Container = DirContainer + """Container type used to access book files. 
Override in sub-classes.""" DEFAULT_PROFILE = 'PRS505' + """Default renderer profile for content read with this Reader.""" + + TRANSFORMS = [] + """List of transforms to apply to content read with this Reader.""" + + def __init__(self): + return + @classmethod + def config(cls, cfg): + """Add any book-reading options to the :class:`Config` object + :param:`cfg`. + """ + return + + @classmethod + def generate(cls, opts): + """Generate a Reader instance from command-line options.""" + return cls() + def __call__(self, oeb, path): + """Read the book at :param:`path` into the :class:`OEBBook` object + :param:`oeb`. + """ self.oeb = oeb self.logger = oeb.logger oeb.container = self.Container(path) diff --git a/src/calibre/ebooks/oeb/transforms/flatcss.py b/src/calibre/ebooks/oeb/transforms/flatcss.py index 01afcb08e2..ac9684a624 100644 --- a/src/calibre/ebooks/oeb/transforms/flatcss.py +++ b/src/calibre/ebooks/oeb/transforms/flatcss.py @@ -94,7 +94,15 @@ class CSSFlattener(object): self.unfloat = unfloat self.untable = untable - def transform(self, oeb, context): + @classmethod + def config(cls, cfg): + return cfg + + @classmethod + def generate(cls, opts): + return cls() + + def __call__(self, oeb, context): oeb.logger.info('Flattening CSS and remapping font sizes...') self.oeb = oeb self.context = context diff --git a/src/calibre/ebooks/oeb/transforms/htmltoc.py b/src/calibre/ebooks/oeb/transforms/htmltoc.py index 5508b58ec3..0040f39c14 100644 --- a/src/calibre/ebooks/oeb/transforms/htmltoc.py +++ b/src/calibre/ebooks/oeb/transforms/htmltoc.py @@ -52,7 +52,18 @@ class HTMLTOCAdder(object): self.title = title self.style = style - def transform(self, oeb, context): + @classmethod + def config(cls, cfg): + group = cfg.add_group('htmltoc', _('HTML TOC generation options.')) + group('toc_title', ['--toc-title'], default=None, + help=_('Title for any generated in-line table of contents.')) + return cfg + + @classmethod + def generate(cls, opts): + return 
cls(title=opts.toc_title) + + def __call__(self, oeb, context): if 'toc' in oeb.guide: return oeb.logger.info('Generating in-line TOC...') diff --git a/src/calibre/ebooks/oeb/transforms/manglecase.py b/src/calibre/ebooks/oeb/transforms/manglecase.py index 3a3d91364f..c819475a4d 100644 --- a/src/calibre/ebooks/oeb/transforms/manglecase.py +++ b/src/calibre/ebooks/oeb/transforms/manglecase.py @@ -29,7 +29,15 @@ CASE_MANGLER_CSS = """ TEXT_TRANSFORMS = set(['capitalize', 'uppercase', 'lowercase']) class CaseMangler(object): - def transform(self, oeb, context): + @classmethod + def config(cls, cfg): + return cfg + + @classmethod + def generate(cls, opts): + return cls() + + def __call__(self, oeb, context): oeb.logger.info('Applying case-transforming CSS...') self.oeb = oeb self.profile = context.source diff --git a/src/calibre/ebooks/oeb/transforms/rasterize.py b/src/calibre/ebooks/oeb/transforms/rasterize.py index 12a2812898..aef5c2c98b 100644 --- a/src/calibre/ebooks/oeb/transforms/rasterize.py +++ b/src/calibre/ebooks/oeb/transforms/rasterize.py @@ -34,7 +34,15 @@ class SVGRasterizer(object): if QApplication.instance() is None: QApplication([]) - def transform(self, oeb, context): + @classmethod + def config(cls, cfg): + return cfg + + @classmethod + def generate(cls, opts): + return cls() + + def __call__(self, oeb, context): oeb.logger.info('Rasterizing SVG images...') self.oeb = oeb self.profile = context.dest diff --git a/src/calibre/ebooks/oeb/transforms/trimmanifest.py b/src/calibre/ebooks/oeb/transforms/trimmanifest.py index a1d28e5a99..a5e7042617 100644 --- a/src/calibre/ebooks/oeb/transforms/trimmanifest.py +++ b/src/calibre/ebooks/oeb/transforms/trimmanifest.py @@ -17,7 +17,15 @@ from calibre.ebooks.oeb.base import LINK_SELECTORS, CSSURL_RE from calibre.ebooks.oeb.base import urlnormalize class ManifestTrimmer(object): - def transform(self, oeb, context): + @classmethod + def config(cls, cfg): + return cfg + + @classmethod + def generate(cls, opts): + 
return cls() + + def __call__(self, oeb, context): oeb.logger.info('Trimming unused files from manifest...') used = set() hrefs = oeb.manifest.hrefs diff --git a/src/calibre/ebooks/oeb/writer.py b/src/calibre/ebooks/oeb/writer.py index c84db30c98..235965b50f 100644 --- a/src/calibre/ebooks/oeb/writer.py +++ b/src/calibre/ebooks/oeb/writer.py @@ -9,13 +9,16 @@ __copyright__ = '2008, Marshall T. Vandegrift ' import sys, os, logging from calibre.ebooks.oeb.base import OPF_MIME, xml2str from calibre.ebooks.oeb.base import Logger, DirContainer, OEBBook -from calibre.utils.config import Config __all__ = ['OEBWriter'] class OEBWriter(object): DEFAULT_PROFILE = 'PRS505' + """Default renderer profile for content written with this Writer.""" + TRANSFORMS = [] + """List of transforms to apply to content written with this Writer.""" + def __init__(self, version='2.0', page_map=False, pretty_print=False): self.version = version self.page_map = page_map @@ -23,6 +26,9 @@ class OEBWriter(object): @classmethod def config(cls, cfg): + """Add any book-writing options to the :class:`Config` object + :param:`cfg`. + """ oeb = cfg.add_group('oeb', _('OPF/NCX/etc. generation options.')) versions = ['1.2', '2.0'] oeb('opf_version', ['--opf-version'], default='2.0', choices=versions, @@ -34,6 +40,7 @@ class OEBWriter(object): @classmethod def generate(cls, opts): + """Generate a Writer instance from command-line options.""" version = opts.opf_version page_map = opts.adobe_page_map pretty_print = opts.pretty_print @@ -41,6 +48,9 @@ class OEBWriter(object): pretty_print=pretty_print) def __call__(self, oeb, path): + """Read the book in the :class:`OEBBook` object :param:`oeb` to a file + at :param:`path`. 
+ """ version = int(self.version[0]) opfname = None if os.path.splitext(path)[1].lower() == '.opf': @@ -63,48 +73,3 @@ class OEBWriter(object): href = opfname output.write(href, xml2str(data, pretty_print=pretty_print)) return - - -def option_parser(): - cfg = Config('oeb', _('Options to control OEB conversion.')) - OEBWriter.config(cfg) - parser = cfg.option_parser() - parser.add_option('--encoding', default=None, - help=_('Character encoding for files. Default is to auto detect.')) - parser.add_option('-o', '--output', default=None, - help=_('Output file. Default is derived from input filename.')) - parser.add_option('-p', '--pretty-print', action='store_true', - default=False, help=_('Produce more human-readable XML output.')) - parser.add_option('-v', '--verbose', default=0, action='count', - help=_('Useful for debugging.')) - return parser - -def any2oeb(opts, inpath): - from calibre.ebooks.oeb.factory import ReaderFactory - logger = Logger(logging.getLogger('any2oeb')) - logger.setup_cli_handler(opts.verbose) - outpath = opts.output - if outpath is None: - outpath = os.path.basename(inpath) - outpath = os.path.splitext(outpath)[0] - encoding = opts.encoding - pretty_print = opts.pretty_print - oeb = OEBBook(encoding=encoding, pretty_print=pretty_print, logger=logger) - reader = ReaderFactory(inpath) - reader(oeb, inpath) - writer = OEBWriter.generate(opts) - writer(oeb, outpath) - return 0 - -def main(argv=sys.argv): - parser = option_parser() - opts, args = parser.parse_args(argv[1:]) - if len(args) != 1: - parser.print_help() - return 1 - inpath = args[0] - retval = any2oeb(opts, inpath) - return retval - -if __name__ == '__main__': - sys.exit(main()) From 459d350af3634a8ca1fbf1498f985c5a96ec325a Mon Sep 17 00:00:00 2001 From: "Marshall T. Vandegrift" Date: Wed, 11 Feb 2009 10:32:08 -0500 Subject: [PATCH 10/13] Pretty much full utility for LIT->MOBI direct conversion pipeline. 
--- src/calibre/ebooks/mobi/mobiml.py | 7 ++++++- src/calibre/ebooks/mobi/writer.py | 26 +++++++++++++++++--------- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/src/calibre/ebooks/mobi/mobiml.py b/src/calibre/ebooks/mobi/mobiml.py index b7418a5d19..534366da7d 100644 --- a/src/calibre/ebooks/mobi/mobiml.py +++ b/src/calibre/ebooks/mobi/mobiml.py @@ -84,11 +84,16 @@ class MobiMLizer(object): @classmethod def config(cls, cfg): + group = cfg.add_group('mobiml', _('Mobipocket markup options.')) + group('ignore_tables', ['--ignore-tables'], default=False, + help=_('Render HTML tables as blocks of text instead of actual ' + 'tables. This is neccessary if the HTML contains very ' + 'large or complex tables.')) return cfg @classmethod def generate(cls, opts): - return cls() + return cls(ignore_tables=opts.ignore_tables) def __call__(self, oeb, context): oeb.logger.info('Converting XHTML to Mobipocket markup...') diff --git a/src/calibre/ebooks/mobi/writer.py b/src/calibre/ebooks/mobi/writer.py index 1b5d3ae652..86ac6f6dc9 100644 --- a/src/calibre/ebooks/mobi/writer.py +++ b/src/calibre/ebooks/mobi/writer.py @@ -292,13 +292,28 @@ class Serializer(object): buffer.seek(hoff) buffer.write('%010d' % ioff) - + +class MobiFlattener(object): + def config(self, cfg): + return cfg + + def generate(self, opts): + return self + + def __call__(self, oeb, context): + fbase = context.dest.fbase + fkey = context.dest.fnums.values() + flattener = CSSFlattener( + fbase=fbase, fkey=fkey, unfloat=True, untable=True) + return flattener(oeb, context) + + class MobiWriter(object): COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+') DEFAULT_PROFILE = 'CybookG3' - TRANSFORMS = [HTMLTOCAdder, CaseMangler, CSSFlattener, SVGRasterizer, + TRANSFORMS = [HTMLTOCAdder, CaseMangler, MobiFlattener(), SVGRasterizer, ManifestTrimmer, MobiMLizer] def __init__(self, compression=None, imagemax=None, @@ -562,13 +577,6 @@ def config(defaults=None): else: c = StringConfig(defaults, desc) - mobi = 
c.add_group('mobipocket', _('Mobipocket-specific options.')) - mobi('toc_title', ['--toc-title'], default=None, - help=_('Title for any generated in-line table of contents.')) - mobi('ignore_tables', ['--ignore-tables'], default=False, - help=_('Render HTML tables as blocks of text instead of actual ' - 'tables. This is neccessary if the HTML contains very large ' - 'or complex tables.')) profiles = c.add_group('profiles', _('Device renderer profiles. ' 'Affects conversion of font sizes, image rescaling and rasterization ' 'of tables. Valid profiles are: %s.') % ', '.join(_profiles)) From cb9f9c9ff1be450663d10822c89cbd3a71deb344 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 19 Feb 2009 19:22:17 -0800 Subject: [PATCH 11/13] IGN:... --- src/calibre/customize/__init__.py | 4 +- src/calibre/customize/conversion.py | 72 +++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+), 1 deletion(-) create mode 100644 src/calibre/customize/conversion.py diff --git a/src/calibre/customize/__init__.py b/src/calibre/customize/__init__.py index 3d48f42535..b43b242fd8 100644 --- a/src/calibre/customize/__init__.py +++ b/src/calibre/customize/__init__.py @@ -220,4 +220,6 @@ class MetadataWriterPlugin(Plugin): ''' pass - + + + \ No newline at end of file diff --git a/src/calibre/customize/conversion.py b/src/calibre/customize/conversion.py new file mode 100644 index 0000000000..36b2781c9d --- /dev/null +++ b/src/calibre/customize/conversion.py @@ -0,0 +1,72 @@ +''' +Defines the plugin sytem for conversions. 
+''' +import re + +from calibre.customize import Plugin + + +class ConversionOption(object): + + ''' + Class representing conversion options + ''' + + def __init__(self, name=None, default=None, help=None, long_switch=None, + short_switch=None, choices=None, gui_label=None, + category=None): + self.name = name + self.default = default + self.help = help + self.long_switch = long_switch + self.short_switch = short_switch + self.choices = choices + self.gui_label = gui_label + self.category = category + + self.validate_parameters() + + def validate_parameters(self): + ''' + Validate the parameters passed to :method:`__init__`. + ''' + if re.match(r'[a-zA-Z_]([a-zA-Z0-9_])*', self.name) is None: + raise ValueError(self.name + ' is not a valid Python identifier') + if not (isinstance(self.default, (int, float, str, unicode)) or \ + self.default is None): + raise ValueError(unicode(self.default) + + ' is not a string or a number') + if not self.help: + raise ValueError('You must set the help text') + +class ConversionPlugin(Plugin): + + ''' + The base class for all conversion related plugins. + ''' + #: List of options + #: Each option must be a dictionary. The dictionary can contain several + #: keys defining the option. The ones marked by a * are required, the rest + #: are optional. The keys are:: + #: + #: *'name' : A valid python identifier. + #: *'default' : The default value for this option. + #: *'help' : + #: 'short_switch' : A suggestion for a short form of the command line + #: switch (for example if name is 'title', this + #: could be 't'). It is only used if no prior + #: conversion plugin has claimed it. 
+ options = [] + + type = _('Conversion') + can_be_disabled = False + supported_platforms = ['windows', 'osx', 'linux'] + + +class InputFormatPlugin(ConversionPlugin): + + #: Set of file types for which this plugin should be run + #: For example: ``set(['lit', 'mobi', 'prc'])`` + file_types = set([]) + + From 32968d2332be57cd2cc653acc9c3f06c42ae9aee Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 25 Feb 2009 10:01:31 -0800 Subject: [PATCH 12/13] IGN:... --- .pydevproject | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pydevproject b/.pydevproject index 1d5708bb48..509137a36a 100644 --- a/.pydevproject +++ b/.pydevproject @@ -2,7 +2,7 @@ -python 2.5 +python 2.6 /calibre-pluginize/src From 925a86fb0c991c51a4665cac1ff7a7f191ec39a7 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 6 Mar 2009 21:38:35 -0800 Subject: [PATCH 13/13] Beginnings of the new conversion framework. Input plugins for MOBI and EPUB. --- src/calibre/__init__.py | 92 ++--------- src/calibre/customize/builtins.py | 7 +- src/calibre/customize/conversion.py | 183 ++++++++++++++++++---- src/calibre/customize/profiles.py | 27 ++++ src/calibre/customize/ui.py | 19 ++- src/calibre/ebooks/conversion/__init__.py | 4 + src/calibre/ebooks/conversion/plumber.py | 30 ++++ src/calibre/ebooks/epub/__init__.py | 32 ---- src/calibre/ebooks/epub/input.py | 76 +++++++++ src/calibre/ebooks/mobi/input.py | 29 ++++ src/calibre/ebooks/mobi/reader.py | 167 +++++++++----------- src/calibre/utils/logging.py | 92 +++++++++++ src/calibre/utils/terminfo.py | 2 +- 13 files changed, 525 insertions(+), 235 deletions(-) create mode 100644 src/calibre/customize/profiles.py create mode 100644 src/calibre/ebooks/conversion/__init__.py create mode 100644 src/calibre/ebooks/conversion/plumber.py create mode 100644 src/calibre/ebooks/epub/input.py create mode 100644 src/calibre/ebooks/mobi/input.py create mode 100644 src/calibre/utils/logging.py diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py 
index e69d42c90a..de133ddb57 100644 --- a/src/calibre/__init__.py +++ b/src/calibre/__init__.py @@ -90,28 +90,11 @@ def prints(*args, **kwargs): if i != len(args)-1: file.write(sep) file.write(end) - file.flush() class CommandLineError(Exception): pass -class ColoredFormatter(Formatter): - def format(self, record): - ln = record.__dict__['levelname'] - col = '' - if ln == 'CRITICAL': - col = terminal_controller.YELLOW - elif ln == 'ERROR': - col = terminal_controller.RED - elif ln in ['WARN', 'WARNING']: - col = terminal_controller.BLUE - elif ln == 'INFO': - col = terminal_controller.GREEN - elif ln == 'DEBUG': - col = terminal_controller.CYAN - record.__dict__['levelname'] = col + record.__dict__['levelname'] + terminal_controller.NORMAL - return Formatter.format(self, record) def setup_cli_handlers(logger, level): @@ -335,66 +318,23 @@ def english_sort(x, y): ''' return cmp(_spat.sub('', x), _spat.sub('', y)) -class LoggingInterface: +class ColoredFormatter(Formatter): - def __init__(self, logger): - self.__logger = self.logger = logger - - def setup_cli_handler(self, verbosity): - for handler in self.__logger.handlers: - if isinstance(handler, logging.StreamHandler): - return - if os.environ.get('CALIBRE_WORKER', None) is not None and self.__logger.handlers: - return - stream = sys.stdout - formatter = logging.Formatter() - level = logging.INFO - if verbosity > 0: - formatter = ColoredFormatter('[%(levelname)s] %(message)s') if verbosity > 1 else \ - ColoredFormatter('%(levelname)s: %(message)s') - level = logging.DEBUG - if verbosity > 1: - stream = sys.stderr - - handler = logging.StreamHandler(stream) - handler.setFormatter(formatter) - handler.setLevel(level) - self.__logger.addHandler(handler) - self.__logger.setLevel(level) - - - def ___log(self, func, msg, args, kwargs): - args = [msg] + list(args) - for i in range(len(args)): - if not isinstance(args[i], basestring): - continue - if sys.version_info[:2] > (2, 5): - if not isinstance(args[i], unicode): - 
args[i] = args[i].decode(preferred_encoding, 'replace') - elif isinstance(args[i], unicode): - args[i] = args[i].encode(preferred_encoding, 'replace') - func(*args, **kwargs) - - def log_debug(self, msg, *args, **kwargs): - self.___log(self.__logger.debug, msg, args, kwargs) - - def log_info(self, msg, *args, **kwargs): - self.___log(self.__logger.info, msg, args, kwargs) - - def log_warning(self, msg, *args, **kwargs): - self.___log(self.__logger.warning, msg, args, kwargs) - - def log_warn(self, msg, *args, **kwargs): - self.___log(self.__logger.warning, msg, args, kwargs) - - def log_error(self, msg, *args, **kwargs): - self.___log(self.__logger.error, msg, args, kwargs) - - def log_critical(self, msg, *args, **kwargs): - self.___log(self.__logger.critical, msg, args, kwargs) - - def log_exception(self, msg, *args): - self.___log(self.__logger.exception, msg, args, {}) + def format(self, record): + ln = record.__dict__['levelname'] + col = '' + if ln == 'CRITICAL': + col = terminal_controller.YELLOW + elif ln == 'ERROR': + col = terminal_controller.RED + elif ln in ['WARN', 'WARNING']: + col = terminal_controller.BLUE + elif ln == 'INFO': + col = terminal_controller.GREEN + elif ln == 'DEBUG': + col = terminal_controller.CYAN + record.__dict__['levelname'] = col + record.__dict__['levelname'] + terminal_controller.NORMAL + return Formatter.format(self, record) def walk(dir): ''' A nice interface to os.walk ''' diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 14d3c79062..fafe8e5afa 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -242,8 +242,13 @@ class MOBIMetadataWriter(MetadataWriterPlugin): set_metadata(stream, mi) -plugins = [HTML2ZIP] +from calibre.ebooks.epub.input import EPUBInput +from calibre.ebooks.mobi.input import MOBIInput +from calibre.customize.profiles import input_profiles + +plugins = [HTML2ZIP, EPUBInput, MOBIInput] plugins += [x for x in list(locals().values()) 
if isinstance(x, type) and \ x.__name__.endswith('MetadataReader')] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ x.__name__.endswith('MetadataWriter')] +plugins += input_profiles \ No newline at end of file diff --git a/src/calibre/customize/conversion.py b/src/calibre/customize/conversion.py index 36b2781c9d..aa7b0c1dea 100644 --- a/src/calibre/customize/conversion.py +++ b/src/calibre/customize/conversion.py @@ -1,28 +1,30 @@ +from __future__ import with_statement ''' Defines the plugin sytem for conversions. ''' -import re +import re, os, shutil +from lxml import html + +from calibre import CurrentDir from calibre.customize import Plugin - class ConversionOption(object): ''' Class representing conversion options ''' - def __init__(self, name=None, default=None, help=None, long_switch=None, - short_switch=None, choices=None, gui_label=None, - category=None): + def __init__(self, name=None, help=None, long_switch=None, + short_switch=None, choices=None): self.name = name - self.default = default self.help = help self.long_switch = long_switch self.short_switch = short_switch self.choices = choices - self.gui_label = gui_label - self.category = category + + if self.long_switch is None: + self.long_switch = '--'+self.name.replace('_', '-') self.validate_parameters() @@ -32,41 +34,156 @@ class ConversionOption(object): ''' if re.match(r'[a-zA-Z_]([a-zA-Z0-9_])*', self.name) is None: raise ValueError(self.name + ' is not a valid Python identifier') - if not (isinstance(self.default, (int, float, str, unicode)) or \ - self.default is None): + if not self.help: + raise ValueError('You must set the help text') + + +class OptionRecommendation(object): + LOW = 1 + MED = 2 + HIGH = 3 + + def __init__(self, recommeded_value, level=LOW, **kwargs): + ''' + An option recommendation. That is, an option as well as its recommended + value and the level of the recommendation. 
+ ''' + self.level = level + self.recommended_value = recommeded_value + self.option = kwargs.pop('option', None) + if self.option is None: + self.option = ConversionOption(**kwargs) + + self.validate_parameters() + + def validate_parameters(self): + if self.option.choices and self.recommended_value not in \ + self.option.choices: + raise ValueError('Recommended value not in choices') + if not (isinstance(self.recommended_value, (int, float, str, unicode))\ + or self.default is None): raise ValueError(unicode(self.default) + ' is not a string or a number') - if not self.help: - raise ValueError('You must set the help text') + -class ConversionPlugin(Plugin): - +class InputFormatPlugin(Plugin): ''' - The base class for all conversion related plugins. + InputFormatPlugins are responsible for converting a document into + HTML+OPF+CSS+etc. + The results of the conversion *must* be encoded in UTF-8. + The main action happens in :method:`convert`. ''' - #: List of options - #: Each option must be a dictionary. The dictionary can contain several - #: keys defining the option. The ones marked by a * are required, the rest - #: are optional. The keys are:: - #: - #: *'name' : A valid python identifier. - #: *'default' : The default value for this option. - #: *'help' : - #: 'short_switch' : A suggestion for a short form of the command line - #: switch (for example if name is 'title', this - #: could be 't'). It is only used if no prior - #: conversion plugin has claimed it. - options = [] - type = _('Conversion') + type = _('Conversion Input') can_be_disabled = False supported_platforms = ['windows', 'osx', 'linux'] - -class InputFormatPlugin(ConversionPlugin): - #: Set of file types for which this plugin should be run - #: For example: ``set(['lit', 'mobi', 'prc'])`` + #: For example: ``set(['azw', 'mobi', 'prc'])`` file_types = set([]) + #: Options shared by all Input format plugins. Do not override + #: in sub-classes. Use :member:`options` instead. 
Every option must be an + #: instance of :class:`OptionRecommendation`. + common_options = set([ + OptionRecommendation(name='debug_input', + recommended_value=None, level=OptionRecommendation.LOW, + help=_('Save the output from the input plugin to the specified ' + 'directory. Useful if you are unsure at which stage ' + 'of the conversion process a bug is occurring. ' + 'WARNING: This completely deletes the contents of ' + 'the specified directory.') + ), + + OptionRecommendation(name='input_encoding', + recommended_value=None, level=OptionRecommendation.LOW, + help=_('Specify the character encoding of the input document. If ' + 'set this option will override any encoding declared by the ' + 'document itself. Particularly useful for documents that ' + 'do not declare an encoding or that have erroneous ' + 'encoding declarations.') + ), + + ]) + + #: Options to customize the behavior of this plugin. Every option must be an + #: instance of :class:`OptionRecommendation`. + options = set([]) + + def convert(self, stream, options, file_ext, parse_cache, log): + ''' + This method must be implemented in sub-classes. It must return + the path to the created OPF file. All output should be contained in + the current directory. If this plugin creates files outside the current + directory they must be deleted/marked for deletion before this method + returns. + + :param stream: A file like object that contains the input file. + + :param options: Options to customize the conversion process. + Guaranteed to have attributes corresponding + to all the options declared by this plugin. In + addition, it will have a verbose attribute that + takes integral values from zero upwards. Higher numbers + mean be more verbose. Another useful attribute is + ``input_profile`` that is an instance of + :class:`calibre.customize.profiles.InputProfile`. + + :param file_ext: The extension (without the .) of the input file. 
It + is guaranteed to be one of the `file_types` supported + by this plugin. + + :param parse_cache: A dictionary that maps absolute file paths to + parsed representations of their contents. For + HTML the representation is an lxml element of + the root of the tree. For CSS it is a cssutils + stylesheet. If this plugin parses any of the + output files, it should add them to the cache + so that later stages of the conversion wont + have to re-parse them. If a parsed representation + is in the cache, there is no need to actually + write the file to disk. + + :param log: A :class:`calibre.utils.logging.Log` object. All output + should use this object. + ''' + raise NotImplementedError + + def __call__(self, stream, options, file_ext, parse_cache, log, output_dir): + log('InputFormatPlugin: %s running'%self.name, end=' ') + if hasattr(stream, 'name'): + log('on', stream.name) + + with CurrentDir(output_dir): + for x in os.listdir('.'): + shutil.rmtree(x) if os.path.isdir(x) else os.remove(x) + + + ret = self.convert(stream, options, file_ext, parse_cache, log) + for key in list(parse_cache.keys()): + if os.path.abspath(key) != key: + log.warn(('InputFormatPlugin: %s returned a ' + 'relative path: %s')%(self.name, key) + ) + parse_cache[os.path.abspath(key)] = parse_cache.pop(key) + + if options.debug_input is not None: + options.debug_input = os.path.abspath(options.debug_input) + if not os.path.exists(options.debug_input): + os.makedirs(options.debug_input) + shutil.rmtree(options.debug_input) + for f, obj in parse_cache.items(): + if hasattr(obj, 'cssText'): + raw = obj.cssText + else: + raw = html.tostring(obj, encoding='utf-8', method='xml', + include_meta_content_type=True, pretty_print=True) + if isinstance(raw, unicode): + raw = raw.encode('utf-8') + open(f, 'wb').write(raw) + shutil.copytree('.', options.debug_input) + + + + return ret diff --git a/src/calibre/customize/profiles.py b/src/calibre/customize/profiles.py new file mode 100644 index 
0000000000..002f56879f --- /dev/null +++ b/src/calibre/customize/profiles.py @@ -0,0 +1,27 @@ +from __future__ import with_statement +__license__ = 'GPL 3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +from calibre.customize import Plugin + +class InputProfile(Plugin): + + author = 'Kovid Goyal' + supported_platforms = set(['windows', 'osx', 'linux']) + can_be_disabled = False + type = _('Input profile') + +# TODO: Add some real information to this profile. All other profiles must +# inherit from this profile and override as needed + + name = 'Default Input Profile' + short_name = 'default' # Used in the CLI so dont spaces etc. in it + description = _('This profile tries to provide sane defaults and is useful ' + 'if you know nothing about the input document.') + +input_profiles = [InputProfile] + + + + diff --git a/src/calibre/customize/ui.py b/src/calibre/customize/ui.py index 95bf01ff6d..1cdafae4f0 100644 --- a/src/calibre/customize/ui.py +++ b/src/calibre/customize/ui.py @@ -6,13 +6,14 @@ import os, shutil, traceback, functools, sys from calibre.customize import Plugin, FileTypePlugin, MetadataReaderPlugin, \ MetadataWriterPlugin +from calibre.customize.conversion import InputFormatPlugin +from calibre.customize.profiles import InputProfile from calibre.customize.builtins import plugins as builtin_plugins from calibre.constants import __version__, iswindows, isosx from calibre.ebooks.metadata import MetaInformation from calibre.utils.config import make_config_dir, Config, ConfigProxy, \ plugin_dir, OptionParser - version = tuple([int(x) for x in __version__.split('.')]) platform = 'linux' @@ -70,7 +71,10 @@ _on_import = {} _on_preprocess = {} _on_postprocess = {} - +def input_profiles(): + for plugin in _initialized_plugins: + if isinstance(plugin, InputProfile): + yield plugin def reread_filetype_plugins(): global _on_import @@ -234,6 +238,17 @@ def find_plugin(name): if plugin.name == name: return plugin +def 
input_format_plugins(): + for plugin in _initialized_plugins: + if isinstance(plugin, InputFormatPlugin): + yield plugin + +def plugin_for_input_format(fmt): + for plugin in input_format_plugins(): + if fmt in plugin.file_types: + return plugin + + def disable_plugin(plugin_or_name): x = getattr(plugin_or_name, 'name', plugin_or_name) plugin = find_plugin(x) diff --git a/src/calibre/ebooks/conversion/__init__.py b/src/calibre/ebooks/conversion/__init__.py new file mode 100644 index 0000000000..384ccfb79c --- /dev/null +++ b/src/calibre/ebooks/conversion/__init__.py @@ -0,0 +1,4 @@ +from __future__ import with_statement +__license__ = 'GPL 3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py new file mode 100644 index 0000000000..ac7490bd39 --- /dev/null +++ b/src/calibre/ebooks/conversion/plumber.py @@ -0,0 +1,30 @@ +from __future__ import with_statement +__license__ = 'GPL 3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + + +from calibre.customize.conversion import OptionRecommendation +from calibre.customize.ui import input_profiles + +pipeline_options = [ + +OptionRecommendation(name='verbose', + recommended_value=0, level=OptionRecommendation.LOW, + short_switch='v', + help=_('Level of verbosity. Specify multiple times for greater ' + 'verbosity.') + ), + + +OptionRecommendation(name='input_profile', + recommended_value='default', level=OptionRecommendation.LOW, + choices=[x.short_name for x in input_profiles()], + help=_('Specify the input profile. The input profile gives the ' + 'conversion system information on how to interpret ' + 'various information in the input document. For ' + 'example resolution dependent lengths (i.e. 
lengths in ' + 'pixels).') + ), + +] \ No newline at end of file diff --git a/src/calibre/ebooks/epub/__init__.py b/src/calibre/ebooks/epub/__init__.py index aa17024d50..989391902b 100644 --- a/src/calibre/ebooks/epub/__init__.py +++ b/src/calibre/ebooks/epub/__init__.py @@ -40,38 +40,6 @@ def rules(stylesheets): if r.type == r.STYLE_RULE: yield r -def decrypt_font(key, path): - raw = open(path, 'rb').read() - crypt = raw[:1024] - key = cycle(iter(key)) - decrypt = ''.join([chr(ord(x)^key.next()) for x in crypt]) - with open(path, 'wb') as f: - f.write(decrypt) - f.write(raw[1024:]) - -def process_encryption(encfile, opf): - key = None - m = re.search(r'(?i)(urn:uuid:[0-9a-f-]+)', open(opf, 'rb').read()) - if m: - key = m.group(1) - key = list(map(ord, uuid.UUID(key).bytes)) - try: - root = etree.parse(encfile) - for em in root.xpath('descendant::*[contains(name(), "EncryptionMethod")]'): - algorithm = em.get('Algorithm', '') - if algorithm != 'http://ns.adobe.com/pdf/enc#RC': - return False - cr = em.getparent().xpath('descendant::*[contains(name(), "CipherReference")]')[0] - uri = cr.get('URI') - path = os.path.abspath(os.path.join(os.path.dirname(encfile), '..', *uri.split('/'))) - if os.path.exists(path): - decrypt_font(key, path) - return True - except: - import traceback - traceback.print_exc() - return False - def initialize_container(path_to_container, opf_name='metadata.opf'): ''' Create an empty EPUB document, with a default skeleton. 
diff --git a/src/calibre/ebooks/epub/input.py b/src/calibre/ebooks/epub/input.py new file mode 100644 index 0000000000..1b69424a9e --- /dev/null +++ b/src/calibre/ebooks/epub/input.py @@ -0,0 +1,76 @@ +from __future__ import with_statement +__license__ = 'GPL 3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import os, re, uuid +from itertools import cycle + +from lxml import etree + +from calibre.customize.conversion import InputFormatPlugin + +class EPUBInput(InputFormatPlugin): + + name = 'EPUB Input' + author = 'Kovid Goyal' + description = 'Convert EPUB files (.epub) to HTML' + file_types = set(['epub']) + + @classmethod + def decrypt_font(cls, key, path): + raw = open(path, 'rb').read() + crypt = raw[:1024] + key = cycle(iter(key)) + decrypt = ''.join([chr(ord(x)^key.next()) for x in crypt]) + with open(path, 'wb') as f: + f.write(decrypt) + f.write(raw[1024:]) + + @classmethod + def process_ecryption(cls, encfile, opf, log): + key = None + m = re.search(r'(?i)(urn:uuid:[0-9a-f-]+)', open(opf, 'rb').read()) + if m: + key = m.group(1) + key = list(map(ord, uuid.UUID(key).bytes)) + try: + root = etree.parse(encfile) + for em in root.xpath('descendant::*[contains(name(), "EncryptionMethod")]'): + algorithm = em.get('Algorithm', '') + if algorithm != 'http://ns.adobe.com/pdf/enc#RC': + return False + cr = em.getparent().xpath('descendant::*[contains(name(), "CipherReference")]')[0] + uri = cr.get('URI') + path = os.path.abspath(os.path.join(os.path.dirname(encfile), '..', *uri.split('/'))) + if os.path.exists(path): + cls.decrypt_font(key, path) + return True + except: + import traceback + traceback.print_exc() + return False + + def convert(self, stream, options, file_ext, parse_cache, log): + from calibre.utils.zipfile import ZipFile + from calibre import walk + from calibre.ebooks import DRMError + zf = ZipFile(stream) + zf.extractall(os.getcwd()) + encfile = os.path.abspath(os.path.join('META-INF', 'encryption.xml')) + opf = 
None + for f in walk('.'): + if f.lower().endswith('.opf'): + opf = f + break + path = getattr(stream, 'name', 'stream') + + if opf is None: + raise ValueError('%s is not a valid EPUB file'%path) + + if os.path.exists(encfile): + if not self.process_encryption(encfile, opf, log): + raise DRMError(os.path.basename(path)) + + return opf + diff --git a/src/calibre/ebooks/mobi/input.py b/src/calibre/ebooks/mobi/input.py new file mode 100644 index 0000000000..1ce9950677 --- /dev/null +++ b/src/calibre/ebooks/mobi/input.py @@ -0,0 +1,29 @@ +from __future__ import with_statement +__license__ = 'GPL 3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import os + +from calibre.customize.conversion import InputFormatPlugin + +class MOBIInput(InputFormatPlugin): + + name = 'MOBI Input' + author = 'Kovid Goyal' + description = 'Convert MOBI files (.mobi, .prc, .azw) to HTML' + file_types = set(['mobi', 'prc', 'azw']) + + def convert(self, stream, options, file_ext, parse_cache, log): + from calibre.ebooks.mobi.reader import MobiReader + mr = MobiReader(stream, log, options.input_encoding, + options.debug_input) + mr.extract_content(output_dir=os.getcwdu(), parse_cache) + raw = parse_cache.get('calibre_raw_mobi_markup', False) + if raw: + if isinstance(raw, unicode): + raw = raw.encode('utf-8') + open('debug-raw.html', 'wb').write(raw) + + return mr.created_opf_path + diff --git a/src/calibre/ebooks/mobi/reader.py b/src/calibre/ebooks/mobi/reader.py index 2c80cc1c8c..18663660b4 100644 --- a/src/calibre/ebooks/mobi/reader.py +++ b/src/calibre/ebooks/mobi/reader.py @@ -5,7 +5,7 @@ __copyright__ = '2008, Kovid Goyal ' Read data from .mobi files ''' -import sys, struct, os, cStringIO, re, functools +import struct, os, cStringIO, re, functools try: from PIL import Image as PILImage @@ -35,8 +35,10 @@ class EXTHHeader(object): pos = 0 self.mi = MetaInformation(_('Unknown'), [_('Unknown')]) self.has_fake_cover = True + left = self.num_items - for i in 
range(self.num_items): + while left > 0: + left -= 1 id, size = struct.unpack('>LL', raw[pos:pos+8]) content = raw[pos+8:pos+size] pos += size @@ -76,7 +78,8 @@ class EXTHHeader(object): class BookHeader(object): - def __init__(self, raw, ident): + def __init__(self, raw, ident, user_encoding, log): + self.log = log self.compression_type = raw[:2] self.records, self.records_size = struct.unpack('>HH', raw[8:12]) self.encryption_type, = struct.unpack('>H', raw[12:14]) @@ -92,8 +95,8 @@ class BookHeader(object): else: self.ancient = False self.doctype = raw[16:20] - self.length, self.type, self.codepage, self.unique_id, self.version = \ - struct.unpack('>LLLLL', raw[20:40]) + self.length, self.type, self.codepage, self.unique_id, \ + self.version = struct.unpack('>LLLLL', raw[20:40]) try: @@ -102,8 +105,9 @@ class BookHeader(object): 65001 : 'utf-8', }[self.codepage] except (IndexError, KeyError): - print '[WARNING] Unknown codepage %d. Assuming cp-1252'%self.codepage - self.codec = 'cp1252' + self.codec = 'cp1252' if user_encoding is None else user_encoding + log.warn('Unknown codepage %d. 
Assuming %s'%(self.codepage, + self.codec)) if ident == 'TEXTREAD' or self.length < 0xE4 or 0xE8 < self.length: self.extra_flags = 0 @@ -138,9 +142,24 @@ class MobiReader(object): PAGE_BREAK_PAT = re.compile(r'(<[/]{0,1}mbp:pagebreak\s*[/]{0,1}>)+', re.IGNORECASE) IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex') - def __init__(self, filename_or_stream, verbose=False): - self.verbose = verbose + def __init__(self, filename_or_stream, log, user_encoding=None, debug=None): + self.log = log + self.debug = debug self.embedded_mi = None + self.base_css_rules = ''' + blockquote { margin: 0em 0em 0em 1.25em; text-align: justify } + + p { margin: 0em; text-align: justify } + + .bold { font-weight: bold } + + .italic { font-style: italic } + + .mbp_pagebreak { + page-break-after: always; margin: 0; display: block + } + ''' + self.tag_css_rules = [] if hasattr(filename_or_stream, 'read'): stream = filename_or_stream @@ -177,17 +196,21 @@ class MobiReader(object): self.sections.append((section(i), self.section_headers[i])) - self.book_header = BookHeader(self.sections[0][0], self.ident) + self.book_header = BookHeader(self.sections[0][0], self.ident, + user_encoding, self.log) self.name = self.name.decode(self.book_header.codec, 'replace') - def extract_content(self, output_dir=os.getcwdu()): + def extract_content(self, output_dir, parse_cache): output_dir = os.path.abspath(output_dir) if self.book_header.encryption_type != 0: raise DRMError(self.name) processed_records = self.extract_text() + if self.debug is not None: + self.parse_cache['calibre_raw_mobi_markup'] = self.mobi_html self.add_anchors() - self.processed_html = self.processed_html.decode(self.book_header.codec, 'ignore') + self.processed_html = self.processed_html.decode(self.book_header.codec, + 'ignore') for pat in ENCODING_PATS: self.processed_html = pat.sub('', self.processed_html) e2u = functools.partial(entity_to_unicode, @@ -203,16 +226,10 @@ class MobiReader(object): self.processed_html = \ 
re.compile('', re.IGNORECASE).sub( '\n\n' - '\n', + '\t\n', self.processed_html) - if self.verbose: - print 'Parsing HTML...' + self.log.debug('Parsing HTML...') root = html.fromstring(self.processed_html) self.upshift_markup(root) guides = root.xpath('//guide') @@ -230,25 +247,24 @@ class MobiReader(object): ref.attrib['href'] = os.path.basename(htmlfile)+ref.attrib['href'] except AttributeError: pass - if self.verbose: - print 'Serializing...' - with open(htmlfile, 'wb') as f: - raw = html.tostring(root, encoding='utf-8', method='xml', - include_meta_content_type=True, pretty_print=True) - raw = raw.replace('', - '\n\n') - f.write(raw) + parse_cache[htmlfile] = root self.htmlfile = htmlfile - if self.book_header.exth is not None or self.embedded_mi is not None: - if self.verbose: - print 'Creating OPF...' - ncx = cStringIO.StringIO() - opf = self.create_opf(htmlfile, guide, root) - opf.render(open(os.path.splitext(htmlfile)[0]+'.opf', 'wb'), ncx) - ncx = ncx.getvalue() - if ncx: - open(os.path.splitext(htmlfile)[0]+'.ncx', 'wb').write(ncx) + self.log.debug('Creating OPF...') + ncx = cStringIO.StringIO() + opf = self.create_opf(htmlfile, guide, root) + self.created_opf_path = os.path.splitext(htmlfile)[0]+'.opf' + opf.render(open(self.created_opf_path, 'wb'), ncx) + ncx = ncx.getvalue() + if ncx: + open(os.path.splitext(htmlfile)[0]+'.ncx', 'wb').write(ncx) + + with open('styles.css', 'wb') as s: + s.write(self.base_css_rules+'\n\n') + for rule in self.tag_css_rules: + if isinstance(rule, unicode): + rule = rule.encode('utf-8') + s.write(rule+'\n\n') def read_embedded_metadata(self, root, elem, guide): raw = ''+html.tostring(elem, encoding='utf-8')+'' @@ -277,8 +293,7 @@ class MobiReader(object): def cleanup_html(self): - if self.verbose: - print 'Cleaning up HTML...' + self.log.debug('Cleaning up HTML...') self.processed_html = re.sub(r'
', '', self.processed_html) if self.book_header.ancient and '')+'' @@ -286,8 +301,7 @@ class MobiReader(object): self.processed_html = self.processed_html.replace('> <', '>\n<') def upshift_markup(self, root): - if self.verbose: - print 'Converting style information to CSS...' + self.log.debug('Converting style information to CSS...') size_map = { 'xx-small' : '0.5', 'x-small' : '1', @@ -298,7 +312,7 @@ class MobiReader(object): 'xx-large' : '6', } mobi_version = self.book_header.mobi_version - for tag in root.iter(etree.Element): + for i, tag in enumerate(root.iter(etree.Element)): if tag.tag in ('country-region', 'place', 'placetype', 'placename', 'state', 'city'): tag.tag = 'span' @@ -352,8 +366,7 @@ class MobiReader(object): elif tag.tag == 'pre': if not tag.text: tag.tag = 'div' - if styles: - attrib['style'] = '; '.join(styles) + if 'filepos-id' in attrib: attrib['id'] = attrib.pop('filepos-id') if 'filepos' in attrib: @@ -362,15 +375,24 @@ class MobiReader(object): attrib['href'] = "#filepos%d" % int(filepos) except ValueError: pass + + if styles: + attrib['id'] = attrib.get('id', 'calibre_mr_gid%d'%i) + self.tag_css_rules.append('#%s {%s}'%(attrib['id'], + '; '.join(styles))) + def create_opf(self, htmlfile, guide=None, root=None): mi = getattr(self.book_header.exth, 'mi', self.embedded_mi) + if mi is None: + mi = MetaInformation(self.title, [_('Unknown')]) opf = OPFCreator(os.path.dirname(htmlfile), mi) if hasattr(self.book_header.exth, 'cover_offset'): opf.cover = 'images/%05d.jpg'%(self.book_header.exth.cover_offset+1) elif mi.cover is not None: opf.cover = mi.cover - manifest = [(htmlfile, 'text/x-oeb1-document')] + manifest = [(htmlfile, 'text/x-oeb1-document'), + (os.path.abspath('styles.css'), 'text/css')] bp = os.path.dirname(htmlfile) for i in getattr(self, 'image_names', []): manifest.append((os.path.join(bp, 'images/', i), 'image/jpeg')) @@ -441,8 +463,7 @@ class MobiReader(object): return data[:len(data)-trail_size] def extract_text(self): - if 
self.verbose: - print 'Extracting text...' + self.log.debug('Extracting text...') text_sections = [self.text_section(i) for i in range(1, self.book_header.records+1)] processed_records = list(range(0, self.book_header.records+1)) @@ -472,12 +493,11 @@ class MobiReader(object): def replace_page_breaks(self): self.processed_html = self.PAGE_BREAK_PAT.sub( - '
', + '
', self.processed_html) def add_anchors(self): - if self.verbose: - print 'Adding anchors...' + self.log.debug('Adding anchors...') positions = set([]) link_pattern = re.compile(r'''<[^<>]+filepos=['"]{0,1}(\d+)[^<>]*>''', re.IGNORECASE) @@ -507,8 +527,7 @@ class MobiReader(object): def extract_images(self, processed_records, output_dir): - if self.verbose: - print 'Extracting images...' + self.log.debug('Extracting images...') output_dir = os.path.abspath(os.path.join(output_dir, 'images')) if not os.path.exists(output_dir): os.makedirs(output_dir) @@ -535,14 +554,17 @@ class MobiReader(object): im.convert('RGB').save(open(path, 'wb'), format='JPEG') def get_metadata(stream): - mr = MobiReader(stream) + from calibre.utils.logging import Log + log = Log() + mr = MobiReader(stream, log) if mr.book_header.exth is None: mi = MetaInformation(mr.name, [_('Unknown')]) else: mi = mr.create_opf('dummy.html') try: if hasattr(mr.book_header.exth, 'cover_offset'): - cover_index = mr.book_header.first_image_index + mr.book_header.exth.cover_offset + cover_index = mr.book_header.first_image_index + \ + mr.book_header.exth.cover_offset data = mr.sections[int(cover_index)][0] else: data = mr.sections[mr.book_header.first_image_index][0] @@ -552,42 +574,7 @@ def get_metadata(stream): im.convert('RGBA').save(obuf, format='JPEG') mi.cover_data = ('jpg', obuf.getvalue()) except: - import traceback - traceback.print_exc() + log.exception() return mi -def option_parser(): - from calibre.utils.config import OptionParser - parser = OptionParser(usage=_('%prog [options] myebook.mobi')) - parser.add_option('-o', '--output-dir', default='.', - help=_('Output directory. 
Defaults to current directory.')) - parser.add_option('-v', '--verbose', default=False, action='store_true', - help='Useful for debugging.') - return parser - - -def main(args=sys.argv): - parser = option_parser() - opts, args = parser.parse_args(args) - if len(args) != 2: - parser.print_help() - return 1 - - mr = MobiReader(args[1], verbose=opts.verbose) - opts.output_dir = os.path.abspath(opts.output_dir) - mr.extract_content(opts.output_dir) - if opts.verbose: - oname = os.path.join(opts.output_dir, 'debug-raw.html') - dat = mr.mobi_html - if isinstance(dat, unicode): - dat = dat.encode('utf-8') - open(oname, 'wb').write(dat) - print _('Raw MOBI HTML saved in'), oname - - print _('OEB ebook created in'), opts.output_dir - - return 0 - -if __name__ == '__main__': - sys.exit(main()) diff --git a/src/calibre/utils/logging.py b/src/calibre/utils/logging.py new file mode 100644 index 0000000000..ae2e1a792b --- /dev/null +++ b/src/calibre/utils/logging.py @@ -0,0 +1,92 @@ +from __future__ import with_statement +__license__ = 'GPL 3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +'A simplified logging system' + +DEBUG = 0 +INFO = 1 +WARN = 2 +ERROR = 3 + +import sys, traceback +from functools import partial + +from calibre import prints +from calibre.utils.terminfo import TerminalController + +class ANSIStream: + + def __init__(self, stream=sys.stdout): + self.stream = stream + tc = TerminalController(stream) + self.color = { + DEBUG: tc.GREEN, + INFO:'', + WARN: tc.YELLOW, + ERROR: tc.RED + } + self.normal = tc.NORMAL + + def prints(self, level, *args, **kwargs): + self.stream.write(self.color[level]) + kwargs['file'] = self.stream + prints(*args, **kwargs) + self.stream.write(self.normal) + + def flush(self): + self.stream.flush() + +class HTMLStream: + + def __init__(self, stream=sys.stdout): + self.stream = stream + self.color = { + DEBUG: '', + INFO:'', + WARN: '', + ERROR: '' + } + self.normal = '' + + def prints(self, level, 
*args, **kwargs): + self.stream.write(self.color[level]) + kwargs['file'] = self.stream + prints(*args, **kwargs) + self.stream.write(self.normal) + + def flush(self): + self.stream.flush() + +class Log(object): + + DEBUG = DEBUG + INFO = INFO + WARN = WARN + ERROR = ERROR + + def __init__(self, level=INFO): + self.filter_level = level + default_output = ANSIStream() + self.outputs = [default_output] + + self.debug = partial(self.prints, DEBUG) + self.info = partial(self.prints, INFO) + self.warn = self.warning = partial(self.prints, WARN) + self.error = partial(self.prints, ERROR) + + + def prints(self, level, *args, **kwargs): + if level < self.filter_level: + return + for output in self.outputs: + output.prints(level, *args, **kwargs) + + def exception(self, *args, **kwargs): + limit = kwargs.pop('limit', None) + self.prints(ERROR, *args, **kwargs) + self.prints(DEBUG, traceback.format_exc(limit)) + + def __call__(self, *args, **kwargs): + self.prints(INFO, *args, **kwargs) \ No newline at end of file diff --git a/src/calibre/utils/terminfo.py b/src/calibre/utils/terminfo.py index 075c0e694d..fd394cbfe9 100644 --- a/src/calibre/utils/terminfo.py +++ b/src/calibre/utils/terminfo.py @@ -33,7 +33,7 @@ class TerminalController: >>> term = TerminalController() >>> if term.CLEAR_SCREEN: - ... print 'This terminal supports clearning the screen.' + ... print 'This terminal supports clearing the screen.' Finally, if the width and height of the terminal are known, then they will be stored in the `COLS` and `LINES` attributes.