diff --git a/src/calibre/ebooks/lit/reader.py b/src/calibre/ebooks/lit/reader.py index 02fc98d9df..dd42434101 100644 --- a/src/calibre/ebooks/lit/reader.py +++ b/src/calibre/ebooks/lit/reader.py @@ -7,21 +7,25 @@ __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ' \ 'and Marshall T. Vandegrift ' -import sys, struct, cStringIO, os +import sys, struct, os import functools import re from urlparse import urldefrag +from cStringIO import StringIO from urllib import unquote as urlunquote from lxml import etree from calibre.ebooks.lit import LitError from calibre.ebooks.lit.maps import OPF_MAP, HTML_MAP import calibre.ebooks.lit.mssha1 as mssha1 from calibre.ebooks.oeb.base import urlnormalize +from calibre.ebooks.oeb.reader import OEBReader from calibre.ebooks import DRMError from calibre import plugins lzx, lxzerror = plugins['lzx'] msdes, msdeserror = plugins['msdes'] +__all__ = ["LitReader"] + XML_DECL = """ """ OPF_DECL = """ @@ -109,6 +113,9 @@ def consume_sized_utf8_string(bytes, zpad=False): pos += 1 return u''.join(result), bytes[pos:] +def encode(string): + return unicode(string).encode('ascii', 'xmlcharrefreplace') + class UnBinary(object): AMPERSAND_RE = re.compile( r'&(?!(?:#[0-9]+|#x[0-9a-fA-F]+|[a-zA-Z_:][a-zA-Z0-9.-_:]+);)') @@ -119,13 +126,13 @@ class UnBinary(object): def __init__(self, bin, path, manifest={}, map=HTML_MAP): self.manifest = manifest self.tag_map, self.attr_map, self.tag_to_attr_map = map - self.opf = map is OPF_MAP - self.bin = bin + self.is_html = map is HTML_MAP self.dir = os.path.dirname(path) - self.buf = cStringIO.StringIO() - self.binary_to_text() - self.raw = self.buf.getvalue().lstrip().decode('utf-8') + buf = StringIO() + self.binary_to_text(bin, buf) + self.raw = buf.getvalue().lstrip() self.escape_reserved() + self._tree = None def escape_reserved(self): raw = self.raw @@ -152,18 +159,20 @@ class UnBinary(object): return '/'.join(relpath) def __unicode__(self): + return self.raw.decode('utf-8') + + def __str__(self): return self.raw - - def binary_to_text(self, base=0, depth=0): + + def binary_to_text(self, bin, buf, index=0, depth=0): tag_name = current_map = None dynamic_tag = errors = 0 in_censorship = is_goingdown = False state = 'text' - index = base flags = 0 - while index < len(self.bin): - c, index = read_utf8_char(self.bin, index) + while index < len(bin): + c, index = read_utf8_char(bin, index) oc = ord(c) if state == 'text': @@ -176,7 +185,7 @@ class UnBinary(object): c = '>>' elif c == '<': c = '<<' - self.buf.write(c.encode('ascii', 'xmlcharrefreplace')) + buf.write(encode(c)) elif state == 'get flags': if oc == 0: @@ -189,7 +198,7 @@ class UnBinary(object): state = 'text' if oc == 0 else 'get attr' if flags & FLAG_OPENING: tag = oc - self.buf.write('<') + buf.write('<') if not (flags & FLAG_CLOSING): is_goingdown = True if tag == 0x8000: @@ -206,7 +215,7 @@ class UnBinary(object): tag_name = '?'+unichr(tag)+'?' current_map = self.tag_to_attr_map[tag] print 'WARNING: tag %s unknown' % unichr(tag) - self.buf.write(unicode(tag_name).encode('utf-8')) + buf.write(encode(tag_name)) elif flags & FLAG_CLOSING: if depth == 0: raise LitError('Extra closing tag') @@ -218,15 +227,14 @@ class UnBinary(object): if not is_goingdown: tag_name = None dynamic_tag = 0 - self.buf.write(' />') + buf.write(' />') else: - self.buf.write('>') - index = self.binary_to_text(base=index, depth=depth+1) + buf.write('>') + index = self.binary_to_text(bin, buf, index, depth+1) is_goingdown = False if not tag_name: raise LitError('Tag ends before it begins.') - self.buf.write(u''.join( - ('')).encode('utf-8')) + buf.write(encode(u''.join(('')))) dynamic_tag = 0 tag_name = None state = 'text' @@ -246,7 +254,7 @@ class UnBinary(object): in_censorship = True state = 'get value length' continue - self.buf.write(' ' + unicode(attr).encode('utf-8') + '=') + buf.write(' ' + encode(attr) + '=') if attr in ['href', 'src']: state = 'get href length' else: @@ -254,40 +262,39 @@ class UnBinary(object): elif state == 'get value length': if not in_censorship: - self.buf.write('"') + buf.write('"') count = oc - 1 if count == 0: if not in_censorship: - self.buf.write('"') + buf.write('"') in_censorship = False state = 'get attr' continue state = 'get value' if oc == 0xffff: continue - if count < 0 or count > (len(self.bin) - index): + if count < 0 or count > (len(bin) - index): raise LitError('Invalid character count %d' % count) elif state == 'get value': if count == 0xfffe: if not in_censorship: - self.buf.write('%s"' % (oc - 1)) + buf.write('%s"' % (oc - 1)) in_censorship = False state = 'get attr' elif count > 0: if not in_censorship: - self.buf.write(c.encode( - 'ascii', 'xmlcharrefreplace')) + buf.write(encode(c)) count -= 1 if count == 0: if not in_censorship: - self.buf.write('"') + buf.write('"') in_censorship = False state = 'get attr' elif state == 'get custom length': count = oc - 1 - if count <= 0 or count > len(self.bin)-index: + if count <= 0 or count > len(bin)-index: raise LitError('Invalid character count %d' % count) dynamic_tag += 1 state = 'get custom' @@ -297,26 +304,26 @@ class UnBinary(object): tag_name += c count -= 1 if count == 0: - self.buf.write(unicode(tag_name).encode('utf-8')) + buf.write(encode(tag_name)) state = 'get attr' elif state == 'get attr length': count = oc - 1 - if count <= 0 or count > (len(self.bin) - index): + if count <= 0 or count > (len(bin) - index): raise LitError('Invalid character count %d' % count) - self.buf.write(' ') + buf.write(' ') state = 'get custom attr' elif state == 'get custom attr': - self.buf.write(unicode(c).encode('utf-8')) + buf.write(encode(c)) count -= 1 if count == 0: - self.buf.write('=') + buf.write('=') state = 'get value length' elif state == 'get href length': count = oc - 1 - if count <= 0 or count > (len(self.bin) - index): + if count <= 0 or count > (len(bin) - index): raise LitError('Invalid character count %d' % count) href = '' state = 'get href' @@ -330,10 +337,11 @@ class UnBinary(object): if frag: path = '#'.join((path, frag)) path = urlnormalize(path) - self.buf.write((u'"%s"' % path).encode('utf-8')) + buf.write(encode(u'"%s"' % path)) state = 'get attr' return index + class DirectoryEntry(object): def __init__(self, name, section, offset, size): self.name = name @@ -348,6 +356,7 @@ class DirectoryEntry(object): def __str__(self): return repr(self) + class ManifestItem(object): def __init__(self, original, internal, mime_type, offset, root, state): self.original = original @@ -375,65 +384,87 @@ class ManifestItem(object): % (self.internal, self.path, self.mime_type, self.offset, self.root, self.state) + def preserve(function): def wrapper(self, *args, **kwargs): - opos = self._stream.tell() + opos = self.stream.tell() try: return function(self, *args, **kwargs) finally: - self._stream.seek(opos) + self.stream.seek(opos) functools.update_wrapper(wrapper, function) return wrapper -class LitReader(object): +class LitFile(object): PIECE_SIZE = 16 - XML_PARSER = etree.XMLParser( - recover=True, resolve_entities=False) + + def __init__(self, filename_or_stream): + if hasattr(filename_or_stream, 'read'): + self.stream = filename_or_stream + else: + self.stream = open(filename_or_stream, 'rb') + try: + self.opf_path = os.path.splitext( + os.path.basename(self.stream.name))[0] + '.opf' + except AttributeError: + self.opf_path = 'content.opf' + if self.magic != 'ITOLITLS': + raise LitError('Not a valid LIT file') + if self.version != 1: + raise LitError('Unknown LIT version %d' % (self.version,)) + self.read_secondary_header() + self.read_header_pieces() + self.read_section_names() + self.read_manifest() + self.read_drm() + + def warn(self, msg): + print "WARNING: %s" % (msg,) def magic(): @preserve def fget(self): - self._stream.seek(0) - return self._stream.read(8) + self.stream.seek(0) + return self.stream.read(8) return property(fget=fget) magic = magic() def version(): def fget(self): - self._stream.seek(8) - return u32(self._stream.read(4)) + self.stream.seek(8) + return u32(self.stream.read(4)) return property(fget=fget) version = version() def hdr_len(): @preserve def fget(self): - self._stream.seek(12) - return int32(self._stream.read(4)) + self.stream.seek(12) + return int32(self.stream.read(4)) return property(fget=fget) hdr_len = hdr_len() def num_pieces(): @preserve def fget(self): - self._stream.seek(16) - return int32(self._stream.read(4)) + self.stream.seek(16) + return int32(self.stream.read(4)) return property(fget=fget) num_pieces = num_pieces() def sec_hdr_len(): @preserve def fget(self): - self._stream.seek(20) - return int32(self._stream.read(4)) + self.stream.seek(20) + return int32(self.stream.read(4)) return property(fget=fget) sec_hdr_len = sec_hdr_len() def guid(): @preserve def fget(self): - self._stream.seek(24) - return self._stream.read(16) + self.stream.seek(24) + return self.stream.read(16) return property(fget=fget) guid = guid() @@ -443,44 +474,27 @@ class LitReader(object): size = self.hdr_len \ + (self.num_pieces * self.PIECE_SIZE) \ + self.sec_hdr_len - self._stream.seek(0) - return self._stream.read(size) + self.stream.seek(0) + return self.stream.read(size) return property(fget=fget) header = header() - def __init__(self, filename_or_stream): - if hasattr(filename_or_stream, 'read'): - self._stream = filename_or_stream - else: - self._stream = open(filename_or_stream, 'rb') - if self.magic != 'ITOLITLS': - raise LitError('Not a valid LIT file') - if self.version != 1: - raise LitError('Unknown LIT version %d' % (self.version,)) - self.entries = {} - self._read_secondary_header() - self._read_header_pieces() - self._read_section_names() - self._read_manifest() - self._read_meta() - self._read_drm() - @preserve def __len__(self): - self._stream.seek(0, 2) - return self._stream.tell() + self.stream.seek(0, 2) + return self.stream.tell() @preserve - def _read_raw(self, offset, size): - self._stream.seek(offset) - return self._stream.read(size) + def read_raw(self, offset, size): + self.stream.seek(offset) + return self.stream.read(size) - def _read_content(self, offset, size): - return self._read_raw(self.content_offset + offset, size) + def read_content(self, offset, size): + return self.read_raw(self.content_offset + offset, size) - def _read_secondary_header(self): + def read_secondary_header(self): offset = self.hdr_len + (self.num_pieces * self.PIECE_SIZE) - bytes = self._read_raw(offset, self.sec_hdr_len) + bytes = self.read_raw(offset, self.sec_hdr_len) offset = int32(bytes[4:]) while offset < len(bytes): blocktype = bytes[offset:offset+4] @@ -508,21 +522,21 @@ class LitReader(object): if not hasattr(self, 'content_offset'): raise LitError('Could not figure out the content offset') - def _read_header_pieces(self): + def read_header_pieces(self): src = self.header[self.hdr_len:] for i in xrange(self.num_pieces): piece = src[i * self.PIECE_SIZE:(i + 1) * self.PIECE_SIZE] if u32(piece[4:]) != 0 or u32(piece[12:]) != 0: raise LitError('Piece %s has 64bit value' % repr(piece)) offset, size = u32(piece), int32(piece[8:]) - piece = self._read_raw(offset, size) + piece = self.read_raw(offset, size) if i == 0: continue # Dont need this piece elif i == 1: if u32(piece[8:]) != self.entry_chunklen or \ u32(piece[12:]) != self.entry_unknown: raise LitError('Secondary header does not match piece') - self._read_directory(piece) + self.read_directory(piece) elif i == 2: if u32(piece[8:]) != self.count_chunklen or \ u32(piece[12:]) != self.count_unknown: @@ -533,12 +547,13 @@ class LitReader(object): elif i == 4: self.piece4_guid = piece - def _read_directory(self, piece): + def read_directory(self, piece): if not piece.startswith('IFCM'): raise LitError('Header piece #1 is not main directory.') chunk_size, num_chunks = int32(piece[8:12]), int32(piece[24:28]) if (32 + (num_chunks * chunk_size)) != len(piece): - raise LitError('IFCM HEADER has incorrect length') + raise LitError('IFCM header has incorrect length') + self.entries = {} for i in xrange(num_chunks): offset = 32 + (i * chunk_size) chunk = piece[offset:offset + chunk_size] @@ -572,17 +587,17 @@ class LitReader(object): entry = DirectoryEntry(name, section, offset, size) self.entries[name] = entry - def _read_section_names(self): + def read_section_names(self): if '::DataSpace/NameList' not in self.entries: raise LitError('Lit file does not have a valid NameList') raw = self.get_file('::DataSpace/NameList') if len(raw) < 4: raise LitError('Invalid Namelist section') pos = 4 - self.num_sections = u16(raw[2:pos]) - self.section_names = [""]*self.num_sections - self.section_data = [None]*self.num_sections - for section in xrange(self.num_sections): + num_sections = u16(raw[2:pos]) + self.section_names = [""] * num_sections + self.section_data = [None] * num_sections + for section in xrange(num_sections): size = u16(raw[pos:pos+2]) pos += 2 size = size*2 + 2 @@ -592,11 +607,12 @@ class LitReader(object): raw[pos:pos+size].decode('utf-16-le').rstrip('\000') pos += size - def _read_manifest(self): + def read_manifest(self): if '/manifest' not in self.entries: raise LitError('Lit file does not have a valid manifest') raw = self.get_file('/manifest') self.manifest = {} + self.paths = {self.opf_path: None} while raw: slen, raw = ord(raw[0]), raw[1:] if slen == 0: break @@ -637,28 +653,9 @@ class LitReader(object): for item in mlist: if item.path[0] == '/': item.path = os.path.basename(item.path) + self.paths[item.path] = item - def _pretty_print(self, xml): - f = cStringIO.StringIO(xml.encode('utf-8')) - doc = etree.parse(f, parser=self.XML_PARSER) - pretty = etree.tostring(doc, encoding='ascii', pretty_print=True) - return XML_DECL + unicode(pretty) - - def _read_meta(self): - path = 'content.opf' - raw = self.get_file('/meta') - xml = OPF_DECL - try: - xml += unicode(UnBinary(raw, path, self.manifest, OPF_MAP)) - except LitError: - if 'PENGUIN group' not in raw: raise - print "WARNING: attempting PENGUIN malformed OPF fix" - raw = raw.replace( - 'PENGUIN group', '\x00\x01\x18\x00PENGUIN group', 1) - xml += unicode(UnBinary(raw, path, self.manifest, OPF_MAP)) - self.meta = xml - - def _read_drm(self): + def read_drm(self): self.drmlevel = 0 if '/DRMStorage/Licenses/EUL' in self.entries: self.drmlevel = 5 @@ -669,7 +666,7 @@ class LitReader(object): else: return if self.drmlevel < 5: - msdes.deskey(self._calculate_deskey(), msdes.DE1) + msdes.deskey(self.calculate_deskey(), msdes.DE1) bookkey = msdes.des(self.get_file('/DRMStorage/DRMSealed')) if bookkey[0] != '\000': raise LitError('Unable to decrypt title key!') @@ -677,7 +674,7 @@ class LitReader(object): else: raise DRMError("Cannot access DRM-protected book") - def _calculate_deskey(self): + def calculate_deskey(self): hashfiles = ['/meta', '/DRMStorage/DRMSource'] if self.drmlevel == 3: hashfiles.append('/DRMStorage/DRMBookplate') @@ -701,18 +698,18 @@ class LitReader(object): def get_file(self, name): entry = self.entries[name] if entry.section == 0: - return self._read_content(entry.offset, entry.size) + return self.read_content(entry.offset, entry.size) section = self.get_section(entry.section) return section[entry.offset:entry.offset+entry.size] def get_section(self, section): data = self.section_data[section] if not data: - data = self._get_section(section) + data = self.get_section_uncached(section) self.section_data[section] = data return data - def _get_section(self, section): + def get_section_uncached(self, section): name = self.section_names[section] path = '::DataSpace/Storage/' + name transform = self.get_file(path + '/Transform/List') @@ -724,29 +721,29 @@ class LitReader(object): raise LitError("ControlData is too short") guid = msguid(transform) if guid == DESENCRYPT_GUID: - content = self._decrypt(content) + content = self.decrypt(content) control = control[csize:] elif guid == LZXCOMPRESS_GUID: reset_table = self.get_file( '/'.join(('::DataSpace/Storage', name, 'Transform', LZXCOMPRESS_GUID, 'InstanceData/ResetTable'))) - content = self._decompress(content, control, reset_table) + content = self.decompress(content, control, reset_table) control = control[csize:] else: raise LitError("Unrecognized transform: %s." % repr(guid)) transform = transform[16:] return content - def _decrypt(self, content): + def decrypt(self, content): length = len(content) extra = length & 0x7 if extra > 0: - self._warn("content length not a multiple of block size") + self.warn("content length not a multiple of block size") content += "\0" * (8 - extra) msdes.deskey(self.bookkey, msdes.DE1) return msdes.des(content) - def _decompress(self, content, control, reset_table): + def decompress(self, content, control, reset_table): if len(control) < 32 or control[CONTROL_TAG:CONTROL_TAG+4] != "LZXC": raise LitError("Invalid ControlData tag value") if len(reset_table) < (RESET_INTERVAL + 8): @@ -787,7 +784,7 @@ class LitReader(object): result.append( lzx.decompress(content[base:size], window_bytes)) except lzx.LZXError: - self._warn("LZX decompression error; skipping chunk") + self.warn("LZX decompression error; skipping chunk") bytes_remaining -= window_bytes base = size accum += int32(reset_table[RESET_INTERVAL:]) @@ -797,80 +794,56 @@ class LitReader(object): try: result.append(lzx.decompress(content[base:], bytes_remaining)) except lzx.LZXError: - self._warn("LZX decompression error; skipping chunk") + self.warn("LZX decompression error; skipping chunk") bytes_remaining = 0 if bytes_remaining > 0: raise LitError("Failed to completely decompress section") return ''.join(result) - def get_entry_content(self, entry, pretty_print=False): - if 'spine' in entry.state: - name = '/'.join(('/data', entry.internal, 'content')) - path = entry.path - raw = self.get_file(name) - decl, map = (OPF_DECL, OPF_MAP) \ - if name == '/meta' else (HTML_DECL, HTML_MAP) - content = decl + unicode(UnBinary(raw, path, self.manifest, map)) - if pretty_print: - content = self._pretty_print(content) - content = content.encode('utf-8') + +class LitContainer(object): + + def __init__(self, filename_or_stream): + self._litfile = LitFile(filename_or_stream) + + def namelist(self): + return self._litfile.paths.keys() + + def exists(self, name): + return urlunquote(name) in self._litfile.paths + + def read(self, name): + entry = self._litfile.paths[urlunquote(name)] if name else None + if entry is None: + content = OPF_DECL + self._read_meta() + elif 'spine' in entry.state: + internal = '/'.join(('/data', entry.internal, 'content')) + raw = self._litfile.get_file(internal) + unbin = UnBinary(raw, name, self._litfile.manifest, HTML_MAP) + content = HTML_DECL + str(unbin) else: - name = '/'.join(('/data', entry.internal)) - content = self.get_file(name) + internal = '/'.join(('/data', entry.internal)) + content = self._litfile.get_file(internal) return content - - def extract_content(self, output_dir=os.getcwdu(), pretty_print=False): - output_dir = os.path.abspath(output_dir) + + def _read_meta(self): + path = 'content.opf' + raw = self._litfile.get_file('/meta') try: - opf_path = os.path.splitext( - os.path.basename(self._stream.name))[0] + '.opf' - except AttributeError: - opf_path = 'content.opf' - opf_path = os.path.join(output_dir, opf_path) - self._ensure_dir(opf_path) - with open(opf_path, 'wb') as f: - xml = self.meta - if pretty_print: - xml = self._pretty_print(xml) - f.write(xml.encode('utf-8')) - for entry in self.manifest.values(): - path = os.path.join(output_dir, entry.path) - self._ensure_dir(path) - with open(path, 'wb') as f: - f.write(self.get_entry_content(entry, pretty_print)) + unbin = UnBinary(raw, path, self._litfile.manifest, OPF_MAP) + except LitError: + if 'PENGUIN group' not in raw: raise + print "WARNING: attempting PENGUIN malformed OPF fix" + raw = raw.replace( + 'PENGUIN group', '\x00\x01\x18\x00PENGUIN group', 1) + unbin = UnBinary(raw, path, self._litfile.manifest, OPF_MAP) + return str(unbin) - def _ensure_dir(self, path): - dir = os.path.dirname(path) - if not os.path.isdir(dir): - os.makedirs(dir) - def _warn(self, msg): - print "WARNING: %s" % (msg,) +class LitReader(OEBReader): + Container = LitContainer + DEFAULT_PROFILE = 'MSReader' -def option_parser(): - from calibre.utils.config import OptionParser - parser = OptionParser(usage=_('%prog [options] LITFILE')) - parser.add_option( - '-o', '--output-dir', default='.', - help=_('Output directory. Defaults to current directory.')) - parser.add_option( - '-p', '--pretty-print', default=False, action='store_true', - help=_('Legibly format extracted markup. May modify meaningful whitespace.')) - parser.add_option( - '--verbose', default=False, action='store_true', - help=_('Useful for debugging.')) - return parser - -def main(args=sys.argv): - parser = option_parser() - opts, args = parser.parse_args(args) - if len(args) != 2: - parser.print_help() - return 1 - lr = LitReader(args[1]) - lr.extract_content(opts.output_dir, opts.pretty_print) - print _('OEB ebook created in'), opts.output_dir - return 0 try: import psyco @@ -878,6 +851,3 @@ try: psyco.bind(UnBinary.binary_to_text) except ImportError: pass - -if __name__ == '__main__': - sys.exit(main()) diff --git a/src/calibre/ebooks/oeb/factory.py b/src/calibre/ebooks/oeb/factory.py index dcb0942e85..1ce33a4f00 100644 --- a/src/calibre/ebooks/oeb/factory.py +++ b/src/calibre/ebooks/oeb/factory.py @@ -1,3 +1,6 @@ +''' +Registry associating file extensions with Reader classes. +''' from __future__ import with_statement __license__ = 'GPL v3' @@ -6,11 +9,13 @@ __copyright__ = '2008, Marshall T. Vandegrift ' import os from calibre.ebooks.oeb.base import OEBError from calibre.ebooks.oeb.reader import OEBReader +from calibre.ebooks.lit.reader import LitReader __all__ = ['get_reader'] READER_REGISTRY = { '.opf': OEBReader, + '.lit': LitReader, } def ReaderFactory(path): diff --git a/src/calibre/ebooks/oeb/reader.py b/src/calibre/ebooks/oeb/reader.py index 2d22ff0cd2..aa23ce1e96 100644 --- a/src/calibre/ebooks/oeb/reader.py +++ b/src/calibre/ebooks/oeb/reader.py @@ -1,3 +1,6 @@ +""" +Container-/OPF-based input OEBBook reader. +""" from __future__ import with_statement __license__ = 'GPL v3' diff --git a/src/calibre/ebooks/oeb/writer.py b/src/calibre/ebooks/oeb/writer.py index e55db670d6..c84db30c98 100644 --- a/src/calibre/ebooks/oeb/writer.py +++ b/src/calibre/ebooks/oeb/writer.py @@ -1,3 +1,6 @@ +''' +Directory output OEBBook writer. +''' from __future__ import with_statement __license__ = 'GPL v3'