diff --git a/setup.py b/setup.py
index 1586114c89..76ee166d4f 100644
--- a/setup.py
+++ b/setup.py
@@ -24,6 +24,7 @@ entry_points = {
                              'lrf-meta = libprs500.ebooks.lrf.meta:main',
                              'rtf-meta = libprs500.ebooks.metadata.rtf:main',
                              'pdf-meta = libprs500.ebooks.metadata.pdf:main',
+                             'lit-meta = libprs500.ebooks.metadata.lit:main',
                              'txt2lrf = libprs500.ebooks.lrf.txt.convert_from:main',
                              'html2lrf = libprs500.ebooks.lrf.html.convert_from:main',
                              'markdown-libprs500 = libprs500.ebooks.markdown.markdown:main',
diff --git a/src/libprs500/ebooks/metadata/lit.py b/src/libprs500/ebooks/metadata/lit.py
new file mode 100644
index 0000000000..00ce3a5054
--- /dev/null
+++ b/src/libprs500/ebooks/metadata/lit.py
@@ -0,0 +1,835 @@
+## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2 of the License, or
+## (at your option) any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License along
+## with this program; if not, write to the Free Software Foundation, Inc.,
+## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+'''
+Support for reading the metadata from a lit file.
+'''
+
+import sys, struct, cStringIO, os
+
+from libprs500.ebooks.metadata import MetaInformation
+from libprs500.ebooks.metadata.opf import OPFReader
+
+# Attribute names of the binary OPF markup, indexed by attribute code.
+# Names that start with '%' are never written to the output.
+OPF_ATTR_MAP = [
+    None,
+    "href",
+    "%never-used",
+    "%guid",
+    "%minimum_level",
+    "%attr5",
+    "id",
+    "href",
+    "media-type",
+    "fallback",
+    "idref",
+    "xmlns:dc",
+    "xmlns:oebpackage",
+    "role",
+    "file-as",
+    "event",
+    "scheme",
+    "title",
+    "type",
+    "unique-identifier",
+    "name",
+    "content",
+    "xml:lang",
+    ]
+
+# Tag names of the binary OPF markup, indexed by tag code.
+OPF_TAG_MAP = [
+    None,
+    "package",
+    "dc:Title",
+    "dc:Creator",
+    None,
+    None,
+    None,
+    None,
+    None,
+    None,
+    None,
+    None,
+    None,
+    None,
+    None,
+    None,
+    "manifest",
+    "item",
+    "spine",
+    "itemref",
+    "metadata",
+    "dc-metadata",
+    "dc:Subject",
+    "dc:Description",
+    "dc:Publisher",
+    "dc:Contributor",
+    "dc:Date",
+    "dc:Type",
+    "dc:Format",
+    "dc:Identifier",
+    "dc:Source",
+    "dc:Language",
+    "dc:Relation",
+    "dc:Coverage",
+    "dc:Rights",
+    "x-metadata",
+    "meta",
+    "tours",
+    "tour",
+    "site",
+    "guide",
+    "reference",
+    None,
+    ]
+
+class DirectoryEntry(object):
+    '''An entry of the directory of a LIT file.'''
+    def __init__(self, name, section, offset, size):
+        self.name = name
+        self.section = section
+        self.offset = offset
+        self.size = size
+
+    def __str__(self):
+        return '%s\n\tSection: %d\n\tOffset: %d\n\tSize: %d'%(self.name,
+                                    self.section, self.offset, self.size)
+
+class LitReadError(Exception):
+    '''Raised when a LIT file is malformed or uses an unsupported feature.'''
+    pass
+
+def u32(bytes):
+    '''Decode the first 4 bytes of `bytes` as a little-endian unsigned integer.'''
+    b = struct.unpack('BBBB', bytes[:4])
+    # The most significant byte belongs at bit 24 (a shift by 32 misplaces it).
+    return b[0] + (b[1] << 8) + (b[2] << 16) + (b[3] << 24)
+
+def u16(bytes):
+    '''Decode the first 2 bytes of `bytes` as a little-endian unsigned integer.'''
+    b = struct.unpack('BB', bytes[:2])
+    return b[0] + (b[1] << 8)
+
+def int32(bytes):
+    '''Decode like `u32`, then clear the top bit of the result.'''
+    return u32(bytes)&0x7FFFFFFF
+
+def encint(bytes, remaining):
+    '''
+    Decode a variable length integer: 7 bits per byte, most significant
+    group first; a byte with a clear high bit ends the number. At most
+    `remaining` bytes are consumed.
+
+    :return: `(value, unconsumed bytes, remaining minus bytes consumed)`
+    '''
+    pos, val = 0, 0
+    while remaining > 0:
+        b = ord(bytes[pos])
+        pos += 1
+        remaining -= 1
+        val <<= 7
+        val |= (b & 0x7f)
+        if b & 0x80 == 0: break
+    return val, bytes[pos:], remaining
+
+def read_utf8_char(bytes, pos):
+    '''
+    Decode the UTF-8 encoded character that starts at `bytes[pos]`.
+
+    :return: `(character, index of the byte after the character)`
+    :raise LitReadError: if the bytes are not a valid UTF-8 sequence
+    '''
+    c = ord(bytes[pos])
+    mask = 0x80
+    if (c & mask):
+        elsize = 0
+        while c & mask:
+            mask >>= 1
+            elsize += 1
+        # mask == 0x40: a continuation byte where a lead byte is expected;
+        # mask <= 1: more leading one bits than any lead byte may have.
+        if (mask <= 1) or (mask == 0x40):
+            raise LitReadError('Invalid UTF8 character: %s'%(repr(bytes[pos])))
+    else:
+        elsize = 1
+
+
+    if elsize > 1:
+        if elsize + pos > len(bytes):
+            raise LitReadError('Invalid UTF8 character: %s'%(repr(bytes[pos])))
+        c &= (mask - 1)
+        for i in range(1, elsize):
+            b = ord(bytes[pos+i])
+            if (b & 0xC0) != 0x80:
+                raise LitReadError('Invalid UTF8 character: %s'%(repr(bytes[pos:pos+i])))
+            c = (c << 6) | (b & 0x3F)
+    return unichr(c), pos+elsize
+
+# Bits of the flags character that precedes every tag in the binary markup.
+FLAG_OPENING = 1
+FLAG_CLOSING = 2
+FLAG_BLOCK = 4
+FLAG_HEAD = 8
+FLAG_ATOM = 16
+
+class UnBinary(object):
+    '''
+    Converts the binary markup stored in a LIT file into text markup. The
+    conversion runs on construction and the result is left in `self.buf`.
+    '''
+    pending_indent = 0
+    lingering_space = 0
+    was_in_text = 0
+
+    def __init__(self, bin, manifest, attr_map=OPF_ATTR_MAP, tag_map=OPF_TAG_MAP,
+                 tag_to_attr_map=[[] for i in range(43)]):
+        '''
+        :param bin: the binary markup, a byte string
+        :param manifest: sequence of `ManifestItem`, used to resolve hrefs
+        :param attr_map: attribute names indexed by attribute code
+        :param tag_map: tag names indexed by tag code
+        :param tag_to_attr_map: per tag attribute names, indexed by tag code
+        '''
+        # The buffer must be created per instance: as a class attribute it is
+        # shared, so every conversion would append to the previous output.
+        self.buf = cStringIO.StringIO()
+        self.manifest = manifest
+        self.attr_map = attr_map
+        self.tag_map = tag_map
+        self.tag_to_attr_map = tag_to_attr_map
+        self.opf = self.attr_map is OPF_ATTR_MAP
+        self.bin = bin
+        self.binary_to_text()
+
+    def write_spaces(self, depth):
+        '''Write the indentation for nesting level `depth` (depth-1 spaces).'''
+        self.buf.write(u' '.join(u'' for i in range(depth)))
+
+
+    def item_path(self, internal_id):
+        '''
+        Return the path of the manifest item with the id `internal_id`.
+
+        :raise LitReadError: if the manifest has no such item
+        '''
+        for i in self.manifest:
+            if i == internal_id:
+                return i.path
+        raise LitReadError('Could not find item %s'%(internal_id,))
+
+    def binary_to_text(self, base=0, depth=0):
+        '''
+        Decode `self.bin` from index `base` on and write the markup to
+        `self.buf`. Recurses once for the content of every non-empty tag.
+
+        :param base: index into `self.bin` at which decoding starts
+        :param depth: nesting level of the markup being decoded
+        :return: index of the first character that has not been consumed
+        :raise LitReadError: if the binary markup is malformed
+        '''
+        space_enabled, saved_space_enabled = 1, 0
+        was_indented, is_goingdown = 0, 0
+        tag_name = current_map = None
+        dynamic_tag = errors = in_censorship = 0
+
+        state = 'text'
+        index = base
+        flags = 0
+
+        while index < len(self.bin):
+            c, index = read_utf8_char(self.bin, index)
+            if state == 'text':
+                if ord(c) == 0:
+                    state = 'get flags'
+                    continue
+                if (not self.was_in_text) or space_enabled:
+                    space_enabled = 0;
+                    if c in (' ', '\t', '\n', '\r'):
+                        space_enabled += 1
+                    else:
+                        self.was_in_text = 1
+                if c == '\v':
+                    c = '\n'
+                # The indentation state lives on self, it spans recursive calls.
+                self.pending_indent = 0
+                self.buf.write(c)
+            elif state == 'get flags':
+                if ord(c) == 0:
+                    state = 'text'
+                    continue
+                flags = ord(c)
+                state = 'get tag'
+            elif state == 'get tag':
+                state = 'text' if ord(c) == 0 else 'get attr'
+                if flags & FLAG_OPENING:
+                    if space_enabled and ((not self.was_in_text) or (flags &(FLAG_BLOCK|FLAG_HEAD))):
+                        self.pending_indent += 1
+                    if self.pending_indent or self.opf:
+                        was_indented += 1
+                        self.buf.write(u'\n')
+                        self.write_spaces(depth)
+                        self.pending_indent = 0
+                    if (flags & FLAG_HEAD) or (flags & FLAG_BLOCK) or \
+                        self.opf or depth == 0:
+                        self.pending_indent = 1
+                    tag = ord(c)
+                    self.buf.write('<')
+                    if not (flags & FLAG_CLOSING):
+                        is_goingdown = 1
+                    if tag == 0x8000:
+                        state = 'get custom length'
+                        continue
+                    if flags & FLAG_ATOM:
+                        raise LitReadError('TODO: Atoms not yet implemented')
+                    elif tag < len(self.tag_map):
+                        tag_name = self.tag_map[tag]
+                        current_map = self.tag_to_attr_map[tag]
+                    else:
+                        dynamic_tag += 1
+                        errors += 1
+                        tag_name = '?'+unichr(tag)+'?'
+                        # An unknown tag may lie beyond the per tag maps.
+                        if tag < len(self.tag_to_attr_map):
+                            current_map = self.tag_to_attr_map[tag]
+                        else:
+                            current_map = []
+                        print 'WARNING: tag %s unknown'%(unichr(tag),)
+
+                    self.buf.write(unicode(tag_name))
+                elif flags & FLAG_CLOSING:
+                    if depth == 0:
+                        raise LitReadError('Extra closing tag')
+                    self.lingering_space = space_enabled
+                    return index
+            elif state == 'get attr':
+                in_censorship = 0
+                if ord(c) == 0:
+                    if not is_goingdown:
+                        tag_name = None
+                        dynamic_tag = 0
+                        self.buf.write(u' />')
+                    else:
+                        self.buf.write(u'>')
+                        if not self.opf and (flags & (FLAG_BLOCK|FLAG_HEAD)):
+                            self.pending_indent += 1
+                        index = self.binary_to_text(base=index, depth=depth+1)
+                        is_goingdown = 0
+                        if not tag_name:
+                            raise LitReadError('Tag ends before it begins.')
+                        saved_space_enabled = space_enabled
+                        space_enabled = self.lingering_space
+                        if space_enabled and was_indented and not self.was_in_text:
+                            self.buf.write(u'\n')
+                            self.write_spaces(depth)
+                        # The content of the tag is complete, close the tag.
+                        self.buf.write(u'</' + unicode(tag_name) + u'>')
+                        if (space_enabled and self.opf) or (flags & (FLAG_BLOCK|FLAG_HEAD)):
+                            self.pending_indent += 1
+                        dynamic_tag = 0
+                        tag_name = None
+                        space_enabled = saved_space_enabled
+
+                    self.was_in_text = 0
+                    state = 'text'
+                else:
+                    if ord(c) == 0x8000:
+                        state = 'get attr length'
+                        continue
+                    attr = None
+                    if current_map and ord(c) < len(current_map) and current_map[ord(c)]:
+                        attr = current_map[ord(c)]
+                    elif ord(c) < len(self.attr_map):
+                        attr = self.attr_map[ord(c)]
+
+                    if not attr or not isinstance(attr, basestring):
+                        raise LitReadError('Unknown attribute %d in tag %s'%(ord(c), tag_name))
+
+                    if attr.startswith('%'):
+                        in_censorship = 1
+                        state = 'get value length'
+                        continue
+
+                    self.buf.write(u' ' + unicode(attr) + u'=')
+                    if attr in ['href', 'src']:
+                        state = 'get href'
+                    else:
+                        state = 'get value length'
+            elif state == 'get value length':
+                if not in_censorship:
+                    self.buf.write(u'"')
+                char_count = ord(c) - 1
+                if not char_count:
+                    if not in_censorship:
+                        self.buf.write(u'"')
+                    in_censorship = 0
+                    state = 'get attr'
+                    # An empty value is complete, do not fall into 'get value'.
+                    continue
+                state = 'get value'
+                if ord(c) == 0xffff:
+                    continue
+                if char_count < 0 or char_count > len(self.bin)-index:
+                    raise LitReadError('Invalid character count %d'%(char_count,))
+            elif state == 'get value':
+                if char_count == 0xfffe:
+                    if not in_censorship:
+                        # Numeric value; also close the quote opened above.
+                        self.buf.write(unicode(ord(c)-1) + u'"')
+                    in_censorship = 0
+                    state = 'get attr'
+                elif char_count:
+                    if not in_censorship:
+                        self.buf.write(c)
+                    char_count -= 1
+                    if not char_count:
+                        if not in_censorship:
+                            self.buf.write('"')
+                        in_censorship = 0
+                        state = 'get attr'
+            elif state == 'get custom length':
+                char_count = ord(c) - 1
+                if char_count <= 0 or char_count > len(self.bin)-index:
+                    raise LitReadError('Invalid character count %d'%(char_count,))
+                dynamic_tag += 1
+                state = 'get custom'
+                tag_name = ''
+            elif state == 'get custom':
+                tag_name += c
+                char_count -= 1
+                if not char_count:
+                    self.buf.write(tag_name)
+                    state = 'get attr'
+            elif state == 'get attr length':
+                char_count = ord(c) - 1
+                if char_count <= 0 or char_count > len(self.bin)-index:
+                    raise LitReadError('Invalid character count %d'%(char_count,))
+                self.buf.write(u' ')
+                state = 'get custom attr'
+            elif state == 'get custom attr':
+                self.buf.write(c)
+                char_count -= 1
+                if not char_count:
+                    self.buf.write(u'=')
+                    state = 'get value length'
+            elif state == 'get href':
+                char_count = ord(c) - 1
+                if char_count <= 0:
+                    raise LitReadError('Invalid character count %d'%(char_count,))
+                href = self.bin[index+1:index+char_count].decode('ascii')
+                index += char_count
+                doc, m, frag = href.partition('#')
+                path = self.item_path(doc)
+                if m and frag:
+                    path += m+frag
+                self.buf.write(u'"%s"'%(path,))
+                state = 'get attr'
+
+        self.lingering_space = space_enabled
+        return index
+
+class ManifestItem(object):
+    '''A file listed in the manifest of a LIT file.'''
+
+    def __init__(self, original, internal, mime_type, offset, root, state):
+        self.original = original
+        self.internal = internal
+        self.mime_type = mime_type
+        self.offset = offset
+        self.root = root
+        self.state = state
+        self.prefix = 'images' if state == 'images' else 'css' if state == 'css' else ''
+        self.prefix = self.prefix + os.sep if self.prefix else ''
+        self.path = self.prefix + self.original
+
+    def __eq__(self, other):
+        # Items can be compared with other items or with a bare internal id.
+        if hasattr(other, 'internal'):
+            return self.internal == other.internal
+        return self.internal == other
+
+    def __repr__(self):
+        return self.internal + u'->' + self.path
+
+class LitFile(object):
+    '''
+    Reader for the container format of LIT files. Creating an instance parses
+    the headers and the directory of the file.
+    '''
+
+    PIECE_SIZE = 16
+
+    # Fields of the primary header. Every getter reads its field at a fixed
+    # offset and then restores the position of the stream.
+    @apply
+    def magic():
+        def fget(self):
+            opos = self._stream.tell()
+            self._stream.seek(0)
+            val = self._stream.read(8)
+            self._stream.seek(opos)
+            return val
+        return property(fget=fget)
+
+    @apply
+    def version():
+        def fget(self):
+            opos = self._stream.tell()
+            self._stream.seek(8)
+            val = u32(self._stream.read(4))
+            self._stream.seek(opos)
+            return val
+        return property(fget=fget)
+
+    @apply
+    def hdr_len():
+        def fget(self):
+            opos = self._stream.tell()
+            self._stream.seek(12)
+            val = int32(self._stream.read(4))
+            self._stream.seek(opos)
+            return val
+        return property(fget=fget)
+
+    @apply
+    def num_pieces():
+        def fget(self):
+            opos = self._stream.tell()
+            self._stream.seek(16)
+            val = int32(self._stream.read(4))
+            self._stream.seek(opos)
+            return val
+        return property(fget=fget)
+
+    @apply
+    def sec_hdr_len():
+        def fget(self):
+            opos = self._stream.tell()
+            self._stream.seek(20)
+            val = int32(self._stream.read(4))
+            self._stream.seek(opos)
+            return val
+        return property(fget=fget)
+
+    @apply
+    def guid():
+        def fget(self):
+            opos = self._stream.tell()
+            self._stream.seek(24)
+            val = self._stream.read(16)
+            self._stream.seek(opos)
+            return val
+        return property(fget=fget)
+
+    @apply
+    def header():
+        def fget(self):
+            opos = self._stream.tell()
+            size = self.hdr_len + self.num_pieces*self.PIECE_SIZE + self.sec_hdr_len
+            self._stream.seek(0)
+            val = self._stream.read(size)
+            self._stream.seek(opos)
+            return val
+        return property(fget=fget)
+
+    def __init__(self, stream):
+        '''
+        :param stream: seekable file-like object that contains the LIT file
+        :raise LitReadError: if the stream is not a supported LIT file
+        '''
+        self._stream = stream
+        if self.magic != 'ITOLITLS':
+            raise LitReadError('Not a valid LIT file')
+        if self.version != 1:
+            raise LitReadError('Unknown LIT version %d'%(self.version,))
+        self.read_secondary_header()
+        self.read_header_pieces()
+
+
+    def read_secondary_header(self):
+        '''
+        Parse the CAOL and ITSF blocks of the secondary header. The ITSF
+        block provides `self.content_offset`.
+        '''
+        opos = self._stream.tell()
+        try:
+            self._stream.seek(self.hdr_len + self.num_pieces*self.PIECE_SIZE)
+            bytes = self._stream.read(self.sec_hdr_len)
+            offset = int32(bytes[4:])
+
+            while offset < len(bytes):
+                blocktype = bytes[offset:offset+4]
+                blockver = u32(bytes[offset+4:])
+
+                if blocktype == 'CAOL':
+                    if blockver != 2:
+                        raise LitReadError('Unknown CAOL block format %d'%(blockver,))
+                    self.creator_id = u32(bytes[offset+12:])
+                    self.entry_chunklen = u32(bytes[offset+20:])
+                    self.count_chunklen = u32(bytes[offset+24:])
+                    self.entry_unknown = u32(bytes[offset+28:])
+                    self.count_unknown = u32(bytes[offset+32:])
+                    offset += 48
+                elif blocktype == 'ITSF':
+                    if blockver != 4:
+                        raise LitReadError('Unknown ITSF block format %d'%(blockver,))
+                    if u32(bytes[offset+4+16:]):
+                        raise LitReadError('This file has a 64bit content offset')
+                    self.content_offset = u32(bytes[offset+16:])
+                    self.timestamp = u32(bytes[offset+24:])
+                    self.language_id = u32(bytes[offset+28:])
+                    offset += 48
+                else:
+                    # The offset would never advance past an unknown block.
+                    raise LitReadError('Unknown block type %s'%(repr(blocktype),))
+
+            if not hasattr(self, 'content_offset'):
+                raise LitReadError('Could not figure out the content offset')
+        finally:
+            self._stream.seek(opos)
+
+    def read_header_pieces(self):
+        '''
+        Read the pieces that are listed after the primary header. Piece 1,
+        the directory, is parsed with `read_directory`; pieces 3 and 4 are
+        stored as `self.piece3_guid` and `self.piece4_guid`.
+        '''
+        opos = self._stream.tell()
+        try:
+            src = self.header[self.hdr_len:]
+            for i in range(self.num_pieces):
+                piece = src[i*self.PIECE_SIZE:(i+1)*self.PIECE_SIZE]
+                if u32(piece[4:]) != 0 or u32(piece[12:]) != 0:
+                    raise LitReadError('Piece %s has 64bit value'%(repr(piece),))
+                offset, size = u32(piece), int32(piece[8:])
+                self._stream.seek(offset)
+                piece = self._stream.read(size)
+                if i == 0:
+                    continue # Don't need this piece
+                elif i == 1:
+                    if u32(piece[8:]) != self.entry_chunklen or \
+                       u32(piece[12:]) != self.entry_unknown:
+                        raise LitReadError('Secondary header does not match piece')
+                    self.read_directory(piece)
+                elif i == 2:
+                    if u32(piece[8:]) != self.count_chunklen or \
+                       u32(piece[12:]) != self.count_unknown:
+                        raise LitReadError('Secondary header does not match piece')
+                    continue # No data needed from this piece
+                elif i == 3:
+                    self.piece3_guid = piece
+                elif i == 4:
+                    self.piece4_guid = piece
+        finally:
+            self._stream.seek(opos)
+
+    def read_directory(self, piece):
+        '''
+        Parse the directory, an IFCM piece made of AOLL chunks, into the list
+        `self.entries`. The section names, the manifest and the metadata are
+        read as soon as their directory entries are found.
+
+        :param piece: the raw bytes of header piece 1
+        :raise LitReadError: if the directory is malformed or incomplete
+        '''
+        self.entries = []
+        if not piece.startswith('IFCM'):
+            raise LitReadError('Header piece #1 is not main directory.')
+        chunk_size, num_chunks = int32(piece[8:12]), int32(piece[24:28])
+
+        if 32 + chunk_size*num_chunks != len(piece):
+            raise LitReadError('IFCM HEADER has incorrect length')
+
+        for chunk in range(num_chunks):
+            p = 32 + chunk*chunk_size
+            if piece[p:p+4] != 'AOLL':
+                continue
+            remaining = chunk_size - int32(piece[p+4:p+8]) - 48
+            if remaining < 0:
+                raise LitReadError('AOLL remaining count is negative')
+
+            entries = u16(piece[p+chunk_size-2:])
+
+            if entries <= 0: # Hopefully everything will work even without a correct entries count
+                entries = (2**16)-1
+
+            # Consume a copy: `piece` itself must stay intact, the offsets
+            # of the following chunks are relative to its start.
+            data = piece[p+48:]
+            i = 0
+            while i < entries:
+                if remaining <= 0: break
+                namelen, data, remaining = encint(data, remaining)
+                if namelen != (namelen & 0x7fffffff):
+                    raise LitReadError('Directory entry had 64bit name length.')
+                if namelen > remaining - 3:
+                    raise LitReadError('Read past end of directory chunk')
+                name = data[:namelen]
+                data = data[namelen:]
+                remaining -= namelen
+                section, data, remaining = encint(data, remaining)
+                offset, data, remaining = encint(data, remaining)
+                size, data, remaining = encint(data, remaining)
+
+                entry = DirectoryEntry(name, section, offset, size)
+                if name == '::DataSpace/NameList':
+                    self.read_section_names(entry)
+                elif name == '/manifest':
+                    self.read_manifest(entry)
+                elif name == '/meta':
+                    self.read_meta(entry)
+                self.entries.append(entry)
+                i += 1
+
+        if not hasattr(self, 'sections'):
+            raise LitReadError('Lit file does not have a valid NameList')
+
+        if not hasattr(self, 'manifest'):
+            raise LitReadError('Lit file does not have a valid manifest')
+
+    def read_section_names(self, entry):
+        '''
+        Read the section names into the dictionary `self.sections`, keyed by
+        section number.
+
+        :param entry: the `DirectoryEntry` of '::DataSpace/NameList'
+        '''
+        opos = self._stream.tell()
+        try:
+            self._stream.seek(self.content_offset + entry.offset)
+            raw = self._stream.read(entry.size)
+            if len(raw) < 4:
+                raise LitReadError('Invalid Namelist section')
+            pos = 4
+            self.num_sections = u16(raw[2:pos])
+
+            self.sections = {}
+            for section in range(self.num_sections):
+                size = u16(raw[pos:pos+2])
+                pos += 2
+                size = size*2 + 2
+                if pos + size > len(raw):
+                    raise LitReadError('Invalid Namelist section')
+                self.sections[section] = raw[pos:pos+size].decode('utf-16-le')
+                pos += size
+        finally:
+            self._stream.seek(opos)
+
+    def read_manifest(self, entry):
+        '''
+        Read the manifest into `self.manifest`, a list of `ManifestItem`.
+
+        :param entry: the `DirectoryEntry` of '/manifest'
+        :raise LitReadError: if the manifest is truncated
+        '''
+        opos = self._stream.tell()
+        try:
+            self.manifest = []
+            self._stream.seek(self.content_offset + entry.offset)
+            raw = self._stream.read(entry.size)
+            pos = 0
+            while pos < len(raw):
+                size = ord(raw[pos])
+                if size == 0: break
+                pos += 1
+                root = raw[pos:pos+size].decode('utf8')
+                pos += size
+                if pos >= len(raw):
+                    raise LitReadError('Truncated manifest.')
+                for state in ['spine', 'not spine', 'css', 'images']:
+                    num_files = int32(raw[pos:pos+4])
+                    pos += 4
+                    if num_files == 0: continue
+
+                    i = 0
+                    while i < num_files:
+                        if pos+5 >= len(raw):
+                            raise LitReadError('Truncated manifest.')
+                        offset = u32(raw[pos:pos+4])
+                        pos += 4
+
+                        slen = ord(raw[pos])
+                        pos += 1
+                        internal = raw[pos:pos+slen].decode('utf8')
+                        pos += slen
+
+                        slen = ord(raw[pos])
+                        pos += 1
+                        original = raw[pos:pos+slen].decode('utf8')
+                        pos += slen
+
+                        slen = ord(raw[pos])
+                        pos += 1
+                        mime_type = raw[pos:pos+slen].decode('utf8')
+                        pos += slen +1
+
+                        self.manifest.append(ManifestItem(original, internal, mime_type, offset, root, state))
+                        i += 1
+        finally:
+            self._stream.seek(opos)
+
+    def read_meta(self, entry):
+        '''
+        Convert the binary metadata to markup and store it in `self.meta`.
+
+        :param entry: the `DirectoryEntry` of '/meta'
+        '''
+        opos = self._stream.tell()
+        try:
+            self._stream.seek(self.content_offset + entry.offset)
+            raw = self._stream.read(entry.size)
+            xml = \
+'''
+
+
+'''+\
+            UnBinary(raw, self.manifest).buf.getvalue().lstrip()
+            self.meta = xml
+        finally:
+            self._stream.seek(opos)
+
+def get_metadata(stream):
+    '''
+    Return the metadata of the LIT file in `stream`. If the file cannot be
+    read, fall back to a title taken from the name of the stream.
+
+    :param stream: seekable file-like object that contains the LIT file
+    :return: an `OPFReader` on success, otherwise a plain `MetaInformation`
+    '''
+    try:
+        litfile = LitFile(stream)
+        mi = OPFReader(cStringIO.StringIO(litfile.meta))
+    except Exception, err:
+        print >>sys.stderr, 'WARNING: could not read LIT metadata: %s'%(err,)
+        title = stream.name if hasattr(stream, 'name') and stream.name else 'Unknown'
+        mi = MetaInformation(title, ['Unknown'])
+    return mi
+
+
+
+def main(args=sys.argv):
+    '''Print the metadata of the LIT file named on the command line.'''
+    if len(args) != 2:
+        print >>sys.stderr, 'Usage: %s file.lit'%(args[0],)
+        return 1
+    stream = open(args[1], 'rb')
+    try:
+        print get_metadata(stream)
+    finally:
+        stream.close()
+    return 0
+
+if __name__ == '__main__':
+    sys.exit(main())
\ No newline at end of file
diff --git a/src/libprs500/ebooks/metadata/meta.py b/src/libprs500/ebooks/metadata/meta.py
index ef5ad5d23d..9813ae7a0a 100644
--- a/src/libprs500/ebooks/metadata/meta.py
+++ b/src/libprs500/ebooks/metadata/meta.py
@@ -16,6 +16,7 @@
 from libprs500.ebooks.metadata.rtf import get_metadata as rtf_metadata
 from libprs500.ebooks.lrf.meta import get_metadata as lrf_metadata
 from libprs500.ebooks.metadata.pdf import get_metadata as pdf_metadata
+from libprs500.ebooks.metadata.lit import get_metadata as lit_metadata
 from libprs500.ebooks.metadata import MetaInformation
 
 def get_metadata(stream, stream_type='lrf'):
@@ -25,5 +26,7 @@ def get_metadata(stream, stream_type='lrf'):
         return lrf_metadata(stream)
     if stream_type == 'pdf':
         return pdf_metadata(stream)
+    if stream_type == 'lit':
+        return lit_metadata(stream)
 
     return MetaInformation(None, None)
diff --git a/src/libprs500/ebooks/metadata/opf.py b/src/libprs500/ebooks/metadata/opf.py
index c9da1e1927..b5476c3668 100644
--- a/src/libprs500/ebooks/metadata/opf.py
+++ b/src/libprs500/ebooks/metadata/opf.py
@@ -76,18 +76,20 @@ class TOC(list):
             toc = urlparse(unquote(toc))[2]
             if not os.path.isabs(toc):
                 toc = os.path.join(cwd, toc)
-            self.toc = toc
-
-            soup = BeautifulSoup(open(toc, 'rb').read(), convertEntities=BeautifulSoup.HTML_ENTITIES)
-            for a in soup.findAll('a'):
-                if not a.has_key('href'):
-                    continue
-                purl = urlparse(unquote(a['href']))
-                href, fragment = purl[2], purl[5]
-                if not os.path.isabs(href):
-                    href = os.path.join(cwd, href)
-                txt = ''.join([unicode(s).strip() for s in a.findAll(text=True)])
-                self.append((href, fragment, txt))
+            try:
+                soup = BeautifulSoup(open(toc, 'rb').read(), convertEntities=BeautifulSoup.HTML_ENTITIES)
+                for a in soup.findAll('a'):
+                    if not a.has_key('href'):
+                        continue
+                    purl = urlparse(unquote(a['href']))
+                    href, fragment = purl[2], purl[5]
+                    if not os.path.isabs(href):
+                        href = os.path.join(cwd, href)
+                    txt = ''.join([unicode(s).strip() for s in a.findAll(text=True)])
+                    self.append((href, fragment, txt))
+                self.toc = toc
+            except:
+                pass
 
 
 class OPFReader(MetaInformation):