Mirror of https://github.com/kovidgoyal/calibre.git, synced 2025-07-09 03:04:10 -04:00
Support for reading metadata from LIT files
commit 8bbdff2949
parent 8e62e77168
setup.py (+1)
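Once installed, the new reader can be driven either through the lit-meta console script registered in setup.py below or directly from Python. A minimal sketch, not part of the commit (the sample path is hypothetical):

    from libprs500.ebooks.metadata.lit import get_metadata

    stream = open('/tmp/book.lit', 'rb')    # hypothetical sample file
    mi = get_metadata(stream)               # a MetaInformation instance
    print mi

The shell equivalent would be: lit-meta book.lit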
@@ -24,6 +24,7 @@ entry_points = {
         'lrf-meta = libprs500.ebooks.lrf.meta:main',
         'rtf-meta = libprs500.ebooks.metadata.rtf:main',
         'pdf-meta = libprs500.ebooks.metadata.pdf:main',
+        'lit-meta = libprs500.ebooks.metadata.lit:main',
         'txt2lrf = libprs500.ebooks.lrf.txt.convert_from:main',
         'html2lrf = libprs500.ebooks.lrf.html.convert_from:main',
         'markdown-libprs500 = libprs500.ebooks.markdown.markdown:main',
src/libprs500/ebooks/metadata/lit.py (new file, +711)
@@ -0,0 +1,711 @@
## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''
Support for reading the metadata from a lit file.
'''

import sys, struct, cStringIO, os

from libprs500.ebooks.metadata import MetaInformation
from libprs500.ebooks.metadata.opf import OPFReader

# In the binary OPF stream, attribute and tag names are encoded as small
# integers; these two tables map those codes back to the OPF names.
OPF_ATTR_MAP = [
    None,
    "href",
    "%never-used",
    "%guid",
    "%minimum_level",
    "%attr5",
    "id",
    "href",
    "media-type",
    "fallback",
    "idref",
    "xmlns:dc",
    "xmlns:oebpackage",
    "role",
    "file-as",
    "event",
    "scheme",
    "title",
    "type",
    "unique-identifier",
    "name",
    "content",
    "xml:lang",
]

OPF_TAG_MAP = [
    None,
    "package",
    "dc:Title",
    "dc:Creator",
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    "manifest",
    "item",
    "spine",
    "itemref",
    "metadata",
    "dc-metadata",
    "dc:Subject",
    "dc:Description",
    "dc:Publisher",
    "dc:Contributor",
    "dc:Date",
    "dc:Type",
    "dc:Format",
    "dc:Identifier",
    "dc:Source",
    "dc:Language",
    "dc:Relation",
    "dc:Coverage",
    "dc:Rights",
    "x-metadata",
    "meta",
    "tours",
    "tour",
    "site",
    "guide",
    "reference",
    None,
]

class DirectoryEntry(object):
    def __init__(self, name, section, offset, size):
        self.name = name
        self.section = section
        self.offset = offset
        self.size = size

    def __str__(self):
        return '%s\n\tSection: %d\n\tOffset: %d\n\tSize: %d'%(self.name,
                self.section, self.offset, self.size)


class LitReadError(Exception):
    pass


def u32(bytes):
    # Little-endian unsigned 32-bit integer
    b = struct.unpack('BBBB', bytes[:4])
    return b[0] + (b[1] << 8) + (b[2] << 16) + (b[3] << 24)

def u16(bytes):
    # Little-endian unsigned 16-bit integer
    b = struct.unpack('BB', bytes[:2])
    return b[0] + (b[1] << 8)

def int32(bytes):
    return u32(bytes)&0x7FFFFFFF

def encint(bytes, remaining):
    # Decode a variable-length integer: 7 bits per byte, with the high bit
    # set on every byte except the last
    pos, val = 0, 0
    while remaining > 0:
        b = ord(bytes[pos])
        pos += 1
        remaining -= 1
        val <<= 7
        val |= (b & 0x7f)
        if b & 0x80 == 0: break
    return val, bytes[pos:], remaining

def read_utf8_char(bytes, pos):
    c = ord(bytes[pos])
    mask = 0x80
    if (c & mask):
        elsize = 0
        while c & mask:
            mask >>= 1
            elsize += 1
        if (mask <= 1) or (mask == 0x40):
            raise LitReadError('Invalid UTF8 character: %s'%(repr(bytes[pos])))
    else:
        elsize = 1

    if elsize > 1:
        if elsize + pos > len(bytes):
            raise LitReadError('Invalid UTF8 character: %s'%(repr(bytes[pos])))
        c &= (mask - 1)
        for i in range(1, elsize):
            b = ord(bytes[pos+i])
            if (b & 0xC0) != 0x80:
                raise LitReadError('Invalid UTF8 character: %s'%(repr(bytes[pos:pos+i])))
            c = (c << 6) | (b & 0x3F)
    return unichr(c), pos+elsize

FLAG_OPENING = 1
FLAG_CLOSING = 2
FLAG_BLOCK = 4
FLAG_HEAD = 8
FLAG_ATOM = 16

class UnBinary(object):
    pending_indent = 0
    lingering_space = 0
    was_in_text = 0
    buf = cStringIO.StringIO()

    def __init__(self, bin, manifest, attr_map=OPF_ATTR_MAP, tag_map=OPF_TAG_MAP,
                 tag_to_attr_map=[[] for i in range(43)]):
        self.manifest = manifest
        self.attr_map = attr_map
        self.tag_map = tag_map
        self.tag_to_attr_map = tag_to_attr_map
        self.opf = self.attr_map is OPF_ATTR_MAP
        self.bin = bin
        self.binary_to_text()

    def write_spaces(self, depth):
        self.buf.write(u' '.join(u'' for i in range(depth)))

    def item_path(self, internal_id):
        for i in self.manifest:
            if i == internal_id:
                return i.path
        raise LitReadError('Could not find item %s'%(internal_id,))

    def binary_to_text(self, base=0, depth=0):
        space_enabled, saved_space_enabled = 1, 0
        was_indented, is_goingdown = 0, 0
        tag_name = current_map = None
        dynamic_tag = errors = in_censorship = 0

        state = 'text'
        index = base
        flags = 0

        while index < len(self.bin):
            c, index = read_utf8_char(self.bin, index)
            if state == 'text':
                if ord(c) == 0:
                    state = 'get flags'
                    continue
                if (not self.was_in_text) or space_enabled:
                    space_enabled = 0
                    if c in (' ', '\t', '\n', '\r'):
                        space_enabled += 1
                    else:
                        self.was_in_text = 1
                if c == '\v':
                    c = '\n'
                pending_indent = 0
                self.buf.write(c)
            elif state == 'get flags':
                if ord(c) == 0:
                    state = 'text'
                    continue
                flags = ord(c)
                state = 'get tag'
            elif state == 'get tag':
                state = 'text' if ord(c) == 0 else 'get attr'
                if flags & FLAG_OPENING:
                    if space_enabled and ((not self.was_in_text) or (flags & (FLAG_BLOCK|FLAG_HEAD))):
                        self.pending_indent += 1
                    if self.pending_indent or self.opf:
                        was_indented += 1
                        self.buf.write(u'\n')
                        self.write_spaces(depth)
                        pending_indent = 0
                    if (flags & FLAG_HEAD) or (flags & FLAG_BLOCK) or \
                            self.opf or depth == 0:
                        pending_indent = 1
                    tag = ord(c)
                    self.buf.write('<')
                    if not (flags & FLAG_CLOSING):
                        is_goingdown = 1
                    if tag == 0x8000:
                        state = 'get custom length'
                        continue
                    if flags & FLAG_ATOM:
                        raise LitReadError('TODO: Atoms not yet implemented')
                    elif tag < len(self.tag_map):
                        tag_name = self.tag_map[tag]
                        current_map = self.tag_to_attr_map[tag]
                    else:
                        dynamic_tag += 1
                        errors += 1
                        tag_name = '?'+unichr(tag)+'?'
                        current_map = self.tag_to_attr_map[tag]
                        print 'WARNING: tag %s unknown'%(unichr(tag),)

                    self.buf.write(unicode(tag_name))
                elif flags & FLAG_CLOSING:
                    if depth == 0:
                        raise LitReadError('Extra closing tag')
                    self.lingering_space = space_enabled
                    return index
            elif state == 'get attr':
                in_censorship = 0
                if ord(c) == 0:
                    if not is_goingdown:
                        tag_name = None
                        dynamic_tag = 0
                        self.buf.write(u' />')
                    else:
                        self.buf.write(u'>')
                        if not self.opf and (flags & (FLAG_BLOCK|FLAG_HEAD)):
                            pending_indent += 1
                        index = self.binary_to_text(base=index, depth=depth+1)
                        is_goingdown = 0
                        if not tag_name:
                            raise LitReadError('Tag ends before it begins.')
                        saved_space_enabled = space_enabled
                        space_enabled = self.lingering_space
                        if space_enabled and was_indented and not self.was_in_text:
                            self.buf.write(u'\n')
                            self.write_spaces(depth)
                        self.buf.write(u'</'+tag_name+u'>')
                        if (space_enabled and self.opf) or (flags & (FLAG_BLOCK|FLAG_HEAD)):
                            self.pending_indent += 1
                        dynamic_tag = 0
                        tag_name = None
                        space_enabled = saved_space_enabled

                    self.was_in_text = 0
                    state = 'text'
                else:
                    if ord(c) == 0x8000:
                        state = 'get attr length'
                        continue
                    attr = None
                    if ord(c) < len(current_map) and current_map[ord(c)]:
                        attr = current_map[ord(c)]
                    elif ord(c) < len(self.attr_map):
                        attr = self.attr_map[ord(c)]

                    if not attr or not isinstance(attr, basestring):
                        raise LitReadError('Unknown attribute %d in tag %s'%(ord(c), tag_name))

                    if attr.startswith('%'):
                        in_censorship = 1
                        state = 'get value length'
                        continue

                    self.buf.write(u' ' + unicode(attr) + u'=')
                    if attr in ['href', 'src']:
                        state = 'get href'
                    else:
                        state = 'get value length'
            elif state == 'get value length':
                if not in_censorship:
                    self.buf.write(u'"')
                char_count = ord(c) - 1
                if not char_count:
                    if not in_censorship:
                        self.buf.write(u'"')
                    in_censorship = 0
                    state = 'get attr'
                state = 'get value'
                if ord(c) == 0xffff:
                    continue
                if char_count < 0 or char_count > len(self.bin)-index:
                    raise LitReadError('Invalid character count %d'%(char_count,))
            elif state == 'get value':
                if char_count == 0xfffe:
                    if not in_censorship:
                        self.buf.write(unicode(ord(c)-1))
                    in_censorship = 0
                    state = 'get attr'
                elif char_count:
                    if not in_censorship:
                        self.buf.write(c)
                    char_count -= 1
                    if not char_count:
                        if not in_censorship:
                            self.buf.write('"')
                        in_censorship = 0
                        state = 'get attr'
            elif state == 'get custom length':
                char_count = ord(c) - 1
                if char_count <= 0 or char_count > len(self.bin)-index:
                    raise LitReadError('Invalid character count %d'%(char_count,))
                dynamic_tag += 1
                state = 'get custom'
                tag_name = ''
            elif state == 'get custom':
                # Accumulate the custom tag name one character at a time
                tag_name += c
                char_count -= 1
                if not char_count:
                    self.buf.write(tag_name)
                    state = 'get attr'
            elif state == 'get attr length':
                char_count = ord(c) - 1
                if char_count <= 0 or char_count > len(self.bin)-index:
                    raise LitReadError('Invalid character count %d'%(char_count,))
                self.buf.write(u' ')
                state = 'get custom attr'
            elif state == 'get custom attr':
                self.buf.write(c)
                char_count -= 1
                if not char_count:
                    self.buf.write(u'=')
                    state = 'get value length'
            elif state == 'get href':
                char_count = ord(c) - 1
                if char_count <= 0:
                    raise LitReadError('Invalid character count %d'%(char_count,))
                href = self.bin[index+1:index+char_count].decode('ascii')
                index += char_count
                doc, m, frag = href.partition('#')
                path = self.item_path(doc)
                if m and frag:
                    path += m+frag
                self.buf.write(u'"%s"'%(path,))
                state = 'get attr'

        self.lingering_space = space_enabled
        return index

class ManifestItem(object):

    def __init__(self, original, internal, mime_type, offset, root, state):
        self.original = original
        self.internal = internal
        self.mime_type = mime_type
        self.offset = offset
        self.root = root
        self.state = state
        self.prefix = 'images' if state == 'images' else 'css' if state == 'css' else ''
        self.prefix = self.prefix + os.sep if self.prefix else ''
        self.path = self.prefix + self.original

    def __eq__(self, other):
        if hasattr(other, 'internal'):
            return self.internal == other.internal
        return self.internal == other

    def __repr__(self):
        return self.internal + u'->' + self.path


class LitFile(object):

    PIECE_SIZE = 16

    @apply
    def magic():
        # @apply calls each of these functions immediately, turning the
        # returned property into a read-only attribute on LitFile
        def fget(self):
            opos = self._stream.tell()
            self._stream.seek(0)
            val = self._stream.read(8)
            self._stream.seek(opos)
            return val
        return property(fget=fget)

    @apply
    def version():
        def fget(self):
            opos = self._stream.tell()
            self._stream.seek(8)
            val = u32(self._stream.read(4))
            self._stream.seek(opos)
            return val
        return property(fget=fget)

    @apply
    def hdr_len():
        def fget(self):
            opos = self._stream.tell()
            self._stream.seek(12)
            val = int32(self._stream.read(4))
            self._stream.seek(opos)
            return val
        return property(fget=fget)

    @apply
    def num_pieces():
        def fget(self):
            opos = self._stream.tell()
            self._stream.seek(16)
            val = int32(self._stream.read(4))
            self._stream.seek(opos)
            return val
        return property(fget=fget)

    @apply
    def sec_hdr_len():
        def fget(self):
            opos = self._stream.tell()
            self._stream.seek(20)
            val = int32(self._stream.read(4))
            self._stream.seek(opos)
            return val
        return property(fget=fget)

    @apply
    def guid():
        def fget(self):
            opos = self._stream.tell()
            self._stream.seek(24)
            val = self._stream.read(16)
            self._stream.seek(opos)
            return val
        return property(fget=fget)

    @apply
    def header():
        def fget(self):
            opos = self._stream.tell()
            size = self.hdr_len + self.num_pieces*self.PIECE_SIZE + self.sec_hdr_len
            self._stream.seek(0)
            val = self._stream.read(size)
            self._stream.seek(opos)
            return val
        return property(fget=fget)

    def __init__(self, stream):
        self._stream = stream
        if self.magic != 'ITOLITLS':
            raise LitReadError('Not a valid LIT file')
        if self.version != 1:
            raise LitReadError('Unknown LIT version %d'%(self.version,))
        self.read_secondary_header()
        self.read_header_pieces()

    def read_secondary_header(self):
        opos = self._stream.tell()
        try:
            self._stream.seek(self.hdr_len + self.num_pieces*self.PIECE_SIZE)
            bytes = self._stream.read(self.sec_hdr_len)
            offset = int32(bytes[4:])

            while offset < len(bytes):
                blocktype = bytes[offset:offset+4]
                blockver = u32(bytes[offset+4:])

                if blocktype == 'CAOL':
                    if blockver != 2:
                        raise LitReadError('Unknown CAOL block format %d'%(blockver,))
                    self.creator_id = u32(bytes[offset+12:])
                    self.entry_chunklen = u32(bytes[offset+20:])
                    self.count_chunklen = u32(bytes[offset+24:])
                    self.entry_unknown = u32(bytes[offset+28:])
                    self.count_unknown = u32(bytes[offset+32:])
                    offset += 48
                elif blocktype == 'ITSF':
                    if blockver != 4:
                        raise LitReadError('Unknown ITSF block format %d'%(blockver,))
                    if u32(bytes[offset+4+16:]):
                        raise LitReadError('This file has a 64bit content offset')
                    self.content_offset = u32(bytes[offset+16:])
                    self.timestamp = u32(bytes[offset+24:])
                    self.language_id = u32(bytes[offset+28:])
                    offset += 48

            if not hasattr(self, 'content_offset'):
                raise LitReadError('Could not figure out the content offset')
        finally:
            self._stream.seek(opos)

    def read_header_pieces(self):
        opos = self._stream.tell()
        try:
            src = self.header[self.hdr_len:]
            for i in range(self.num_pieces):
                piece = src[i*self.PIECE_SIZE:(i+1)*self.PIECE_SIZE]
                if u32(piece[4:]) != 0 or u32(piece[12:]) != 0:
                    raise LitReadError('Piece %s has 64bit value'%(repr(piece),))
                offset, size = u32(piece), int32(piece[8:])
                self._stream.seek(offset)
                piece = self._stream.read(size)
                if i == 0:
                    continue # Dont need this piece
                elif i == 1:
                    if u32(piece[8:]) != self.entry_chunklen or \
                       u32(piece[12:]) != self.entry_unknown:
                        raise LitReadError('Secondary header does not match piece')
                    self.read_directory(piece)
                elif i == 2:
                    if u32(piece[8:]) != self.count_chunklen or \
                       u32(piece[12:]) != self.count_unknown:
                        raise LitReadError('Secondary header does not match piece')
                    continue # No data needed from this piece
                elif i == 3:
                    self.piece3_guid = piece
                elif i == 4:
                    self.piece4_guid = piece
        finally:
            self._stream.seek(opos)

    def read_directory(self, piece):
        self.entries = []
        if not piece.startswith('IFCM'):
            raise LitReadError('Header piece #1 is not main directory.')
        chunk_size, num_chunks = int32(piece[8:12]), int32(piece[24:28])

        if 32 + chunk_size*num_chunks != len(piece):
            raise LitReadError('IFCM HEADER has incorrect length')

        for chunk in range(num_chunks):
            p = 32 + chunk*chunk_size
            if piece[p:p+4] != 'AOLL':
                continue
            remaining = chunk_size - int32(piece[p+4:p+8]) - 48
            if remaining < 0:
                raise LitReadError('AOLL remaining count is negative')

            entries = u16(piece[p+chunk_size-2:])

            if entries <= 0: # Hopefully everything will work even without a correct entries count
                entries = (2**16)-1

            piece = piece[p+48:]
            i = 0
            while i < entries:
                if remaining <= 0: break
                namelen, piece, remaining = encint(piece, remaining)
                if namelen != (namelen & 0x7fffffff):
                    raise LitReadError('Directory entry had 64bit name length.')
                if namelen > remaining - 3:
                    raise LitReadError('Read past end of directory chunk')
                name = piece[:namelen]
                piece = piece[namelen:]
                section, piece, remaining = encint(piece, remaining)
                offset, piece, remaining = encint(piece, remaining)
                size, piece, remaining = encint(piece, remaining)

                entry = DirectoryEntry(name, section, offset, size)
                if name == '::DataSpace/NameList':
                    self.read_section_names(entry)
                elif name == '/manifest':
                    self.read_manifest(entry)
                elif name == '/meta':
                    self.read_meta(entry)
                self.entries.append(entry)
                i += 1

        if not hasattr(self, 'sections'):
            raise LitReadError('Lit file does not have a valid NameList')

        if not hasattr(self, 'manifest'):
            raise LitReadError('Lit file does not have a valid manifest')

    def read_section_names(self, entry):
        opos = self._stream.tell()
        try:
            self._stream.seek(self.content_offset + entry.offset)
            raw = self._stream.read(entry.size)
            if len(raw) < 4:
                raise LitReadError('Invalid Namelist section')
            pos = 4
            self.num_sections = u16(raw[2:pos])

            self.sections = {}
            for section in range(self.num_sections):
                size = u16(raw[pos:pos+2])
                pos += 2
                size = size*2 + 2
                if pos + size > len(raw):
                    raise LitReadError('Invalid Namelist section')
                self.sections[section] = raw[pos:pos+size].decode('utf-16-le')
                pos += size
        finally:
            self._stream.seek(opos)

    def read_manifest(self, entry):
        opos = self._stream.tell()
        try:
            self.manifest = []
            self._stream.seek(self.content_offset + entry.offset)
            raw = self._stream.read(entry.size)
            pos = 0
            while pos < len(raw):
                size = ord(raw[pos])
                if size == 0: break
                pos += 1
                root = raw[pos:pos+size].decode('utf8')
                pos += size
                if pos >= len(raw):
                    raise LitReadError('Truncated manifest.')
                for state in ['spine', 'not spine', 'css', 'images']:
                    num_files = int32(raw[pos:pos+4])
                    pos += 4
                    if num_files == 0: continue

                    i = 0
                    while i < num_files:
                        if pos+5 >= len(raw):
                            raise LitReadError('Truncated manifest.')
                        offset = u32(raw[pos:pos+4])
                        pos += 4

                        slen = ord(raw[pos])
                        pos += 1
                        internal = raw[pos:pos+slen].decode('utf8')
                        pos += slen

                        slen = ord(raw[pos])
                        pos += 1
                        original = raw[pos:pos+slen].decode('utf8')
                        pos += slen

                        slen = ord(raw[pos])
                        pos += 1
                        mime_type = raw[pos:pos+slen].decode('utf8')
                        pos += slen +1

                        self.manifest.append(ManifestItem(original, internal, mime_type, offset, root, state))
                        i += 1
        finally:
            self._stream.seek(opos)

    def read_meta(self, entry):
        opos = self._stream.tell()
        try:
            self._stream.seek(self.content_offset + entry.offset)
            raw = self._stream.read(entry.size)
            xml = \
'''
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE package
    PUBLIC "+//ISBN 0-9673008-1-9//DTD OEB 1.0.1 Package//EN"
    "http://openebook.org/dtds/oeb-1.0.1/oebpkg101.dtd">
'''+\
            UnBinary(raw, self.manifest).buf.getvalue().lstrip()
            self.meta = xml
        finally:
            self._stream.seek(opos)

def get_metadata(stream):
    try:
        litfile = LitFile(stream)
        mi = OPFReader(cStringIO.StringIO(litfile.meta))
    except:
        title = stream.name if hasattr(stream, 'name') and stream.name else 'Unknown'
        mi = MetaInformation(title, ['Unknown'])
    return mi


def main(args=sys.argv):
    if len(args) != 2:
        print >>sys.stderr, 'Usage: %s file.lit'%(args[0],)
        return 1
    print get_metadata(open(args[1], 'rb'))
    return 0

if __name__ == '__main__':
    sys.exit(main())
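For reference, the fixed-size primary header that the LitFile properties above read piecemeal could also be unpacked in one pass. This is only a sketch, not part of the commit, and it assumes the same little-endian layout the u32/int32 helpers imply (magic at offset 0, version at 8, header length at 12, piece count at 16, secondary header length at 20, a 16-byte GUID at 24):

    import struct

    def read_primary_header(stream):
        # Layout mirrored from LitFile.magic/version/hdr_len/num_pieces/sec_hdr_len/guid
        stream.seek(0)
        raw = stream.read(40)
        magic = raw[0:8]          # 'ITOLITLS' for a valid LIT file
        version, hdr_len, num_pieces, sec_hdr_len = struct.unpack('<4I', raw[8:24])
        guid = raw[24:40]
        return magic, version, hdr_len, num_pieces, sec_hdr_len, guid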
@@ -16,6 +16,7 @@
 from libprs500.ebooks.metadata.rtf import get_metadata as rtf_metadata
 from libprs500.ebooks.lrf.meta import get_metadata as lrf_metadata
 from libprs500.ebooks.metadata.pdf import get_metadata as pdf_metadata
+from libprs500.ebooks.metadata.lit import get_metadata as lit_metadata
 from libprs500.ebooks.metadata import MetaInformation

 def get_metadata(stream, stream_type='lrf'):
@@ -25,5 +26,7 @@ def get_metadata(stream, stream_type='lrf'):
         return lrf_metadata(stream)
     if stream_type == 'pdf':
         return pdf_metadata(stream)
+    if stream_type == 'lit':
+        return lit_metadata(stream)
     return MetaInformation(None, None)

@@ -76,18 +76,20 @@ class TOC(list):
         toc = urlparse(unquote(toc))[2]
         if not os.path.isabs(toc):
             toc = os.path.join(cwd, toc)
-        self.toc = toc
-        soup = BeautifulSoup(open(toc, 'rb').read(), convertEntities=BeautifulSoup.HTML_ENTITIES)
-        for a in soup.findAll('a'):
-            if not a.has_key('href'):
-                continue
-            purl = urlparse(unquote(a['href']))
-            href, fragment = purl[2], purl[5]
-            if not os.path.isabs(href):
-                href = os.path.join(cwd, href)
-            txt = ''.join([unicode(s).strip() for s in a.findAll(text=True)])
-            self.append((href, fragment, txt))
+        try:
+            soup = BeautifulSoup(open(toc, 'rb').read(), convertEntities=BeautifulSoup.HTML_ENTITIES)
+            for a in soup.findAll('a'):
+                if not a.has_key('href'):
+                    continue
+                purl = urlparse(unquote(a['href']))
+                href, fragment = purl[2], purl[5]
+                if not os.path.isabs(href):
+                    href = os.path.join(cwd, href)
+                txt = ''.join([unicode(s).strip() for s in a.findAll(text=True)])
+                self.append((href, fragment, txt))
+            self.toc = toc
+        except:
+            pass


 class OPFReader(MetaInformation):