Refactored OPF creation code. Implemented Table of Contents support in feeds2disk.
This commit is contained in:
parent 748c184ccb
commit 6982652f92
Makefile (6 lines changed)

@@ -1,6 +1,6 @@
 PYTHON = python

-all : gui2 translations
+all : gui2 translations resources

 clean :
 	cd src/libprs500/gui2 && ${PYTHON} make.py clean
@@ -13,4 +13,8 @@ test : gui2

 translations :
 	cd src/libprs500 && ${PYTHON} translations/__init__.py
+
+resources:
+	${PYTHON} resources.py

resources.py (new file, 39 lines)

@@ -0,0 +1,39 @@
+#!/usr/bin/env python
+## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2 of the License, or
+## (at your option) any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License along
+## with this program; if not, write to the Free Software Foundation, Inc.,
+## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+'''
+Compile resource files.
+'''
+import os, sys
+sys.path.insert(1, os.path.join(os.getcwd(), 'src'))
+from libprs500 import __appname__
+
+RESOURCES = dict(
+    opf_template = '%p/ebooks/metadata/opf.xml',
+    ncx_template = '%p/ebooks/metadata/ncx.xml',
+)
+
+def main(args=sys.argv):
+    data = ''
+    for key, value in RESOURCES.items():
+        path = value.replace('%p', 'src'+os.sep+__appname__)
+        bytes = repr(open(path, 'rb').read())
+        data += key + ' = ' + bytes + '\n\n'
+    open('src'+os.sep+__appname__+os.sep+'/resources.py', 'wb').write(data)
+    return 0
+
+if __name__ == '__main__':
+    sys.exit(main())
@@ -60,6 +60,8 @@ def update_css(ncss, ocss):

 def munge_paths(basepath, url):
     purl = urlparse(unquote(url),)
     path, fragment = purl[2], purl[5]
+    if path:
+        path = path.replace('/', os.sep)
     if not path:
         path = basepath
     elif not os.path.isabs(path):
@@ -223,7 +225,6 @@ class HTMLConverter(object):
         self.extra_toc_entries = [] #: TOC entries gleaned from semantic information
         self.image_memory = []
         self.id_counter = 0
-        self.toc_from_metadata = False #: If True means that the toc has been populated from metadata
         self.unused_target_blocks = [] #: Used to remove extra TextBlocks
         self.link_level = 0 #: Current link level
         self.memory = [] #: Used to ensure that duplicate CSS unhandled erros are not reported
@@ -543,7 +544,7 @@ class HTMLConverter(object):

         path, fragment = munge_paths(self.target_prefix, tag['href'])
         return {'para':para, 'text':text, 'path':os.path.abspath(path),
-                'fragment':fragment, 'in toc': (self.link_level == 0 and not self.toc_from_metadata)}
+                'fragment':fragment, 'in toc': (self.link_level == 0 and not self.use_spine)}

     def get_text(self, tag, limit=None):
@@ -637,13 +638,12 @@ class HTMLConverter(object):
         return outside_links

     def create_toc(self, toc):
-        for (path, fragment, txt) in toc:
-            ascii_text = txt.encode('ascii', 'ignore') # Bug in SONY LRF renderer
-            self.toc_from_metadata = True
-            if not fragment and path in self.tops:
-                self.book.addTocEntry(ascii_text, self.tops[path])
+        for item in toc.top_level_items():
+            ascii_text = item.text.encode('ascii', 'ignore') # Bug in SONY LRF renderer
+            if not item.fragment and item.abspath in self.tops:
+                self.book.addTocEntry(ascii_text, self.tops[item.abspath])
             else:
-                url = path+fragment
+                url = item.abspath+item.fragment
                 if url in self.targets:
                     self.book.addTocEntry(ascii_text, self.targets[url])
@@ -1846,6 +1846,7 @@ def try_opf(path, options, logger):
     options.cover = None
     cover = opf.cover
     if cover:
+        cover = cover.replace('/', os.sep)
         if not os.path.isabs(cover):
             cover = os.path.join(dirpath, cover)
         if os.access(cover, os.R_OK):

@@ -65,7 +65,7 @@ class LRFConverter(object):
     def create_metadata(self):
         self.logger.info('Reading metadata...')
         mi = get_metadata(self.lrf)
-        self.opf = OPFCreator(mi)
+        self.opf = OPFCreator(self.output_dir, mi)

     def create_page_styles(self):
         self.page_css = ''
@@ -126,4 +126,4 @@ def main(args=sys.argv):

 if __name__ == '__main__':
-    sys.exit(main())
+    sys.exit(main())

@@ -45,12 +45,13 @@ class MetaInformation(object):
         ans = MetaInformation(mi.title, mi.authors)
         for attr in ('author_sort', 'title_sort', 'comments', 'category',
                      'publisher', 'series', 'series_index', 'rating',
-                     'isbn', 'tags', 'cover_data', 'libprs_id'):
+                     'isbn', 'tags', 'cover_data', 'application_id',
+                     'manifest', 'spine', 'toc', 'cover'):
             if hasattr(mi, attr):
                 setattr(ans, attr, getattr(mi, attr))

-    def __init__(self, title, authors):
+    def __init__(self, title, authors=['Unknown']):
         '''
         @param title: title or "Unknown" or a MetaInformation object
         @param authors: List of strings or []
@@ -76,8 +77,11 @@ class MetaInformation(object):
         self.isbn = None if not mi else mi.isbn
         self.tags = [] if not mi else mi.tags
         self.cover_data = mi.cover_data if (mi and hasattr(mi, 'cover_data')) else (None, None)
-        self.libprs_id = mi.libprs_id if (mi and hasattr(mi, 'libprs_id')) else None
+        self.application_id = mi.application_id if (mi and hasattr(mi, 'application_id')) else None
+        self.manifest = getattr(mi, 'manifest', None)
+        self.toc = getattr(mi, 'toc', None)
+        self.spine = getattr(mi, 'spine', None)
+        self.cover = getattr(mi, 'cover', None)

     def smart_update(self, mi):
         '''
@@ -92,7 +96,7 @@ class MetaInformation(object):

         for attr in ('author_sort', 'title_sort', 'comments', 'category',
                      'publisher', 'series', 'series_index', 'rating',
-                     'isbn', 'libprs_id'):
+                     'isbn', 'application_id', 'manifest', 'spine', 'toc', 'cover'):
             if hasattr(mi, attr):
                 val = getattr(mi, attr)
                 if val is not None:
@@ -117,4 +121,4 @@ class MetaInformation(object):
         return ans.strip()

     def __nonzero__(self):
-        return bool(self.title or self.author or self.comments or self.category)
+        return bool(self.title or self.author or self.comments or self.category)

@@ -51,7 +51,7 @@ def metadata_from_formats(formats):
         ext = path_to_ext(path)
         stream = open(path, 'rb')
         mi.smart_update(get_metadata(stream, stream_type=ext, use_libprs_metadata=True))
-        if getattr(mi, 'libprs_id', None) is not None:
+        if getattr(mi, 'application_id', None) is not None:
             return mi

     return mi
@@ -69,7 +69,7 @@ def get_metadata(stream, stream_type='lrf', use_libprs_metadata=False):
         if os.access(c, os.R_OK):
             opf = opf_metadata(os.path.abspath(c))

-    if use_libprs_metadata and getattr(opf, 'libprs_id', None) is not None:
+    if use_libprs_metadata and getattr(opf, 'application_id', None) is not None:
         return opf

     try:
@@ -147,7 +147,7 @@ def opf_metadata(opfpath):
     f = open(opfpath, 'rb')
     opf = OPFReader(f, os.path.dirname(opfpath))
     try:
-        if opf.libprs_id is not None:
+        if opf.application_id is not None:
             mi = MetaInformation(opf, None)
             if hasattr(opf, 'cover') and opf.cover:
                 cpath = os.path.join(os.path.dirname(opfpath), opf.cover)

src/libprs500/ebooks/metadata/ncx.xml (new file, 27 lines)

@@ -0,0 +1,27 @@
+<ncx version="2005-1"
+    xml:lang="en"
+    xmlns="http://www.daisy.org/z3986/2005/ncx/"
+    xmlns:py="http://genshi.edgewall.org/"
+>
+<head>
+    <meta name="dtb:uid" content="${uid}"/>
+    <meta name="dtb:depth" content="${toc.depth()}"/>
+    <meta name="dtb:generator" content="${__appname__}"/>
+    <meta name="dtb:totalPageCount" content="0"/>
+    <meta name="dtb:maxPageNumber" content="0"/>
+</head>
+<docTitle><text>Table of Contents</text></docTitle>
+
+<py:def function="navpoint(np, level)">
+${'%*s'%(4*level,'')}<navPoint playOrder="${str(np.play_order)}">
+${'%*s'%(4*level,'')}<navLabel>
+${'%*s'%(4*level,'')}    <text>${np.text}</text>
+${'%*s'%(4*level,'')}</navLabel>
+${'%*s'%(4*level,'')}<content src="${str(np.href)+(('#' + str(np.fragment)) if np.fragment else '')}" />
+<py:for each="np2 in np">${navpoint(np2, level+1)}</py:for>
+${'%*s'%(4*level,'')}</navPoint>
+</py:def>
+<navMap>
+<py:for each="np in toc">${navpoint(np, 0)}</py:for>
+</navMap>
+</ncx>
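
Note: a sketch of how this Genshi template gets filled in, mirroring the TOC.render method added later in this commit; `toc` is assumed to be a populated TOC object and the uid value is illustrative:

    from genshi.template import MarkupTemplate
    from libprs500 import __appname__
    from libprs500.resources import ncx_template  # generated by resources.py

    doctype = ('ncx', "-//NISO//DTD ncx 2005-1//EN",
               "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd")
    template = MarkupTemplate(ncx_template)
    raw = template.generate(uid='some-uuid', toc=toc, __appname__=__appname__)
    open('index.ncx', 'wb').write(raw.render(doctype=doctype))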
@@ -12,18 +12,21 @@
 ## You should have received a copy of the GNU General Public License along
 ## with this program; if not, write to the Free Software Foundation, Inc.,
 ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+import uuid
 '''Read/Write metadata from Open Packaging Format (.opf) files.'''

-import sys, re, os, glob
+import sys, re, os, mimetypes
 from urllib import unquote
 from urlparse import urlparse
 import xml.dom.minidom as dom
 from itertools import repeat

 from libprs500 import __appname__
 from libprs500.ebooks.metadata import MetaInformation
-from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup, BeautifulSoup
+from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup
 from libprs500.ebooks.lrf import entity_to_unicode
 from libprs500.ebooks.metadata import get_parser
+from libprs500.ebooks.metadata.toc import TOC

 class ManifestItem(object):

     def __init__(self, item, cwd):
@@ -40,6 +43,14 @@ class ManifestItem(object):

     def __unicode__(self):
         return u'<item id="%s" href="%s" media-type="%s" />'%(self.id, self.href, self.media_type)

+    def __getitem__(self, index):
+        if index == 0:
+            return self.href
+        if index == 1:
+            return self.media_type
+        raise IndexError('%d out of bounds.'%index)
+

 class Manifest(list):

@@ -81,85 +92,11 @@ class Spine(object):

     def items(self):
         for i in self.linear_ids + self.nonlinear_ids:
             yield self.manifest.item(i)

+    def __iter__(self):
+        for i in self.linear_ids + self.nonlinear_ids:
+            yield i
+
-class TOC(list):
-
-    def __init__(self, opfreader, cwd):
-        self.toc = None
-        toc = opfreader.soup.find('spine', toc=True)
-        if toc is not None:
-            toc = toc['toc']
-        if toc is None:
-            try:
-                toc = opfreader.soup.find('guide').find('reference', attrs={'type':'toc'})['href']
-            except:
-                for item in opfreader.manifest:
-                    if 'toc' in item.href.lower():
-                        toc = item.href
-                        break
-
-        if toc is not None:
-            if toc.lower() != 'ncx':
-                toc = urlparse(unquote(toc))[2]
-                if not os.path.isabs(toc):
-                    toc = os.path.join(cwd, toc)
-                try:
-                    if not os.path.exists(toc):
-                        bn = os.path.basename(toc)
-                        bn = bn.replace('_top.htm', '_toc.htm') # Bug in BAEN OPF files
-                        toc = os.path.join(os.path.dirname(toc), bn)
-
-                    self.read_html_toc(toc, cwd)
-                    self.toc = toc
-                except:
-                    pass
-            else:
-                cwd = os.path.abspath(cwd)
-                m = glob.glob(os.path.join(cwd, '*.ncx'))
-                if m:
-                    toc = m[0]
-                    try:
-                        self.read_ncx_toc(toc)
-                        self.toc = toc
-                    except:
-                        raise
-                        pass
-
-    def read_ncx_toc(self, toc):
-        bdir = os.path.dirname(toc)
-        soup = BeautifulStoneSoup(open(toc, 'rb').read(),
-                                  convertEntities=BeautifulSoup.HTML_ENTITIES)
-        elems = soup.findAll('navpoint')
-        elems.sort(cmp=lambda x, y: cmp(int(x['playorder']), int(y['playorder'])))
-
-        for elem in elems:
-            txt = u''
-            for nl in elem.findAll('navlabel'):
-                for text in nl.findAll('text'):
-                    txt += ''.join([unicode(s) for s in text.findAll(text=True)])
-
-            content = elem.find('content')
-            if content is None or not content.has_key('src') or not txt:
-                continue
-
-            purl = urlparse(unquote(content['src']))
-            href, fragment = purl[2], purl[5]
-            if not os.path.isabs(href):
-                href = os.path.join(bdir, href)
-            self.append((href, fragment, txt))
-
-    def read_html_toc(self, toc, cwd):
-        soup = BeautifulSoup(open(toc, 'rb').read(), convertEntities=BeautifulSoup.HTML_ENTITIES)
-        for a in soup.findAll('a'):
-            if not a.has_key('href'):
-                continue
-            purl = urlparse(unquote(a['href']))
-            href, fragment = purl[2], purl[5]
-            if not os.path.isabs(href):
-                href = os.path.join(cwd, href)
-            txt = ''.join([unicode(s).strip() for s in a.findAll(text=True)])
-            self.append((href, fragment, txt))

 class standard_field(object):
@@ -178,21 +115,21 @@ class OPF(MetaInformation):

     MIMETYPE = 'application/oebps-package+xml'
     ENTITY_PATTERN = re.compile(r'&(\S+?);')

     uid = standard_field('uid')
-    libprs_id = standard_field('libprs_id')
+    application_id = standard_field('application_id')
     title = standard_field('title')
     authors = standard_field('authors')
     title_sort = standard_field('title_sort')
     author_sort = standard_field('author_sort')
     comments = standard_field('comments')
     category = standard_field('category')
     publisher = standard_field('publisher')
     isbn = standard_field('isbn')
     cover = standard_field('cover')
     series = standard_field('series')
     series_index = standard_field('series_index')
     rating = standard_field('rating')
     tags = standard_field('tags')

     HEADER = '''\
 <?xml version="1.0" encoding="UTF-8"?>
@@ -207,14 +144,14 @@ class OPF(MetaInformation):
         if not hasattr(self, 'soup'):
             self.soup = BeautifulStoneSoup(u'''\
 %s
-<package unique-identifier="libprs_id">
+<package unique-identifier="%s_id">
     <metadata>
         <dc-metadata
             xmlns:dc="http://purl.org/dc/elements/1.1/"
             xmlns:oebpackage="http://openebook.org/namespaces/oeb-package/1.0/" />
     </metadata>
 </package>
-'''%self.HEADER)
+'''%(__appname__, self.HEADER))

     def _commit(self, doc):
         self.soup = BeautifulStoneSoup(doc.toxml('utf-8'), fromEncoding='utf-8')
@@ -403,15 +340,15 @@ class OPF(MetaInformation):
         self._set_metadata_element('dc:identifier', isbn, [('scheme', 'ISBN')],
                                    replace=True)

-    def get_libprs_id(self):
+    def get_application_id(self):
         for item in self.soup.package.metadata.findAll('dc:identifier'):
-            if item.has_key('scheme') and item['scheme'] == 'libprs':
+            if item.has_key('scheme') and item['scheme'] == __appname__:
                 return str(item.string).strip()
         return None

-    def set_libprs_id(self, val):
+    def set_application_id(self, val):
         if val:
-            self._set_metadata_element('dc:identifier', str(val), [('scheme', 'libprs'), ('id', 'libprs_id')],
+            self._set_metadata_element('dc:identifier', str(val), [('scheme', __appname__), ('id', __appname__+'_id')],
                                        replace=True)

     def get_cover(self):
@@ -564,61 +501,72 @@ class OPFReader(OPF):
         stream.close()
         self.manifest = Manifest(self.soup, dir)
         self.spine = Spine(self.soup, self.manifest)
-        self.toc = TOC(self, dir)
+        self.toc = TOC()
+        self.toc.read_from_opf(self)
         self.cover_data = (None, None)

-class OPFCreator(OPF):
+class OPFCreator(MetaInformation):

-    def __init__(self, mi):
-        self.title = mi.title
-        self.authors = mi.authors
-        if mi.category:
-            self.category = mi.category
-        if mi.comments:
-            self.comments = mi.comments
-        if mi.publisher:
-            self.publisher = mi.publisher
-        if mi.rating:
-            self.rating = mi.rating
-        if mi.series:
-            self.series = mi.series
-        if mi.series_index:
-            self.series_index = mi.series_index
-        if mi.tags:
-            self.tags = mi.tags
-        if mi.isbn:
-            self.isbn = mi.isbn
-        self.cover_data = mi.cover_data
-        if hasattr(mi, 'libprs_id'):
-            self.libprs_id = mi.libprs_id
-        if hasattr(mi, 'uid'):
-            self.uid = mi.uid
+    def __init__(self, base_path, *args, **kwargs):
+        '''
+        Initialize.
+        @param base_path: An absolute path to the directory in which this OPF file
+        will eventually be. This is used by the L{create_manifest} method
+        to convert paths to files into relative paths.
+        '''
+        MetaInformation.__init__(self, *args, **kwargs)
+        self.base_path = os.path.abspath(base_path)
+        if self.application_id is None:
+            self.application_id = str(uuid.uuid4())
+        self.toc = None
+        if isinstance(self.manifest, Manifest):
+            manifest = []
+            for path, mt in self.manifest:
+                if not path.startswith(self.base_path):
+                    raise ValueError('Inavlid manifest item %s for base path %s'%(path, self.base_path))
+                path = path[len(self.base_path)+1:]
+                manifest.append((path, mt))
+            self.manifest = manifest

     def create_manifest(self, entries):
         '''
         Create <manifest>
-        @param entries: List of (URL, mime-type)
+        @param entries: List of (path, mime-type)
+        @param base_path: It is used to convert each path into a path relative to itself
         @type entries: list of 2-tuples
         '''
-        doc = dom.parseString(self.soup.__str__('UTF-8').strip())
-        package = doc.documentElement
-        manifest = doc.createElement('manifest')
-        package.appendChild(manifest)
-        package.appendChild(doc.createTextNode('\n'))
-
-        self.href_map = {}
-
-        for href, media_type in entries:
-            item = doc.createElement('item')
-            item.setAttribute('href', href)
-            item.setAttribute('media-type', media_type)
-            self.href_map[href] = str(hash(href))
-            item.setAttribute('id', self.href_map[href])
-            manifest.appendChild(item)
-            manifest.appendChild(doc.createTextNode('\n'))
-
-        self._commit(doc)
+        rentries = []
+        base_path = self.base_path
+        mimetypes.init()
+        for href, mt in entries:
+            href = os.path.abspath(href)
+            if not href.startswith(base_path):
+                raise ValueError('OPF should only refer to files below it. %s is above %s'%(href, base_path))
+            href = href[len(base_path)+1:].replace(os.sep, '/')
+            if not mt:
+                mt = mimetypes.guess_type(href)[0]
+                if not mt:
+                    mt = ''
+            rentries.append((href, mt))
+
+        self.manifest = rentries
+
+    def create_manifest_from_files_in(self, files_and_dirs):
+        entries = []
+
+        def dodir(dir):
+            for root, dirs, files in os.walk(dir):
+                for name in files:
+                    path = os.path.join(root, name)
+                    entries.append((path, None))
+
+        for i in files_and_dirs:
+            if os.path.isdir(i):
+                dodir(i)
+            else:
+                entries.append((i, None))
+
+        self.create_manifest(entries)

     def create_spine(self, entries):
         '''
@@ -626,19 +574,43 @@ class OPFCreator(OPF):
         @param: List of paths
         @type param: list of strings
         '''
-        doc = dom.parseString(self.soup.__str__('UTF-8').strip())
-        package = doc.documentElement
-        spine = doc.createElement('spine')
-        package.appendChild(spine)
-        package.appendChild(doc.createTextNode('\n'))
-
-        for href in entries:
-            itemref = doc.createElement('itemref')
-            itemref.setAttribute('idref', self.href_map[href])
-            spine.appendChild(itemref)
-            spine.appendChild(doc.createTextNode('\n'))
-
-        self._commit(doc)
+        self.spine = []
+
+        for path in entries:
+            if not os.path.isabs(path):
+                path = os.path.join(self.base_path, path)
+            if not path.startswith(self.base_path):
+                raise ValueError('Invalid entry %s for base path %s'%(path, self.base_path))
+            href = path[len(self.base_path)+1:]
+            in_manifest = False
+            for i, m in enumerate(self.manifest):
+                if m[0] == href:
+                    in_manifest = True
+                    break
+            if not in_manifest:
+                raise ValueError('%s is not in the manifest. (%s)'%(href, path))
+            self.spine.append(i)
+
+    def set_toc(self, toc):
+        '''
+        Set the toc. You must call L{create_spine} before calling this
+        method.
+        @param toc: A Table of Contents
+        @type toc: L{TOC}
+        '''
+        self.toc = toc
+
+    def render(self, opf_stream, ncx_stream=None):
+        from libprs500.resources import opf_template
+        from genshi.template import MarkupTemplate
+        template = MarkupTemplate(opf_template)
+        opf = template.generate(__appname__=__appname__, mi=self).render('xml')
+        opf_stream.write(opf)
+        toc = getattr(self, 'toc', None)
+        if toc is not None and ncx_stream is not None:
+            toc.render(ncx_stream, self.application_id)

 def option_parser():
     return get_parser('opf')
@@ -649,7 +621,7 @@ def main(args=sys.argv):
     if len(args) != 2:
         parser.print_help()
         return 1
-    mi = OPFReader(open(args[1], 'rb'))
+    mi = MetaInformation(OPFReader(open(args[1], 'rb')))
     if opts.title is not None:
         mi.title = opts.title.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
     if opts.authors is not None:
@@ -660,7 +632,8 @@ def main(args=sys.argv):
     if opts.comment is not None:
         mi.comments = opts.comment.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
     print mi
-    mi.write(open(args[1], 'wb'))
+    mo = OPFCreator(os.getcwd(), mi)
+    mo.render(open(args[1], 'wb'))
     return 0

 if __name__ == '__main__':
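
Note: a minimal sketch of the new OPFCreator workflow, assuming an output directory that already contains the files to be listed; the directory, title, and entry names here are illustrative, not from the commit:

    import os
    from libprs500.ebooks.metadata import MetaInformation
    from libprs500.ebooks.metadata.opf import OPFCreator
    from libprs500.ebooks.metadata.toc import TOC

    dir = '/tmp/book'                        # assumed: contains index.html
    mi = MetaInformation('My Book', ['An Author'])
    opf = OPFCreator(dir, mi)                # base_path relativizes manifest hrefs
    opf.create_manifest_from_files_in([dir]) # walk dir, guess mime types
    opf.create_spine(['index.html'])         # entries must already be in the manifest
    toc = TOC(base_path=dir)
    toc.add_item('index.html', None, 'Start')
    opf.set_toc(toc)                         # must come after create_spine
    opf.render(open(os.path.join(dir, 'index.opf'), 'wb'),
               open(os.path.join(dir, 'index.ncx'), 'wb'))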
src/libprs500/ebooks/metadata/opf.xml (new file, 36 lines)

@@ -0,0 +1,36 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<package version="2.0"
+    xmlns:opf="http://www.idpf.org/2007/opf"
+    xmlns:py="http://genshi.edgewall.org/"
+    unique-identifier="${__appname__}_id"
+
+>
+    <metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
+        <dc:title py:with="attrs={'files-as':mi.title_sort}" py:attrs="attrs">${mi.title}</dc:title>
+        <dc:creator opf:role="aut" py:for="i, author in enumerate(mi.authors)" py:with="attrs={'file-as':mi.author_sort if i==0 else None}" py:attrs="attrs">${author}</dc:creator>
+        <dc:identifier scheme="${__appname__}" id="${__appname__}_id">${mi.application_id}</dc:identifier>
+
+        <dc:type py:if="mi.category">${mi.category}</dc:type>
+        <dc:description py:if="mi.comments">${mi.comments}</dc:description>
+        <dc:publisher py:if="mi.publisher">${mi.publisher}</dc:publisher>
+        <dc:identifier opf:scheme="ISBN" py:if="mi.isbn">${mi.isbn}</dc:identifier>
+        <series py:if="mi.series">${mi.series}</series>
+        <series-index py:if="mi.series_index is not None">${mi.series_index}</series-index>
+        <rating py:if="mi.rating is not None">${mi.rating}</rating>
+        <dc:subject py:if="mi.tags is not None" py:for="tag in mi.tags">${tag}</dc:subject>
+    </metadata>
+
+    <guide>
+        <reference py:if="mi.cover" type="cover" href="${mi.cover}" />
+    </guide>
+
+    <manifest>
+        <py:for each="i, m in enumerate(mi.manifest)">
+        <item id="${str(i)}" href="${m[0]}" media-type="${m[1]}" />
+        </py:for>
+    </manifest>
+
+    <spine py:with="attrs={'toc':'ncx' if mi.toc else None}" py:attrs="attrs">
+        <itemref py:for="idref in mi.spine" idref="${str(idref)}" />
+    </spine>
+</package>
src/libprs500/ebooks/metadata/toc.py (new file, 154 lines)

@@ -0,0 +1,154 @@
+#!/usr/bin/env python
+## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2 of the License, or
+## (at your option) any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License along
+## with this program; if not, write to the Free Software Foundation, Inc.,
+## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+import os, glob
+from urlparse import urlparse
+from urllib import unquote
+
+from libprs500 import __appname__
+from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup, BeautifulSoup
+
+class NCXSoup(BeautifulStoneSoup):
+
+    NESTABLE_TAGS = {'navpoint':[]}
+
+    def __init__(self, raw):
+        BeautifulStoneSoup.__init__(self, raw,
+                                    convertEntities=BeautifulSoup.HTML_ENTITIES,
+                                    selfClosingTags=['meta', 'content'])
+
+class TOC(list):
+
+    def __init__(self, href=None, fragment=None, text=None, parent=None, play_order=1,
+                 base_path=os.getcwd()):
+        self.href = href
+        self.fragment = fragment
+        self.text = text
+        self.parent = parent
+        self.base_path = base_path
+        self.play_order = play_order
+
+    def add_item(self, href, fragment, text):
+        self.append(TOC(href=href, fragment=fragment, text=text, parent=self, base_path=self.base_path))
+        return self[-1]
+
+    def top_level_items(self):
+        for item in self:
+            if item.text is not None:
+                yield item
+
+    def depth(self):
+        depth = 1
+        for obj in self:
+            c = obj.depth()
+            if c > depth - 1:
+                depth = c + 1
+        return depth
+
+    @apply
+    def abspath():
+        doc='Return the file this toc entry points to as a absolute path to a file on the system.'
+        def fget(self):
+            path = self.href.replace('/', os.sep)
+            if not os.path.isabs(path):
+                path = os.path.join(self.base_path, path)
+            return path
+        return property(fget=fget, doc=doc)
+
+    def read_from_opf(self, opfreader):
+        toc = opfreader.soup.find('spine', toc=True)
+        if toc is not None:
+            toc = toc['toc']
+        if toc is None:
+            try:
+                toc = opfreader.soup.find('guide').find('reference', attrs={'type':'toc'})['href']
+            except:
+                for item in opfreader.manifest:
+                    if 'toc' in item.href.lower():
+                        toc = item.href
+                        break
+
+        if toc is not None:
+            if toc.lower() != 'ncx':
+                toc = urlparse(unquote(toc))[2]
+                toc = toc.replace('/', os.sep)
+                if not os.path.isabs(toc):
+                    toc = os.path.join(self.base_path, toc)
+                try:
+                    if not os.path.exists(toc):
+                        bn = os.path.basename(toc)
+                        bn = bn.replace('_top.htm', '_toc.htm') # Bug in BAEN OPF files
+                        toc = os.path.join(os.path.dirname(toc), bn)
+
+                    self.read_html_toc(toc, self.base_path)
+                except:
+                    pass
+            else:
+                cwd = os.path.abspath(self.base_path)
+                m = glob.glob(os.path.join(cwd, '*.ncx'))
+                if m:
+                    toc = m[0]
+                    self.read_ncx_toc(toc)
+
+    def read_ncx_toc(self, toc):
+        self.base_path = os.path.dirname(toc)
+        soup = NCXSoup(open(toc, 'rb').read())
+
+        def process_navpoint(np, dest):
+            play_order = np.get('playOrder', 1)
+            href = fragment = text = None
+            nl = np.find('navlabel')
+            if nl is not None:
+                text = u''
+                for txt in nl.findAll('text'):
+                    text += ''.join([unicode(s) for s in txt.findAll(text=True)])
+            content = elem.find('content')
+            if content is None or not content.has_key('src') or not txt:
+                return
+
+            purl = urlparse(unquote(content['src']))
+            href, fragment = purl[2], purl[5]
+            nd = dest.add_item(href, fragment, text)
+            nd.play_order = play_order
+
+            for c in np:
+                if getattr(c, 'name', None) == 'navpoint':
+                    process_navpoint(c, nd)
+
+        nm = soup.find('navmap')
+        for elem in nm:
+            if getattr(elem, 'name', None) == 'navpoint':
+                process_navpoint(elem, self)
+
+    def read_html_toc(self, toc):
+        self.base_path = os.path.dirname(toc)
+        soup = BeautifulSoup(open(toc, 'rb').read(), convertEntities=BeautifulSoup.HTML_ENTITIES)
+        for a in soup.findAll('a'):
+            if not a.has_key('href'):
+                continue
+            purl = urlparse(unquote(a['href']))
+            href, fragment = purl[2], purl[5]
+            txt = ''.join([unicode(s).strip() for s in a.findAll(text=True)])
+            self.add_item(href, fragment, txt)
+
+    def render(self, stream, uid):
+        from libprs500.resources import ncx_template
+        from genshi.template import MarkupTemplate
+        doctype = ('ncx', "-//NISO//DTD ncx 2005-1//EN", "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd")
+        template = MarkupTemplate(ncx_template)
+        raw = template.generate(uid=uid, toc=self, __appname__=__appname__)
+        raw = raw.render(doctype=doctype)
+        stream.write(raw)
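
Note: TOC subclasses list, so nested entries are built by calling add_item on the child it returns; a small sketch, with illustrative paths:

    from libprs500.ebooks.metadata.toc import TOC

    toc = TOC(base_path='/tmp/book')
    feed = toc.add_item('feed_0/index.html', None, 'World News')      # top level
    feed.add_item('feed_0/article_0/index.html', None, 'Article 1')   # nested child
    print toc.depth()  # 3: root -> feed -> article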
@@ -186,11 +186,11 @@ class MobiReader(object):

         if self.book_header.exth is not None:
             opf = self.create_opf(htmlfile)
-            opf.write(open(os.path.splitext(htmlfile)[0]+'.opf', 'wb'))
+            opf.render(open(os.path.splitext(htmlfile)[0]+'.opf', 'wb'))

     def create_opf(self, htmlfile):
         mi = self.book_header.exth.mi
-        opf = OPFCreator(mi)
+        opf = OPFCreator(os.path.dirname(htmlfile), mi)
         if hasattr(self.book_header.exth, 'cover_offset'):
             opf.cover = 'images/%05d.jpg'%(self.book_header.exth.cover_offset+1)
         manifest = [(os.path.basename(htmlfile), 'text/x-oeb1-document')]
@@ -333,4 +333,4 @@ def main(args=sys.argv):
     return 0

 if __name__ == '__main__':
-    sys.exit(main())
+    sys.exit(main())

@@ -1340,7 +1340,7 @@ ALTER TABLE books ADD COLUMN isbn TEXT DEFAULT "" COLLATE NOCASE;
         mi.rating = self.rating(idx, index_is_id=index_is_id)
         mi.isbn = self.isbn(idx, index_is_id=index_is_id)
         id = idx if index_is_id else self.id(idx)
-        mi.libprs_id = id
+        mi.application_id = id
         return mi

     def vacuum(self):
@@ -1382,7 +1382,7 @@ ALTER TABLE books ADD COLUMN isbn TEXT DEFAULT "" COLLATE NOCASE;
             name += '_'+id
         base = dir if single_dir else tpath

-        mi = OPFCreator(self.get_metadata(idx, index_is_id=index_is_id))
+        mi = OPFCreator(base, self.get_metadata(idx, index_is_id=index_is_id))
         cover = self.cover(idx, index_is_id=index_is_id)
         if cover is not None:
             cname = name + '.jpg'
@@ -1390,7 +1390,7 @@ ALTER TABLE books ADD COLUMN isbn TEXT DEFAULT "" COLLATE NOCASE;
             open(cpath, 'wb').write(cover)
             mi.cover = cname
         f = open(os.path.join(base, name+'.opf'), 'wb')
-        mi.write(f)
+        mi.render(f)
         f.close()

         for fmt in self.formats(idx, index_is_id=index_is_id).split(','):

@@ -44,6 +44,7 @@ entry_points = {
                 'rtf2lrf = libprs500.ebooks.lrf.rtf.convert_from:main',
                 'web2disk = libprs500.web.fetch.simple:main',
                 'feeds2disk = libprs500.web.feeds.main:main',
+                'feeds2lrf = libprs500.ebooks.lrf.feeds.convert_from:main',
                 'web2lrf = libprs500.ebooks.lrf.web.convert_from:main',
                 'pdf2lrf = libprs500.ebooks.lrf.pdf.convert_from:main',
                 'mobi2lrf = libprs500.ebooks.lrf.mobi.convert_from:main',

@@ -201,6 +201,7 @@ class ProgressBar:
                          self.term.BOL + self.term.UP + self.term.CLEAR_EOL +
                          (self.bar % (100*percent, '='*n, '-'*(self.width-10-n))) +
                          self.term.CLEAR_EOL + msg)
+        sys.stdout.flush()

     def clear(self):
         if not self.cleared:

@@ -17,12 +17,13 @@
 The backend to parse feeds and create HTML that can then be converted
 to an ebook.
 '''
-import logging, os, cStringIO, time, itertools, traceback
+import logging, os, cStringIO, time, traceback
 import urlparse

 from libprs500 import browser, __appname__
 from libprs500.ebooks.BeautifulSoup import BeautifulSoup
 from libprs500.ebooks.metadata.opf import OPFCreator
+from libprs500.ebooks.metadata.toc import TOC
 from libprs500.ebooks.metadata import MetaInformation
 from libprs500.web.feeds import feed_from_xml, templates
 from libprs500.web.fetch.simple import option_parser as web2disk_option_parser
@@ -94,6 +95,9 @@ class BasicNewsRecipe(object):
     #: using cp1252. If None, try to detect the encoding.
     encoding = None

+    #: Specify any extra CSS that should be addded to downloaded HTML files
+    extra_css = None
+
     #: List of regular expressions that determines which links to follow
     #: If empty, it is ignored.
     #: Only one of L{match_regexps} or L{filter_regexps} should be defined
@@ -276,8 +280,9 @@ class BasicNewsRecipe(object):

         self.web2disk_options = web2disk_option_parser().parse_args(web2disk_cmdline)[0]
         for extra in ('keep_only_tags', 'remove_tags', 'preprocess_regexps',
-                      'preprocess_html', 'remove_tags_after', 'postprocess_html'):
+                      'preprocess_html', 'remove_tags_after'):
             setattr(self.web2disk_options, extra, getattr(self, extra))
+        self.web2disk_options.postprocess_html = [self._postprocess_html, self.postprocess_html]

         if self.delay > 0:
             self.simultaneous_downloads = 1
@@ -288,6 +293,14 @@ class BasicNewsRecipe(object):
         self.failed_downloads = []
         self.partial_failures = []

+    def _postprocess_html(self, soup):
+        if self.extra_css is not None:
+            head = soup.find('head')
+            if head:
+                style = BeautifulSoup(u'<style type="text/css">%s</style>'%self.extra_css).find('style')
+                head.insert(len(head.contents), style)
+        return soup
+
     def download(self):
         '''
         Download and pre-process all articles from the feeds in this recipe.
@@ -297,6 +310,7 @@ class BasicNewsRecipe(object):
         @rtype: string
         '''
         self.report_progress(0, _('Trying to download cover...'))
+        self.download_cover()
         res = self.build_index()
         self.cleanup()
@@ -362,7 +376,7 @@ class BasicNewsRecipe(object):
             fetcher.current_dir = dir
             fetcher.show_progress = False
             res, path, failures = fetcher.start_fetch(url), fetcher.downloaded_paths, fetcher.failed_links
-            if not res:
+            if not res or not os.path.exists(res):
                 raise Exception(_('Could not fetch article. Run with --debug to see the reason'))
             return res, path, failures

@@ -446,28 +460,44 @@ class BasicNewsRecipe(object):
         if dir is None:
             dir = self.output_dir
         mi = MetaInformation(self.title + time.strftime(self.timefmt), [__appname__])
-        opf = OPFCreator(mi)
         opf_path = os.path.join(dir, 'index.opf')
+        ncx_path = os.path.join(dir, 'index.ncx')
+        opf = OPFCreator(dir, mi)
+
+        manifest = ['feed_%d'%i for i in range(len(feeds))]
+        manifest.append('index.html')
+        cpath = getattr(self, 'cover_path', None)
+        if cpath is not None and os.access(cpath, os.R_OK):
+            opf.cover = cpath
+            manifest.append(cpath)
+        opf.create_manifest_from_files_in(manifest)

         entries = ['index.html']
+        toc = TOC(base_path=dir)
         for i, f in enumerate(feeds):
             entries.append('feed_%d/index.html'%i)
+            feed = toc.add_item('feed_%d/index.html'%i, None, f.title)
             for j, a in enumerate(f):
                 if getattr(a, 'downloaded', False):
                     adir = 'feed_%d/article_%d/'%(i, j)
                     entries.append('%sindex.html'%adir)
+                    feed.add_item('%sindex.html'%adir, None, a.title if a.title else 'Untitled article')
                     for sp in a.sub_pages:
                         prefix = os.path.commonprefix([opf_path, sp])
                         relp = sp[len(prefix):]
                         entries.append(relp.replace(os.sep, '/'))

-        opf.create_manifest(itertools.izip(entries, itertools.repeat('text/html')))
         opf.create_spine(entries)
-        opf.write(open(opf_path, 'wb'))
+        opf.set_toc(toc)
+
+        opf.render(open(opf_path, 'wb'), open(ncx_path, 'wb'))
@@ -516,7 +546,7 @@ class BasicNewsRecipe(object):
             title, url = None, obj
         else:
             title, url = obj
-        self.report_progress(0, _('Fetching feed %s...'%(title if title else url)))
+        self.report_progress(0, _('Fetching feed')+' %s...'%(title if title else url))
         parsed_feeds.append(feed_from_xml(self.browser.open(url).read(),
                                           title=title,
                                           oldest_article=self.oldest_article,

@@ -33,15 +33,15 @@ class Newsweek(BasicNewsRecipe):
             ('National News', 'http://feeds.newsweek.com/newsweek/NationalNews'),
             ('World News', 'http://feeds.newsweek.com/newsweek/WorldNews'),
-            'http://feeds.newsweek.com/newsweek/Columnists/ChristopherDickey',
             'http://feeds.newsweek.com/newsweek/Columnists/FareedZakaria',
             ('Iraq', 'http://feeds.newsweek.com/newsweek/iraq'),
             ('Society', 'http://feeds.newsweek.com/newsweek/society'),
             ('Entertainment', 'http://feeds.newsweek.com/newsweek/entertainment'),
             'http://feeds.newsweek.com/newsweek/columnists/GeorgeFWill',
+            'http://feeds.newsweek.com/newsweek/columnists/AnnaQuindlen',
            ]

-    extra_css = '#content { font:serif,120%; }'
+    extra_css = '#content { font:serif 1.2em; }'
     keep_only_tags = [dict(name='div', id='content')]

     remove_tags = [
@@ -55,8 +55,8 @@ class Newsweek(BasicNewsRecipe):
     match_regexps = [r'http://www.newsweek.com/id/\S+/page/\d+']

     # For testing
-    #feeds = feeds[:2]
-    #max_articles_per_feed = 1
+    #feeds = feeds[3:5]
+    #max_articles_per_feed = 2

@@ -91,4 +91,4 @@ class Newsweek(BasicNewsRecipe):
         img = soup.find(alt='Cover')
         if img is not None and img.has_key('src'):
             small = img['src']
-        return small.replace('coversmall', 'coverlarge')
+        return small.replace('coversmall', 'coverlarge')

@@ -57,16 +57,17 @@ class NavBarTemplate(Template):
         <body>
             <div class="navbar" style="text-align:center">
                 <hr py:if="bottom" />
-                <a href="../index.html#article_${str(art)}">Up one level</a>
-                <py:if test="art != num - 1">
-                    | <a href="../article_${str(art+1)}/index.html">Next</a>
-                </py:if>
+                | <a href="../index.html#article_${str(art)}">Up one level</a>
+                <py:if test="two_levels">
+                    | <a href="../../index.html#_${str(feed)}">Up two levels</a>
+                </py:if>
                 <py:if test="art != 0">
                     | <a href="../article_${str(art-1)}/index.html">Previous</a>
                 </py:if>
+                <py:if test="art != num - 1">
+                    | <a href="../article_${str(art+1)}/index.html">Next</a>
+                </py:if>
                 <hr py:if="not bottom" />
             </div>
         </body>
@@ -159,4 +160,4 @@ class FeedTemplate(Template):
         ''')

     def generate(self, feed):
-        return Template.generate(self, feed=feed)
+        return Template.generate(self, feed=feed)

@@ -38,9 +38,9 @@ def basename(url):

 def save_soup(soup, target):
     nm = Tag(soup, '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
-    for meta in soup.find('meta', content=True):
-        if 'charset' in meta['content']:
-            meta.replaceWith(nm)
+    meta = soup.find('meta', content=True)
+    if meta and 'charset' in meta['content']:
+        meta.replaceWith(nm)
     f = codecs.open(target, 'w', 'utf-8')
     f.write(unicode(soup))
     f.close()
@@ -85,7 +85,7 @@ class RecursiveFetcher(object):
         self.remove_tags_after = getattr(options, 'remove_tags_after', None)
         self.keep_only_tags = getattr(options, 'keep_only_tags', [])
         self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup)
-        self.postprocess_html_ext= getattr(options, 'postprocess_html', lambda soup: soup)
+        self.postprocess_html_ext= getattr(options, 'postprocess_html', [])
         self.download_stylesheets = not options.no_stylesheets
         self.show_progress = True
         self.failed_links = []
@@ -336,7 +336,9 @@ class RecursiveFetcher(object):
                     self.process_return_links(soup, iurl)
                     self.logger.debug('Recursion limit reached. Skipping links in %s', iurl)

-                save_soup(self.postprocess_html_ext(soup), res)
+                for func in self.postprocess_html_ext:
+                    soup = func(soup)
+                save_soup(soup, res)

                 self.localize_link(tag, 'href', res)
             except Exception, err:
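
Note: with this change postprocess_html is a list of callables rather than a single function, applied in order to each downloaded page; BasicNewsRecipe uses it to chain its internal _postprocess_html (extra_css injection) with the recipe's own hook. A minimal sketch of a custom hook; the selector is illustrative, not from the commit:

    def remove_ads(soup):
        # drop elements a site injects into every article
        for tag in soup.findAll('div', attrs={'class': 'advert'}):
            tag.extract()
        return soup

    # RecursiveFetcher then applies each hook in turn:
    #     for func in self.postprocess_html_ext:
    #         soup = func(soup)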