Refactored OPF creation code. Implemented Table of Contents support in feeds2disk.

This commit is contained in:
Kovid Goyal 2008-03-14 19:25:48 +00:00
parent 748c184ccb
commit 6982652f92
18 changed files with 482 additions and 209 deletions

View File

@ -1,6 +1,6 @@
PYTHON = python PYTHON = python
all : gui2 translations all : gui2 translations resources
clean : clean :
cd src/libprs500/gui2 && ${PYTHON} make.py clean cd src/libprs500/gui2 && ${PYTHON} make.py clean
@ -14,3 +14,7 @@ test : gui2
translations : translations :
cd src/libprs500 && ${PYTHON} translations/__init__.py cd src/libprs500 && ${PYTHON} translations/__init__.py
resources:
${PYTHON} resources.py

39
resources.py Normal file
View File

@ -0,0 +1,39 @@
#!/usr/bin/env python
## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''
Compile resource files.
'''
import os, sys
sys.path.insert(1, os.path.join(os.getcwd(), 'src'))
from libprs500 import __appname__
# Map of resource name -> source template path.  The '%p' placeholder is
# expanded by main() below to the package source directory (src/<appname>).
RESOURCES = dict(
    opf_template = '%p/ebooks/metadata/opf.xml',
    ncx_template = '%p/ebooks/metadata/ncx.xml',
)
def main(args=sys.argv):
    '''
    Compile every file listed in RESOURCES into a single python module
    written to src/<appname>/resources.py, one ``name = <repr of bytes>``
    assignment per resource.
    '''
    data = ''
    for key, value in RESOURCES.items():
        path = value.replace('%p', 'src'+os.sep+__appname__)
        f = open(path, 'rb')
        try:
            # repr() of the raw bytes yields a valid python literal
            raw = repr(f.read())
        finally:
            f.close()
        data += key + ' = ' + raw + '\n\n'
    # os.path.join instead of manual concatenation: the original built
    # 'src' + os.sep + appname + os.sep + '/resources.py', which contains
    # a stray '/' (double separator on POSIX, broken path on Windows).
    dest = os.path.join('src', __appname__, 'resources.py')
    out = open(dest, 'wb')
    try:
        out.write(data)
    finally:
        out.close()
    return 0

if __name__ == '__main__':
    sys.exit(main())

View File

@ -60,6 +60,8 @@ def update_css(ncss, ocss):
def munge_paths(basepath, url): def munge_paths(basepath, url):
purl = urlparse(unquote(url),) purl = urlparse(unquote(url),)
path, fragment = purl[2], purl[5] path, fragment = purl[2], purl[5]
if path:
path = path.replace('/', os.sep)
if not path: if not path:
path = basepath path = basepath
elif not os.path.isabs(path): elif not os.path.isabs(path):
@ -223,7 +225,6 @@ class HTMLConverter(object):
self.extra_toc_entries = [] #: TOC entries gleaned from semantic information self.extra_toc_entries = [] #: TOC entries gleaned from semantic information
self.image_memory = [] self.image_memory = []
self.id_counter = 0 self.id_counter = 0
self.toc_from_metadata = False #: If True means that the toc has been populated from metadata
self.unused_target_blocks = [] #: Used to remove extra TextBlocks self.unused_target_blocks = [] #: Used to remove extra TextBlocks
self.link_level = 0 #: Current link level self.link_level = 0 #: Current link level
self.memory = [] #: Used to ensure that duplicate CSS unhandled errors are not reported self.memory = [] #: Used to ensure that duplicate CSS unhandled errors are not reported
@ -543,7 +544,7 @@ class HTMLConverter(object):
path, fragment = munge_paths(self.target_prefix, tag['href']) path, fragment = munge_paths(self.target_prefix, tag['href'])
return {'para':para, 'text':text, 'path':os.path.abspath(path), return {'para':para, 'text':text, 'path':os.path.abspath(path),
'fragment':fragment, 'in toc': (self.link_level == 0 and not self.toc_from_metadata)} 'fragment':fragment, 'in toc': (self.link_level == 0 and not self.use_spine)}
def get_text(self, tag, limit=None): def get_text(self, tag, limit=None):
@ -637,13 +638,12 @@ class HTMLConverter(object):
return outside_links return outside_links
def create_toc(self, toc): def create_toc(self, toc):
for (path, fragment, txt) in toc: for item in toc.top_level_items():
ascii_text = txt.encode('ascii', 'ignore') # Bug in SONY LRF renderer ascii_text = item.text.encode('ascii', 'ignore') # Bug in SONY LRF renderer
self.toc_from_metadata = True if not item.fragment and item.abspath in self.tops:
if not fragment and path in self.tops: self.book.addTocEntry(ascii_text, self.tops[item.abspath])
self.book.addTocEntry(ascii_text, self.tops[path])
else: else:
url = path+fragment url = item.abspath+item.fragment
if url in self.targets: if url in self.targets:
self.book.addTocEntry(ascii_text, self.targets[url]) self.book.addTocEntry(ascii_text, self.targets[url])
@ -1846,6 +1846,7 @@ def try_opf(path, options, logger):
options.cover = None options.cover = None
cover = opf.cover cover = opf.cover
if cover: if cover:
cover = cover.replace('/', os.sep)
if not os.path.isabs(cover): if not os.path.isabs(cover):
cover = os.path.join(dirpath, cover) cover = os.path.join(dirpath, cover)
if os.access(cover, os.R_OK): if os.access(cover, os.R_OK):

View File

@ -65,7 +65,7 @@ class LRFConverter(object):
def create_metadata(self): def create_metadata(self):
self.logger.info('Reading metadata...') self.logger.info('Reading metadata...')
mi = get_metadata(self.lrf) mi = get_metadata(self.lrf)
self.opf = OPFCreator(mi) self.opf = OPFCreator(self.output_dir, mi)
def create_page_styles(self): def create_page_styles(self):
self.page_css = '' self.page_css = ''

View File

@ -45,12 +45,13 @@ class MetaInformation(object):
ans = MetaInformation(mi.title, mi.authors) ans = MetaInformation(mi.title, mi.authors)
for attr in ('author_sort', 'title_sort', 'comments', 'category', for attr in ('author_sort', 'title_sort', 'comments', 'category',
'publisher', 'series', 'series_index', 'rating', 'publisher', 'series', 'series_index', 'rating',
'isbn', 'tags', 'cover_data', 'libprs_id'): 'isbn', 'tags', 'cover_data', 'application_id',
'manifest', 'spine', 'toc', 'cover'):
if hasattr(mi, attr): if hasattr(mi, attr):
setattr(ans, attr, getattr(mi, attr)) setattr(ans, attr, getattr(mi, attr))
def __init__(self, title, authors): def __init__(self, title, authors=['Unknown']):
''' '''
@param title: title or "Unknown" or a MetaInformation object @param title: title or "Unknown" or a MetaInformation object
@param authors: List of strings or [] @param authors: List of strings or []
@ -76,8 +77,11 @@ class MetaInformation(object):
self.isbn = None if not mi else mi.isbn self.isbn = None if not mi else mi.isbn
self.tags = [] if not mi else mi.tags self.tags = [] if not mi else mi.tags
self.cover_data = mi.cover_data if (mi and hasattr(mi, 'cover_data')) else (None, None) self.cover_data = mi.cover_data if (mi and hasattr(mi, 'cover_data')) else (None, None)
self.libprs_id = mi.libprs_id if (mi and hasattr(mi, 'libprs_id')) else None self.application_id = mi.application_id if (mi and hasattr(mi, 'application_id')) else None
self.manifest = getattr(mi, 'manifest', None)
self.toc = getattr(mi, 'toc', None)
self.spine = getattr(mi, 'spine', None)
self.cover = getattr(mi, 'cover', None)
def smart_update(self, mi): def smart_update(self, mi):
''' '''
@ -92,7 +96,7 @@ class MetaInformation(object):
for attr in ('author_sort', 'title_sort', 'comments', 'category', for attr in ('author_sort', 'title_sort', 'comments', 'category',
'publisher', 'series', 'series_index', 'rating', 'publisher', 'series', 'series_index', 'rating',
'isbn', 'libprs_id'): 'isbn', 'application_id', 'manifest', 'spine', 'toc', 'cover'):
if hasattr(mi, attr): if hasattr(mi, attr):
val = getattr(mi, attr) val = getattr(mi, attr)
if val is not None: if val is not None:

View File

@ -51,7 +51,7 @@ def metadata_from_formats(formats):
ext = path_to_ext(path) ext = path_to_ext(path)
stream = open(path, 'rb') stream = open(path, 'rb')
mi.smart_update(get_metadata(stream, stream_type=ext, use_libprs_metadata=True)) mi.smart_update(get_metadata(stream, stream_type=ext, use_libprs_metadata=True))
if getattr(mi, 'libprs_id', None) is not None: if getattr(mi, 'application_id', None) is not None:
return mi return mi
return mi return mi
@ -69,7 +69,7 @@ def get_metadata(stream, stream_type='lrf', use_libprs_metadata=False):
if os.access(c, os.R_OK): if os.access(c, os.R_OK):
opf = opf_metadata(os.path.abspath(c)) opf = opf_metadata(os.path.abspath(c))
if use_libprs_metadata and getattr(opf, 'libprs_id', None) is not None: if use_libprs_metadata and getattr(opf, 'application_id', None) is not None:
return opf return opf
try: try:
@ -147,7 +147,7 @@ def opf_metadata(opfpath):
f = open(opfpath, 'rb') f = open(opfpath, 'rb')
opf = OPFReader(f, os.path.dirname(opfpath)) opf = OPFReader(f, os.path.dirname(opfpath))
try: try:
if opf.libprs_id is not None: if opf.application_id is not None:
mi = MetaInformation(opf, None) mi = MetaInformation(opf, None)
if hasattr(opf, 'cover') and opf.cover: if hasattr(opf, 'cover') and opf.cover:
cpath = os.path.join(os.path.dirname(opfpath), opf.cover) cpath = os.path.join(os.path.dirname(opfpath), opf.cover)

View File

@ -0,0 +1,27 @@
<ncx version="2005-1"
xml:lang="en"
xmlns="http://www.daisy.org/z3986/2005/ncx/"
xmlns:py="http://genshi.edgewall.org/"
>
<head>
<meta name="dtb:uid" content="${uid}"/>
<meta name="dtb:depth" content="${toc.depth()}"/>
<meta name="dtb:generator" content="${__appname__}"/>
<meta name="dtb:totalPageCount" content="0"/>
<meta name="dtb:maxPageNumber" content="0"/>
</head>
<docTitle><text>Table of Contents</text></docTitle>
<py:def function="navpoint(np, level)">
${'%*s'%(4*level,'')}<navPoint playOrder="${str(np.play_order)}">
${'%*s'%(4*level,'')}<navLabel>
${'%*s'%(4*level,'')}<text>${np.text}</text>
${'%*s'%(4*level,'')}</navLabel>
${'%*s'%(4*level,'')}<content src="${str(np.href)+(('#' + str(np.fragment)) if np.fragment else '')}" />
<py:for each="np2 in np">${navpoint(np2, level+1)}</py:for>
${'%*s'%(4*level,'')}</navPoint>
</py:def>
<navMap>
<py:for each="np in toc">${navpoint(np, 0)}</py:for>
</navMap>
</ncx>

View File

@ -12,18 +12,21 @@
## You should have received a copy of the GNU General Public License along ## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc., ## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
import uuid
'''Read/Write metadata from Open Packaging Format (.opf) files.''' '''Read/Write metadata from Open Packaging Format (.opf) files.'''
import sys, re, os, glob import sys, re, os, mimetypes
from urllib import unquote from urllib import unquote
from urlparse import urlparse from urlparse import urlparse
import xml.dom.minidom as dom import xml.dom.minidom as dom
from itertools import repeat from itertools import repeat
from libprs500 import __appname__
from libprs500.ebooks.metadata import MetaInformation from libprs500.ebooks.metadata import MetaInformation
from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup, BeautifulSoup from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup
from libprs500.ebooks.lrf import entity_to_unicode from libprs500.ebooks.lrf import entity_to_unicode
from libprs500.ebooks.metadata import get_parser from libprs500.ebooks.metadata import get_parser
from libprs500.ebooks.metadata.toc import TOC
class ManifestItem(object): class ManifestItem(object):
def __init__(self, item, cwd): def __init__(self, item, cwd):
@ -41,6 +44,14 @@ class ManifestItem(object):
def __unicode__(self): def __unicode__(self):
return u'<item id="%s" href="%s" media-type="%s" />'%(self.id, self.href, self.media_type) return u'<item id="%s" href="%s" media-type="%s" />'%(self.id, self.href, self.media_type)
def __getitem__(self, index):
if index == 0:
return self.href
if index == 1:
return self.media_type
raise IndexError('%d out of bounds.'%index)
class Manifest(list): class Manifest(list):
def __init__(self, soup, dir): def __init__(self, soup, dir):
@ -82,84 +93,10 @@ class Spine(object):
for i in self.linear_ids + self.nonlinear_ids: for i in self.linear_ids + self.nonlinear_ids:
yield self.manifest.item(i) yield self.manifest.item(i)
class TOC(list): def __iter__(self):
for i in self.linear_ids + self.nonlinear_ids:
yield i
def __init__(self, opfreader, cwd):
self.toc = None
toc = opfreader.soup.find('spine', toc=True)
if toc is not None:
toc = toc['toc']
if toc is None:
try:
toc = opfreader.soup.find('guide').find('reference', attrs={'type':'toc'})['href']
except:
for item in opfreader.manifest:
if 'toc' in item.href.lower():
toc = item.href
break
if toc is not None:
if toc.lower() != 'ncx':
toc = urlparse(unquote(toc))[2]
if not os.path.isabs(toc):
toc = os.path.join(cwd, toc)
try:
if not os.path.exists(toc):
bn = os.path.basename(toc)
bn = bn.replace('_top.htm', '_toc.htm') # Bug in BAEN OPF files
toc = os.path.join(os.path.dirname(toc), bn)
self.read_html_toc(toc, cwd)
self.toc = toc
except:
pass
else:
cwd = os.path.abspath(cwd)
m = glob.glob(os.path.join(cwd, '*.ncx'))
if m:
toc = m[0]
try:
self.read_ncx_toc(toc)
self.toc = toc
except:
raise
pass
def read_ncx_toc(self, toc):
bdir = os.path.dirname(toc)
soup = BeautifulStoneSoup(open(toc, 'rb').read(),
convertEntities=BeautifulSoup.HTML_ENTITIES)
elems = soup.findAll('navpoint')
elems.sort(cmp=lambda x, y: cmp(int(x['playorder']), int(y['playorder'])))
for elem in elems:
txt = u''
for nl in elem.findAll('navlabel'):
for text in nl.findAll('text'):
txt += ''.join([unicode(s) for s in text.findAll(text=True)])
content = elem.find('content')
if content is None or not content.has_key('src') or not txt:
continue
purl = urlparse(unquote(content['src']))
href, fragment = purl[2], purl[5]
if not os.path.isabs(href):
href = os.path.join(bdir, href)
self.append((href, fragment, txt))
def read_html_toc(self, toc, cwd):
soup = BeautifulSoup(open(toc, 'rb').read(), convertEntities=BeautifulSoup.HTML_ENTITIES)
for a in soup.findAll('a'):
if not a.has_key('href'):
continue
purl = urlparse(unquote(a['href']))
href, fragment = purl[2], purl[5]
if not os.path.isabs(href):
href = os.path.join(cwd, href)
txt = ''.join([unicode(s).strip() for s in a.findAll(text=True)])
self.append((href, fragment, txt))
class standard_field(object): class standard_field(object):
@ -178,21 +115,21 @@ class OPF(MetaInformation):
MIMETYPE = 'application/oebps-package+xml' MIMETYPE = 'application/oebps-package+xml'
ENTITY_PATTERN = re.compile(r'&(\S+?);') ENTITY_PATTERN = re.compile(r'&(\S+?);')
uid = standard_field('uid') uid = standard_field('uid')
libprs_id = standard_field('libprs_id') application_id = standard_field('application_id')
title = standard_field('title') title = standard_field('title')
authors = standard_field('authors') authors = standard_field('authors')
title_sort = standard_field('title_sort') title_sort = standard_field('title_sort')
author_sort = standard_field('author_sort') author_sort = standard_field('author_sort')
comments = standard_field('comments') comments = standard_field('comments')
category = standard_field('category') category = standard_field('category')
publisher = standard_field('publisher') publisher = standard_field('publisher')
isbn = standard_field('isbn') isbn = standard_field('isbn')
cover = standard_field('cover') cover = standard_field('cover')
series = standard_field('series') series = standard_field('series')
series_index = standard_field('series_index') series_index = standard_field('series_index')
rating = standard_field('rating') rating = standard_field('rating')
tags = standard_field('tags') tags = standard_field('tags')
HEADER = '''\ HEADER = '''\
<?xml version="1.0" encoding="UTF-8"?> <?xml version="1.0" encoding="UTF-8"?>
@ -207,14 +144,14 @@ class OPF(MetaInformation):
if not hasattr(self, 'soup'): if not hasattr(self, 'soup'):
self.soup = BeautifulStoneSoup(u'''\ self.soup = BeautifulStoneSoup(u'''\
%s %s
<package unique-identifier="libprs_id"> <package unique-identifier="%s_id">
<metadata> <metadata>
<dc-metadata <dc-metadata
xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:oebpackage="http://openebook.org/namespaces/oeb-package/1.0/" /> xmlns:oebpackage="http://openebook.org/namespaces/oeb-package/1.0/" />
</metadata> </metadata>
</package> </package>
'''%self.HEADER) '''%(__appname__, self.HEADER))
def _commit(self, doc): def _commit(self, doc):
self.soup = BeautifulStoneSoup(doc.toxml('utf-8'), fromEncoding='utf-8') self.soup = BeautifulStoneSoup(doc.toxml('utf-8'), fromEncoding='utf-8')
@ -403,15 +340,15 @@ class OPF(MetaInformation):
self._set_metadata_element('dc:identifier', isbn, [('scheme', 'ISBN')], self._set_metadata_element('dc:identifier', isbn, [('scheme', 'ISBN')],
replace=True) replace=True)
def get_libprs_id(self): def get_application_id(self):
for item in self.soup.package.metadata.findAll('dc:identifier'): for item in self.soup.package.metadata.findAll('dc:identifier'):
if item.has_key('scheme') and item['scheme'] == 'libprs': if item.has_key('scheme') and item['scheme'] == __appname__:
return str(item.string).strip() return str(item.string).strip()
return None return None
def set_libprs_id(self, val): def set_application_id(self, val):
if val: if val:
self._set_metadata_element('dc:identifier', str(val), [('scheme', 'libprs'), ('id', 'libprs_id')], self._set_metadata_element('dc:identifier', str(val), [('scheme', __appname__), ('id', __appname__+'_id')],
replace=True) replace=True)
def get_cover(self): def get_cover(self):
@ -564,61 +501,72 @@ class OPFReader(OPF):
stream.close() stream.close()
self.manifest = Manifest(self.soup, dir) self.manifest = Manifest(self.soup, dir)
self.spine = Spine(self.soup, self.manifest) self.spine = Spine(self.soup, self.manifest)
self.toc = TOC(self, dir) self.toc = TOC()
self.toc.read_from_opf(self)
self.cover_data = (None, None) self.cover_data = (None, None)
class OPFCreator(OPF): class OPFCreator(MetaInformation):
def __init__(self, mi): def __init__(self, base_path, *args, **kwargs):
self.title = mi.title '''
self.authors = mi.authors Initialize.
if mi.category: @param base_path: An absolute path to the directory in which this OPF file
self.category = mi.category will eventually be. This is used by the L{create_manifest} method
if mi.comments: to convert paths to files into relative paths.
self.comments = mi.comments '''
if mi.publisher: MetaInformation.__init__(self, *args, **kwargs)
self.publisher = mi.publisher self.base_path = os.path.abspath(base_path)
if mi.rating: if self.application_id is None:
self.rating = mi.rating self.application_id = str(uuid.uuid4())
if mi.series: self.toc = None
self.series = mi.series if isinstance(self.manifest, Manifest):
if mi.series_index: manifest = []
self.series_index = mi.series_index for path, mt in self.manifest:
if mi.tags: if not path.startswith(self.base_path):
self.tags = mi.tags raise ValueError('Invalid manifest item %s for base path %s'%(path, self.base_path))
if mi.isbn: path = path[len(self.base_path)+1:]
self.isbn = mi.isbn manifest.append((path, mt))
self.cover_data = mi.cover_data self.manifest = manifest
if hasattr(mi, 'libprs_id'):
self.libprs_id = mi.libprs_id
if hasattr(mi, 'uid'):
self.uid = mi.uid
def create_manifest(self, entries): def create_manifest(self, entries):
''' '''
Create <manifest> Create <manifest>
@param entries: List of (URL, mime-type) @param entries: List of (path, mime-type)
@param base_path: It is used to convert each path into a path relative to itself
@type entries: list of 2-tuples @type entries: list of 2-tuples
''' '''
doc = dom.parseString(self.soup.__str__('UTF-8').strip()) rentries = []
package = doc.documentElement base_path = self.base_path
manifest = doc.createElement('manifest') mimetypes.init()
package.appendChild(manifest) for href, mt in entries:
package.appendChild(doc.createTextNode('\n')) href = os.path.abspath(href)
if not href.startswith(base_path):
raise ValueError('OPF should only refer to files below it. %s is above %s'%(href, base_path))
href = href[len(base_path)+1:].replace(os.sep, '/')
if not mt:
mt = mimetypes.guess_type(href)[0]
if not mt:
mt = ''
rentries.append((href, mt))
self.href_map = {} self.manifest = rentries
for href, media_type in entries: def create_manifest_from_files_in(self, files_and_dirs):
item = doc.createElement('item') entries = []
item.setAttribute('href', href)
item.setAttribute('media-type', media_type)
self.href_map[href] = str(hash(href))
item.setAttribute('id', self.href_map[href])
manifest.appendChild(item)
manifest.appendChild(doc.createTextNode('\n'))
self._commit(doc) def dodir(dir):
for root, dirs, files in os.walk(dir):
for name in files:
path = os.path.join(root, name)
entries.append((path, None))
for i in files_and_dirs:
if os.path.isdir(i):
dodir(i)
else:
entries.append((i, None))
self.create_manifest(entries)
def create_spine(self, entries): def create_spine(self, entries):
''' '''
@ -626,19 +574,43 @@ class OPFCreator(OPF):
@param: List of paths @param: List of paths
@type param: list of strings @type param: list of strings
''' '''
doc = dom.parseString(self.soup.__str__('UTF-8').strip()) self.spine = []
package = doc.documentElement
spine = doc.createElement('spine')
package.appendChild(spine)
package.appendChild(doc.createTextNode('\n'))
for href in entries: for path in entries:
itemref = doc.createElement('itemref') if not os.path.isabs(path):
itemref.setAttribute('idref', self.href_map[href]) path = os.path.join(self.base_path, path)
spine.appendChild(itemref) if not path.startswith(self.base_path):
spine.appendChild(doc.createTextNode('\n')) raise ValueError('Invalid entry %s for base path %s'%(path, self.base_path))
href = path[len(self.base_path)+1:]
in_manifest = False
for i, m in enumerate(self.manifest):
if m[0] == href:
in_manifest = True
break
if not in_manifest:
raise ValueError('%s is not in the manifest. (%s)'%(href, path))
self.spine.append(i)
self._commit(doc)
def set_toc(self, toc):
'''
Set the toc. You must call L{create_spine} before calling this
method.
@param toc: A Table of Contents
@type toc: L{TOC}
'''
self.toc = toc
def render(self, opf_stream, ncx_stream=None):
from libprs500.resources import opf_template
from genshi.template import MarkupTemplate
template = MarkupTemplate(opf_template)
opf = template.generate(__appname__=__appname__, mi=self).render('xml')
opf_stream.write(opf)
toc = getattr(self, 'toc', None)
if toc is not None and ncx_stream is not None:
toc.render(ncx_stream, self.application_id)
def option_parser(): def option_parser():
return get_parser('opf') return get_parser('opf')
@ -649,7 +621,7 @@ def main(args=sys.argv):
if len(args) != 2: if len(args) != 2:
parser.print_help() parser.print_help()
return 1 return 1
mi = OPFReader(open(args[1], 'rb')) mi = MetaInformation(OPFReader(open(args[1], 'rb')))
if opts.title is not None: if opts.title is not None:
mi.title = opts.title.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;') mi.title = opts.title.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
if opts.authors is not None: if opts.authors is not None:
@ -660,7 +632,8 @@ def main(args=sys.argv):
if opts.comment is not None: if opts.comment is not None:
mi.comments = opts.comment.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;') mi.comments = opts.comment.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
print mi print mi
mi.write(open(args[1], 'wb')) mo = OPFCreator(os.getcwd(), mi)
mo.render(open(args[1], 'wb'))
return 0 return 0
if __name__ == '__main__': if __name__ == '__main__':

View File

@ -0,0 +1,36 @@
<?xml version="1.0" encoding="UTF-8"?>
<package version="2.0"
xmlns:opf="http://www.idpf.org/2007/opf"
xmlns:py="http://genshi.edgewall.org/"
unique-identifier="${__appname__}_id"
>
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
<dc:title py:with="attrs={'file-as':mi.title_sort}" py:attrs="attrs">${mi.title}</dc:title>
<dc:creator opf:role="aut" py:for="i, author in enumerate(mi.authors)" py:with="attrs={'file-as':mi.author_sort if i==0 else None}" py:attrs="attrs">${author}</dc:creator>
<dc:identifier scheme="${__appname__}" id="${__appname__}_id">${mi.application_id}</dc:identifier>
<dc:type py:if="mi.category">${mi.category}</dc:type>
<dc:description py:if="mi.comments">${mi.comments}</dc:description>
<dc:publisher py:if="mi.publisher">${mi.publisher}</dc:publisher>
<dc:identifier opf:scheme="ISBN" py:if="mi.isbn">${mi.isbn}</dc:identifier>
<series py:if="mi.series">${mi.series}</series>
<series-index py:if="mi.series_index is not None">${mi.series_index}</series-index>
<rating py:if="mi.rating is not None">${mi.rating}</rating>
<dc:subject py:if="mi.tags is not None" py:for="tag in mi.tags">${tag}</dc:subject>
</metadata>
<guide>
<reference py:if="mi.cover" type="cover" href="${mi.cover}" />
</guide>
<manifest>
<py:for each="i, m in enumerate(mi.manifest)">
<item id="${str(i)}" href="${m[0]}" media-type="${m[1]}" />
</py:for>
</manifest>
<spine py:with="attrs={'toc':'ncx' if mi.toc else None}" py:attrs="attrs">
<itemref py:for="idref in mi.spine" idref="${str(idref)}" />
</spine>
</package>

View File

@ -0,0 +1,154 @@
#!/usr/bin/env python
## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
import os, glob
from urlparse import urlparse
from urllib import unquote
from libprs500 import __appname__
from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup, BeautifulSoup
class NCXSoup(BeautifulStoneSoup):
    '''
    BeautifulStoneSoup specialized for NCX documents: <navPoint> elements
    may nest arbitrarily, and <meta>/<content> are treated as self-closing.
    '''

    NESTABLE_TAGS = {'navpoint': []}

    def __init__(self, raw):
        BeautifulStoneSoup.__init__(
            self, raw,
            selfClosingTags=['meta', 'content'],
            convertEntities=BeautifulSoup.HTML_ENTITIES)
class TOC(list):

    '''
    A Table of Contents entry.  Each entry is itself a list of its child
    entries, so a TOC instance is simultaneously one node and the subtree
    rooted at that node.
    '''

    def __init__(self, href=None, fragment=None, text=None, parent=None,
                 play_order=1, base_path=None):
        '''
        @param href: Path (possibly relative to C{base_path}) of the file
                     this entry points to.
        @param fragment: Fragment (anchor) within C{href}, if any.
        @param text: Label of this entry.  Unlabelled entries are skipped
                     by L{top_level_items}.
        @param parent: Parent TOC entry, or None for the root.
        @param play_order: NCX play order of this entry.
        @param base_path: Directory against which relative hrefs are
                          resolved.  Defaults to the current working
                          directory at call time (the original default
                          C{os.getcwd()} was frozen at import time).
        '''
        self.href = href
        self.fragment = fragment
        self.text = text
        self.parent = parent
        self.base_path = os.getcwd() if base_path is None else base_path
        self.play_order = play_order

    def add_item(self, href, fragment, text):
        '''Append a child entry to this entry and return the new child.'''
        self.append(TOC(href=href, fragment=fragment, text=text, parent=self,
                        base_path=self.base_path))
        return self[-1]

    def top_level_items(self):
        '''Iterate over the direct children that have a label.'''
        for item in self:
            if item.text is not None:
                yield item

    def depth(self):
        '''Return the depth of the deepest branch of this (sub)tree, >= 1.'''
        return 1 + max([child.depth() for child in self] + [0])

    # Plain @property instead of the Python-2-only @apply/property() idiom.
    @property
    def abspath(self):
        '''Absolute filesystem path of the file this entry points to.'''
        path = self.href.replace('/', os.sep)
        if not os.path.isabs(path):
            path = os.path.join(self.base_path, path)
        return path

    def read_from_opf(self, opfreader):
        '''
        Populate this TOC from the OPF wrapped by C{opfreader}: first look
        for a spine toc attribute, then a guide reference of type "toc",
        then any manifest item with "toc" in its href; an href of "ncx"
        means look for a *.ncx file next to the OPF instead.
        '''
        toc = opfreader.soup.find('spine', toc=True)
        if toc is not None:
            toc = toc['toc']
        if toc is None:
            try:
                toc = opfreader.soup.find('guide').find('reference', attrs={'type':'toc'})['href']
            except:
                # Best effort: fall back to scanning the manifest
                for item in opfreader.manifest:
                    if 'toc' in item.href.lower():
                        toc = item.href
                        break
        if toc is not None:
            if toc.lower() != 'ncx':
                toc = urlparse(unquote(toc))[2]
                toc = toc.replace('/', os.sep)
                if not os.path.isabs(toc):
                    toc = os.path.join(self.base_path, toc)
                try:
                    if not os.path.exists(toc):
                        bn = os.path.basename(toc)
                        bn = bn.replace('_top.htm', '_toc.htm') # Bug in BAEN OPF files
                        toc = os.path.join(os.path.dirname(toc), bn)
                    # read_html_toc takes only the toc path; the original
                    # passed a spurious second argument (TypeError).
                    self.read_html_toc(toc)
                except:
                    pass # A TOC is optional; ignore unreadable ones
            else:
                cwd = os.path.abspath(self.base_path)
                m = glob.glob(os.path.join(cwd, '*.ncx'))
                if m:
                    toc = m[0]
                    self.read_ncx_toc(toc)

    def read_ncx_toc(self, toc):
        '''Populate this TOC from the NCX file at path C{toc}.'''
        self.base_path = os.path.dirname(toc)
        soup = NCXSoup(open(toc, 'rb').read())

        def process_navpoint(np, dest):
            # BeautifulStoneSoup lowercases attribute names, so the NCX
            # playOrder attribute is exposed as 'playorder' (the original
            # looked up 'playOrder' and always got the default).
            try:
                play_order = int(np.get('playorder', 1))
            except ValueError:
                play_order = 1
            href = fragment = text = None
            nl = np.find('navlabel')
            if nl is not None:
                text = u''
                for txt in nl.findAll('text'):
                    text += ''.join([unicode(s) for s in txt.findAll(text=True)])
            # Was elem.find(...)/not txt: 'elem' and 'txt' leak in from
            # enclosing scopes, breaking nested navpoints and raising
            # NameError when the navlabel is missing.
            content = np.find('content')
            if content is None or not content.has_key('src') or not text:
                return
            purl = urlparse(unquote(content['src']))
            href, fragment = purl[2], purl[5]
            nd = dest.add_item(href, fragment, text)
            nd.play_order = play_order
            for c in np:
                if getattr(c, 'name', None) == 'navpoint':
                    process_navpoint(c, nd)

        nm = soup.find('navmap')
        for elem in nm:
            if getattr(elem, 'name', None) == 'navpoint':
                process_navpoint(elem, self)

    def read_html_toc(self, toc):
        '''Populate this TOC from the links in the HTML file at path C{toc}.'''
        self.base_path = os.path.dirname(toc)
        soup = BeautifulSoup(open(toc, 'rb').read(), convertEntities=BeautifulSoup.HTML_ENTITIES)
        for a in soup.findAll('a'):
            if not a.has_key('href'):
                continue
            purl = urlparse(unquote(a['href']))
            href, fragment = purl[2], purl[5]
            txt = ''.join([unicode(s).strip() for s in a.findAll(text=True)])
            self.add_item(href, fragment, txt)

    def render(self, stream, uid):
        '''
        Write this TOC as an NCX document to C{stream}.
        @param uid: Value for the dtb:uid meta element.
        '''
        from libprs500.resources import ncx_template
        from genshi.template import MarkupTemplate
        doctype = ('ncx', "-//NISO//DTD ncx 2005-1//EN", "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd")
        template = MarkupTemplate(ncx_template)
        raw = template.generate(uid=uid, toc=self, __appname__=__appname__)
        raw = raw.render(doctype=doctype)
        stream.write(raw)

View File

@ -186,11 +186,11 @@ class MobiReader(object):
if self.book_header.exth is not None: if self.book_header.exth is not None:
opf = self.create_opf(htmlfile) opf = self.create_opf(htmlfile)
opf.write(open(os.path.splitext(htmlfile)[0]+'.opf', 'wb')) opf.render(open(os.path.splitext(htmlfile)[0]+'.opf', 'wb'))
def create_opf(self, htmlfile): def create_opf(self, htmlfile):
mi = self.book_header.exth.mi mi = self.book_header.exth.mi
opf = OPFCreator(mi) opf = OPFCreator(os.path.dirname(htmlfile), mi)
if hasattr(self.book_header.exth, 'cover_offset'): if hasattr(self.book_header.exth, 'cover_offset'):
opf.cover = 'images/%05d.jpg'%(self.book_header.exth.cover_offset+1) opf.cover = 'images/%05d.jpg'%(self.book_header.exth.cover_offset+1)
manifest = [(os.path.basename(htmlfile), 'text/x-oeb1-document')] manifest = [(os.path.basename(htmlfile), 'text/x-oeb1-document')]

View File

@ -1340,7 +1340,7 @@ ALTER TABLE books ADD COLUMN isbn TEXT DEFAULT "" COLLATE NOCASE;
mi.rating = self.rating(idx, index_is_id=index_is_id) mi.rating = self.rating(idx, index_is_id=index_is_id)
mi.isbn = self.isbn(idx, index_is_id=index_is_id) mi.isbn = self.isbn(idx, index_is_id=index_is_id)
id = idx if index_is_id else self.id(idx) id = idx if index_is_id else self.id(idx)
mi.libprs_id = id mi.application_id = id
return mi return mi
def vacuum(self): def vacuum(self):
@ -1382,7 +1382,7 @@ ALTER TABLE books ADD COLUMN isbn TEXT DEFAULT "" COLLATE NOCASE;
name += '_'+id name += '_'+id
base = dir if single_dir else tpath base = dir if single_dir else tpath
mi = OPFCreator(self.get_metadata(idx, index_is_id=index_is_id)) mi = OPFCreator(base, self.get_metadata(idx, index_is_id=index_is_id))
cover = self.cover(idx, index_is_id=index_is_id) cover = self.cover(idx, index_is_id=index_is_id)
if cover is not None: if cover is not None:
cname = name + '.jpg' cname = name + '.jpg'
@ -1390,7 +1390,7 @@ ALTER TABLE books ADD COLUMN isbn TEXT DEFAULT "" COLLATE NOCASE;
open(cpath, 'wb').write(cover) open(cpath, 'wb').write(cover)
mi.cover = cname mi.cover = cname
f = open(os.path.join(base, name+'.opf'), 'wb') f = open(os.path.join(base, name+'.opf'), 'wb')
mi.write(f) mi.render(f)
f.close() f.close()
for fmt in self.formats(idx, index_is_id=index_is_id).split(','): for fmt in self.formats(idx, index_is_id=index_is_id).split(','):

View File

@ -44,6 +44,7 @@ entry_points = {
'rtf2lrf = libprs500.ebooks.lrf.rtf.convert_from:main', 'rtf2lrf = libprs500.ebooks.lrf.rtf.convert_from:main',
'web2disk = libprs500.web.fetch.simple:main', 'web2disk = libprs500.web.fetch.simple:main',
'feeds2disk = libprs500.web.feeds.main:main', 'feeds2disk = libprs500.web.feeds.main:main',
'feeds2lrf = libprs500.ebooks.lrf.feeds.convert_from:main',
'web2lrf = libprs500.ebooks.lrf.web.convert_from:main', 'web2lrf = libprs500.ebooks.lrf.web.convert_from:main',
'pdf2lrf = libprs500.ebooks.lrf.pdf.convert_from:main', 'pdf2lrf = libprs500.ebooks.lrf.pdf.convert_from:main',
'mobi2lrf = libprs500.ebooks.lrf.mobi.convert_from:main', 'mobi2lrf = libprs500.ebooks.lrf.mobi.convert_from:main',

View File

@ -201,6 +201,7 @@ class ProgressBar:
self.term.BOL + self.term.UP + self.term.CLEAR_EOL + self.term.BOL + self.term.UP + self.term.CLEAR_EOL +
(self.bar % (100*percent, '='*n, '-'*(self.width-10-n))) + (self.bar % (100*percent, '='*n, '-'*(self.width-10-n))) +
self.term.CLEAR_EOL + msg) self.term.CLEAR_EOL + msg)
sys.stdout.flush()
def clear(self): def clear(self):
if not self.cleared: if not self.cleared:

View File

@ -17,12 +17,13 @@
The backend to parse feeds and create HTML that can then be converted The backend to parse feeds and create HTML that can then be converted
to an ebook. to an ebook.
''' '''
import logging, os, cStringIO, time, itertools, traceback import logging, os, cStringIO, time, traceback
import urlparse import urlparse
from libprs500 import browser, __appname__ from libprs500 import browser, __appname__
from libprs500.ebooks.BeautifulSoup import BeautifulSoup from libprs500.ebooks.BeautifulSoup import BeautifulSoup
from libprs500.ebooks.metadata.opf import OPFCreator from libprs500.ebooks.metadata.opf import OPFCreator
from libprs500.ebooks.metadata.toc import TOC
from libprs500.ebooks.metadata import MetaInformation from libprs500.ebooks.metadata import MetaInformation
from libprs500.web.feeds import feed_from_xml, templates from libprs500.web.feeds import feed_from_xml, templates
from libprs500.web.fetch.simple import option_parser as web2disk_option_parser from libprs500.web.fetch.simple import option_parser as web2disk_option_parser
@ -94,6 +95,9 @@ class BasicNewsRecipe(object):
#: using cp1252. If None, try to detect the encoding. #: using cp1252. If None, try to detect the encoding.
encoding = None encoding = None
#: Specify any extra CSS that should be addded to downloaded HTML files
extra_css = None
#: List of regular expressions that determines which links to follow #: List of regular expressions that determines which links to follow
#: If empty, it is ignored. #: If empty, it is ignored.
#: Only one of L{match_regexps} or L{filter_regexps} should be defined #: Only one of L{match_regexps} or L{filter_regexps} should be defined
@ -276,8 +280,9 @@ class BasicNewsRecipe(object):
self.web2disk_options = web2disk_option_parser().parse_args(web2disk_cmdline)[0] self.web2disk_options = web2disk_option_parser().parse_args(web2disk_cmdline)[0]
for extra in ('keep_only_tags', 'remove_tags', 'preprocess_regexps', for extra in ('keep_only_tags', 'remove_tags', 'preprocess_regexps',
'preprocess_html', 'remove_tags_after', 'postprocess_html'): 'preprocess_html', 'remove_tags_after'):
setattr(self.web2disk_options, extra, getattr(self, extra)) setattr(self.web2disk_options, extra, getattr(self, extra))
self.web2disk_options.postprocess_html = [self._postprocess_html, self.postprocess_html]
if self.delay > 0: if self.delay > 0:
self.simultaneous_downloads = 1 self.simultaneous_downloads = 1
@ -288,6 +293,14 @@ class BasicNewsRecipe(object):
self.failed_downloads = [] self.failed_downloads = []
self.partial_failures = [] self.partial_failures = []
def _postprocess_html(self, soup):
if self.extra_css is not None:
head = soup.find('head')
if head:
style = BeautifulSoup(u'<style type="text/css">%s</style>'%self.extra_css).find('style')
head.insert(len(head.contents), style)
return soup
def download(self): def download(self):
''' '''
Download and pre-process all articles from the feeds in this recipe. Download and pre-process all articles from the feeds in this recipe.
@ -297,6 +310,7 @@ class BasicNewsRecipe(object):
@rtype: string @rtype: string
''' '''
self.report_progress(0, _('Trying to download cover...')) self.report_progress(0, _('Trying to download cover...'))
self.download_cover() self.download_cover()
res = self.build_index() res = self.build_index()
self.cleanup() self.cleanup()
@ -362,7 +376,7 @@ class BasicNewsRecipe(object):
fetcher.current_dir = dir fetcher.current_dir = dir
fetcher.show_progress = False fetcher.show_progress = False
res, path, failures = fetcher.start_fetch(url), fetcher.downloaded_paths, fetcher.failed_links res, path, failures = fetcher.start_fetch(url), fetcher.downloaded_paths, fetcher.failed_links
if not res: if not res or not os.path.exists(res):
raise Exception(_('Could not fetch article. Run with --debug to see the reason')) raise Exception(_('Could not fetch article. Run with --debug to see the reason'))
return res, path, failures return res, path, failures
@ -446,28 +460,44 @@ class BasicNewsRecipe(object):
if dir is None: if dir is None:
dir = self.output_dir dir = self.output_dir
mi = MetaInformation(self.title + time.strftime(self.timefmt), [__appname__]) mi = MetaInformation(self.title + time.strftime(self.timefmt), [__appname__])
opf = OPFCreator(mi)
opf_path = os.path.join(dir, 'index.opf') opf_path = os.path.join(dir, 'index.opf')
ncx_path = os.path.join(dir, 'index.ncx')
opf = OPFCreator(dir, mi)
manifest = ['feed_%d'%i for i in range(len(feeds))]
manifest.append('index.html')
cpath = getattr(self, 'cover_path', None) cpath = getattr(self, 'cover_path', None)
if cpath is not None and os.access(cpath, os.R_OK): if cpath is not None and os.access(cpath, os.R_OK):
opf.cover = cpath opf.cover = cpath
manifest.append(cpath)
opf.create_manifest_from_files_in(manifest)
entries = ['index.html'] entries = ['index.html']
toc = TOC(base_path=dir)
for i, f in enumerate(feeds): for i, f in enumerate(feeds):
entries.append('feed_%d/index.html'%i) entries.append('feed_%d/index.html'%i)
feed = toc.add_item('feed_%d/index.html'%i, None, f.title)
for j, a in enumerate(f): for j, a in enumerate(f):
if getattr(a, 'downloaded', False): if getattr(a, 'downloaded', False):
adir = 'feed_%d/article_%d/'%(i, j) adir = 'feed_%d/article_%d/'%(i, j)
entries.append('%sindex.html'%adir) entries.append('%sindex.html'%adir)
feed.add_item('%sindex.html'%adir, None, a.title if a.title else 'Untitled article')
for sp in a.sub_pages: for sp in a.sub_pages:
prefix = os.path.commonprefix([opf_path, sp]) prefix = os.path.commonprefix([opf_path, sp])
relp = sp[len(prefix):] relp = sp[len(prefix):]
entries.append(relp.replace(os.sep, '/')) entries.append(relp.replace(os.sep, '/'))
opf.create_manifest(itertools.izip(entries, itertools.repeat('text/html')))
opf.create_spine(entries) opf.create_spine(entries)
opf.write(open(opf_path, 'wb')) opf.set_toc(toc)
for i, f in enumerate(feeds):
for j, a in enumerate(f):
if getattr(a, 'downloaded', False):
adir = 'feed_%d/article_%d/'%(i, j)
opf.render(open(opf_path, 'wb'), open(ncx_path, 'wb'))
def article_downloaded(self, request, result): def article_downloaded(self, request, result):
@ -516,7 +546,7 @@ class BasicNewsRecipe(object):
title, url = None, obj title, url = None, obj
else: else:
title, url = obj title, url = obj
self.report_progress(0, _('Fetching feed %s...'%(title if title else url))) self.report_progress(0, _('Fetching feed')+' %s...'%(title if title else url))
parsed_feeds.append(feed_from_xml(self.browser.open(url).read(), parsed_feeds.append(feed_from_xml(self.browser.open(url).read(),
title=title, title=title,
oldest_article=self.oldest_article, oldest_article=self.oldest_article,

View File

@ -41,7 +41,7 @@ class Newsweek(BasicNewsRecipe):
'http://feeds.newsweek.com/newsweek/columnists/AnnaQuindlen', 'http://feeds.newsweek.com/newsweek/columnists/AnnaQuindlen',
] ]
extra_css = '#content { font:serif,120%; }' extra_css = '#content { font:serif 1.2em; }'
keep_only_tags = [dict(name='div', id='content')] keep_only_tags = [dict(name='div', id='content')]
remove_tags = [ remove_tags = [
@ -55,8 +55,8 @@ class Newsweek(BasicNewsRecipe):
match_regexps = [r'http://www.newsweek.com/id/\S+/page/\d+'] match_regexps = [r'http://www.newsweek.com/id/\S+/page/\d+']
# For testing # For testing
#feeds = feeds[:2] #feeds = feeds[3:5]
#max_articles_per_feed = 1 #max_articles_per_feed = 2

View File

@ -57,16 +57,17 @@ class NavBarTemplate(Template):
<body> <body>
<div class="navbar" style="text-align:center"> <div class="navbar" style="text-align:center">
<hr py:if="bottom" /> <hr py:if="bottom" />
<a href="../index.html#article_${str(art)}">Up one level</a> <py:if test="art != num - 1">
| <a href="../article_${str(art+1)}/index.html">Next</a>
</py:if>
| <a href="../index.html#article_${str(art)}">Up one level</a>
<py:if test="two_levels"> <py:if test="two_levels">
| <a href="../../index.html#_${str(feed)}">Up two levels</a> | <a href="../../index.html#_${str(feed)}">Up two levels</a>
</py:if> </py:if>
<py:if test="art != 0"> <py:if test="art != 0">
| <a href="../article_${str(art-1)}/index.html">Previous</a> | <a href="../article_${str(art-1)}/index.html">Previous</a>
</py:if> </py:if>
<py:if test="art != num - 1"> |
| <a href="../article_${str(art+1)}/index.html">Next</a>
</py:if>
<hr py:if="not bottom" /> <hr py:if="not bottom" />
</div> </div>
</body> </body>

View File

@ -38,9 +38,9 @@ def basename(url):
def save_soup(soup, target): def save_soup(soup, target):
nm = Tag(soup, '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />') nm = Tag(soup, '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
for meta in soup.find('meta', content=True): meta = soup.find('meta', content=True)
if 'charset' in meta['content']: if meta and 'charset' in meta['content']:
meta.replaceWith(nm) meta.replaceWith(nm)
f = codecs.open(target, 'w', 'utf-8') f = codecs.open(target, 'w', 'utf-8')
f.write(unicode(soup)) f.write(unicode(soup))
f.close() f.close()
@ -85,7 +85,7 @@ class RecursiveFetcher(object):
self.remove_tags_after = getattr(options, 'remove_tags_after', None) self.remove_tags_after = getattr(options, 'remove_tags_after', None)
self.keep_only_tags = getattr(options, 'keep_only_tags', []) self.keep_only_tags = getattr(options, 'keep_only_tags', [])
self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup) self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup)
self.postprocess_html_ext= getattr(options, 'postprocess_html', lambda soup: soup) self.postprocess_html_ext= getattr(options, 'postprocess_html', [])
self.download_stylesheets = not options.no_stylesheets self.download_stylesheets = not options.no_stylesheets
self.show_progress = True self.show_progress = True
self.failed_links = [] self.failed_links = []
@ -336,7 +336,9 @@ class RecursiveFetcher(object):
self.process_return_links(soup, iurl) self.process_return_links(soup, iurl)
self.logger.debug('Recursion limit reached. Skipping links in %s', iurl) self.logger.debug('Recursion limit reached. Skipping links in %s', iurl)
save_soup(self.postprocess_html_ext(soup), res) for func in self.postprocess_html_ext:
soup = func(soup)
save_soup(soup, res)
self.localize_link(tag, 'href', res) self.localize_link(tag, 'href', res)
except Exception, err: except Exception, err: