Initial implementation of feeds2disk
parent 829267da44
commit ae28c0a164
@@ -1,29 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
    "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" >

<head>
    <meta name="author" content="Kovid Goyal" />
    <meta name="copyright" content="© 2008 Kovid Goyal" />
    <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
    <title></title>
    <link rel="stylesheet" type="text/css" href="styles/common.css" />
</head>

<body>

%body

<hr />
<div class="footer">
    <p>
        <a href="http://validator.w3.org/check?uri=referer">
            <img src="images/valid.png" alt="Valid XHTML 1.1" height="31" width="88" />
        </a><br />
        Created by Kovid Goyal © 2008
    </p>
</div>
</body>

</html>
@@ -190,14 +190,17 @@ class ProgressBar:
        self.cleared = 1 #: true if we haven't drawn the bar yet.

    def update(self, percent, message=''):
        if isinstance(message, unicode):
            message = message.encode('utf-8', 'ignore')
        if self.cleared:
            sys.stdout.write(self.header)
            self.cleared = 0
        n = int((self.width-10)*percent)
        msg = message.center(self.width)
        sys.stdout.write(
            self.term.BOL + self.term.UP + self.term.CLEAR_EOL +
            (self.bar % (100*percent, '='*n, '-'*(self.width-10-n))) +
            self.term.CLEAR_EOL + message.center(self.width))
            self.term.CLEAR_EOL + msg)

    def clear(self):
        if not self.cleared:
331  src/libprs500/threadpool.py  Normal file
@@ -0,0 +1,331 @@
"""Easy to use object-oriented thread pool framework.

A thread pool is an object that maintains a pool of worker threads to perform
time consuming operations in parallel. It assigns jobs to the threads
by putting them in a work request queue, where they are picked up by the
next available thread. This then performs the requested operation in the
background and puts the results in another queue.

The thread pool object can then collect the results from all threads from
this queue as soon as they become available or after all threads have
finished their work. It's also possible to define callbacks to handle
each result as it comes in.

The basic concept and some code was taken from the book "Python in a Nutshell"
by Alex Martelli, copyright 2003, ISBN 0-596-00188-6, from section 14.5
"Threaded Program Architecture". I wrapped the main program logic in the
ThreadPool class, added the WorkRequest class and the callback system and
tweaked the code here and there. Kudos also to Florent Aide for the exception
handling mechanism.

Basic usage:

>>> pool = ThreadPool(poolsize)
>>> requests = makeRequests(some_callable, list_of_args, callback)
>>> [pool.putRequest(req) for req in requests]
>>> pool.wait()

See the end of the module code for a brief, annotated usage example.

Website : http://chrisarndt.de/en/software/python/threadpool/
"""

__all__ = [
    'makeRequests',
    'NoResultsPending',
    'NoWorkersAvailable',
    'ThreadPool',
    'WorkRequest',
    'WorkerThread'
]

__author__ = "Christopher Arndt"
__version__ = "1.2.3"
__revision__ = "$Revision: 1.5 $"
__date__ = "$Date: 2006/06/23 12:32:25 $"
__license__ = 'Python license'

# standard library modules
import sys
import threading
import Queue
import time  # used by ThreadPool.wait(); without this import wait() raises NameError

# exceptions
class NoResultsPending(Exception):
    """All work requests have been processed."""
    pass

class NoWorkersAvailable(Exception):
    """No worker threads available to process remaining requests."""
    pass

# classes
class WorkerThread(threading.Thread):
    """Background thread connected to the requests/results queues.

    A worker thread sits in the background and picks up work requests from
    one queue and puts the results in another until it is dismissed.
    """

    def __init__(self, requestsQueue, resultsQueue, **kwds):
        """Set up thread in daemonic mode and start it immediately.

        requestsQueue and resultsQueue are instances of Queue.Queue passed
        by the ThreadPool class when it creates a new worker thread.
        """

        threading.Thread.__init__(self, **kwds)
        self.setDaemon(1)
        self.workRequestQueue = requestsQueue
        self.resultQueue = resultsQueue
        self._dismissed = threading.Event()
        self.start()

    def run(self):
        """Repeatedly process the job queue until told to exit."""

        while not self._dismissed.isSet():
            # thread blocks here, if queue empty
            request = self.workRequestQueue.get()
            if self._dismissed.isSet():
                # if told to exit, return the work request we just picked up
                self.workRequestQueue.put(request)
                break # and exit
            try:
                self.resultQueue.put(
                    (request, request.callable(*request.args, **request.kwds))
                )
            except:
                request.exception = True
                self.resultQueue.put((request, sys.exc_info()))

    def dismiss(self):
        """Sets a flag to tell the thread to exit when done with current job.
        """

        self._dismissed.set()


class WorkRequest:
    """A request to execute a callable for putting in the request queue later.

    See the module function makeRequests() for the common case
    where you want to build several WorkRequests for the same callable
    but with different arguments for each call.
    """

    def __init__(self, callable, args=None, kwds=None, requestID=None,
            callback=None, exc_callback=None):
        """Create a work request for a callable and attach callbacks.

        A work request consists of a callable to be executed by a
        worker thread, a list of positional arguments and a dictionary
        of keyword arguments.

        A callback function can be specified that is called when the results
        of the request are picked up from the result queue. It must accept
        two arguments, the request object and the results of the callable,
        in that order. If you want to pass additional information to the
        callback, just stick it on the request object.

        You can also give a callback for when an exception occurs. It should
        also accept two arguments, the work request and a tuple with the
        exception details as returned by sys.exc_info().

        requestID, if given, must be hashable since it is used by the
        ThreadPool object to store the results of that work request in a
        dictionary. It defaults to the return value of id(self).
        """

        if requestID is None:
            self.requestID = id(self)
        else:
            try:
                hash(requestID)
            except TypeError:
                raise TypeError("requestID must be hashable.")
            self.requestID = requestID
        self.exception = False
        self.callback = callback
        self.exc_callback = exc_callback
        self.callable = callable
        self.args = args or []
        self.kwds = kwds or {}


class ThreadPool:
    """A thread pool, distributing work requests and collecting results.

    See the module docstring for more information.
    """

    def __init__(self, num_workers, q_size=0):
        """Set up the thread pool and start num_workers worker threads.

        num_workers is the number of worker threads to start initially.
        If q_size > 0 the size of the work request queue is limited and
        the thread pool blocks when the queue is full and it tries to put
        more work requests in it (see putRequest method).
        """

        self.requestsQueue = Queue.Queue(q_size)
        self.resultsQueue = Queue.Queue()
        self.workers = []
        self.workRequests = {}
        self.createWorkers(num_workers)

    def createWorkers(self, num_workers):
        """Add num_workers worker threads to the pool."""

        for i in range(num_workers):
            self.workers.append(WorkerThread(self.requestsQueue,
                self.resultsQueue))

    def dismissWorkers(self, num_workers):
        """Tell num_workers worker threads to quit after their current task.
        """

        for i in range(min(num_workers, len(self.workers))):
            worker = self.workers.pop()
            worker.dismiss()

    def putRequest(self, request, block=True, timeout=0):
        """Put work request into work queue and save its id for later."""

        assert isinstance(request, WorkRequest)
        self.requestsQueue.put(request, block, timeout)
        self.workRequests[request.requestID] = request

    def poll(self, block=False):
        """Process any new results in the queue."""

        while True:
            # still results pending?
            if not self.workRequests:
                raise NoResultsPending
            # are there still workers to process remaining requests?
            elif block and not self.workers:
                raise NoWorkersAvailable
            try:
                # get back next results
                request, result = self.resultsQueue.get(block=block)
                # has an exception occurred?
                if request.exception and request.exc_callback:
                    request.exc_callback(request, result)
                # hand results to callback, if any
                if request.callback and not \
                       (request.exception and request.exc_callback):
                    request.callback(request, result)
                del self.workRequests[request.requestID]
            except Queue.Empty:
                break

    def wait(self, sleep=0):
        """Wait for results, blocking until all have arrived."""

        while 1:
            try:
                self.poll(True)
                time.sleep(sleep)
            except NoResultsPending:
                break

# helper functions
def makeRequests(callable, args_list, callback=None, exc_callback=None):
    """Create several work requests for same callable with different arguments.

    Convenience function for creating several work requests for the same
    callable where each invocation of the callable receives different values
    for its arguments.

    args_list contains the parameters for each invocation of callable.
    Each item in 'args_list' should be either a 2-item tuple of the list of
    positional arguments and a dictionary of keyword arguments or a single,
    non-tuple argument.

    See docstring for WorkRequest for info on callback and exc_callback.
    """

    requests = []
    for item in args_list:
        if isinstance(item, tuple):
            requests.append(
                WorkRequest(callable, item[0], item[1], callback=callback,
                    exc_callback=exc_callback)
            )
        else:
            requests.append(
                WorkRequest(callable, [item], None, callback=callback,
                    exc_callback=exc_callback)
            )
    return requests

################
# USAGE EXAMPLE
################

if __name__ == '__main__':
    import random
    import time

    # the work the threads will have to do (rather trivial in our example)
    def do_something(data):
        time.sleep(random.randint(1,5))
        result = round(random.random() * data, 5)
        # just to show off, we throw an exception once in a while
        if result > 3:
            raise RuntimeError("Something extraordinary happened!")
        return result

    # this will be called each time a result is available
    def print_result(request, result):
        print "**Result: %s from request #%s" % (result, request.requestID)

    # this will be called when an exception occurs within a thread
    def handle_exception(request, exc_info):
        print "Exception occurred in request #%s: %s" % \
          (request.requestID, exc_info[1])

    # assemble the arguments for each job to a list...
    data = [random.randint(1,10) for i in range(20)]
    # ... and build a WorkRequest object for each item in data
    requests = makeRequests(do_something, data, print_result, handle_exception)

    # or the other form of args_lists accepted by makeRequests: ((,), {})
    data = [((random.randint(1,10),), {}) for i in range(20)]
    requests.extend(
        makeRequests(do_something, data, print_result, handle_exception)
    )

    # we create a pool of 3 worker threads
    main = ThreadPool(3)

    # then we put the work requests in the queue...
    for req in requests:
        main.putRequest(req)
        print "Work request #%s added." % req.requestID
    # or shorter:
    # [main.putRequest(req) for req in requests]

    # ...and wait for the results to arrive in the result queue
    # by using ThreadPool.wait(). This would block until results for
    # all work requests have arrived:
    # main.wait()

    # instead we can poll for results while doing something else:
    i = 0
    while 1:
        try:
            main.poll()
            print "Main thread working..."
            time.sleep(0.5)
            if i == 10:
                print "Adding 3 more worker threads..."
                main.createWorkers(3)
            i += 1
        except KeyboardInterrupt:
            print "Interrupted!"
            break
        except NoResultsPending:
            print "All results collected."
            break
@@ -17,4 +17,113 @@
'''
Contains the logic for parsing feeds.
'''
import time, logging
from datetime import datetime

from libprs500.web.feeds.feedparser import parse

class Article(object):

    time_offset = datetime.now() - datetime.utcnow()

    def __init__(self, id, title, url, summary, published, content):
        self.id = id
        self.title = title
        self.url = url
        self.summary = summary
        self.content = content
        self.date = published
        self.utctime = datetime(*self.date[:6])
        self.localtime = self.utctime + self.time_offset

    def __repr__(self):
        return \
(u'''\
Title       : %s
URL         : %s
Summary     : %s
Date        : %s
Has content : %s
'''%(self.title, self.url, self.summary[:20]+'...', self.localtime.strftime('%a, %d %b, %Y %H:%M'),
     bool(self.content))).encode('utf-8')

    def __str__(self):
        return repr(self)


class Feed(object):

    def __init__(self):
        '''
        Parse a feed into articles.
        '''
        self.logger = logging.getLogger('feeds2disk')

    def populate_from_feed(self, feed, title=None, oldest_article=7,
                           max_articles_per_feed=100):
        entries = feed.entries
        feed = feed.feed
        self.title = feed.get('title', 'Unknown feed') if not title else title
        self.description = feed.get('description', '')
        image = feed.get('image', {})
        self.image_url = image.get('href', None)
        self.image_width = image.get('width', 88)
        self.image_height = image.get('height', 31)
        self.image_alt = image.get('title', '')

        self.articles = []
        self.id_counter = 0
        self.added_articles = []

        self.oldest_article = oldest_article

        for item in entries:
            if len(self.articles) > max_articles_per_feed:
                break
            self.parse_article(item)

    def parse_article(self, item):
        id = item.get('id', 'internal id#'+str(self.id_counter))
        if id in self.added_articles:
            return
        published = item.get('date_parsed', time.gmtime())
        self.id_counter += 1
        self.added_articles.append(id)

        title = item.get('title', 'Untitled article')
        link = item.get('link', None)
        description = item.get('summary', None)

        content = '\n'.join(i.value for i in item.get('content', []))
        if not content.strip():
            content = None

        article = Article(id, title, link, description, published, content)
        delta = datetime.utcnow() - article.utctime
        if delta.days*24*3600 + delta.seconds <= 24*3600*self.oldest_article:
            self.articles.append(article)
        else:
            self.logger.debug('Skipping article %s as it is too old.'%title)

    def __iter__(self):
        return iter(self.articles)

    def __len__(self):
        return len(self.articles)

    def __repr__(self):
        res = [('%20s\n'%'').replace(' ', '_')+repr(art) for art in self]

        return '\n'+'\n'.join(res)+'\n'

    def __str__(self):
        return repr(self)


def feed_from_xml(raw_xml, title=None, oldest_article=7, max_articles_per_feed=100):
    feed = parse(raw_xml)
    pfeed = Feed()
    pfeed.populate_from_feed(feed, title=title,
                             oldest_article=oldest_article,
                             max_articles_per_feed=max_articles_per_feed)
    return pfeed
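A minimal sketch of how these classes might be exercised on their own, assuming a local file atom.xml containing a feed; the file name and the driver code below are illustrative, not part of the commit:

    # hypothetical driver for feed_from_xml/Feed/Article shown above
    raw = open('atom.xml', 'rb').read()
    feed = feed_from_xml(raw, title='Example feed', oldest_article=2,
                         max_articles_per_feed=10)
    for article in feed:  # Feed.__iter__ yields Article objects
        print article.title, article.localtime.strftime('%a, %d %b %Y')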
@@ -17,7 +17,7 @@ from libprs500.web.feeds.news import BasicNewsRecipe
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
''''''

import sys, os
import sys, os, logging
from libprs500.web.recipes import get_feed, compile_recipe
from libprs500.web.fetch.simple import option_parser as _option_parser

@@ -53,26 +53,38 @@ If you specify this option, any argument to %prog is ignored and a default recip
    p.add_option('--lrf', default=False, action='store_true', help='Optimize fetching for subsequent conversion to LRF.')
    p.add_option('--recursions', default=0, type='int',
                 help=_('Number of levels of links to follow on webpages that are linked to from feeds. Default %default'))
    p.add_option('--output-dir', default=os.getcwd(),
                 help=_('The directory in which to store the downloaded feeds. Defaults to the current directory.'))
    p.add_option('--no-progress-bar', dest='progress_bar', default=True, action='store_false',
                 help=_("Don't show the progress bar"))
    p.add_option('--debug', action='store_true', default=False,
                 help='Very verbose output, useful for debugging.')

    return p

def simple_progress_bar(*args):
    print '%d%%'%(args[0]*100),
def simple_progress_bar(percent, msg):
    print '%d%%'%(percent*100),
    sys.stdout.flush()

def main(args=sys.argv, notification=None):
def no_progress_bar(percent, msg):
    print msg

def main(args=sys.argv, notification=None, handler=None):
    p = option_parser()
    opts, args = p.parse_args(args)

    if notification is None:
        from libprs500.terminfo import TerminalController, ProgressBar
        term = TerminalController(sys.stdout)
        try:
            pb = ProgressBar(term, _('Fetching feeds...'))
            notification = pb.update
        except ValueError:
            notification = simple_progress_bar
            print _('Fetching feeds...')
        if opts.progress_bar:
            try:
                pb = ProgressBar(term, _('Fetching feeds...'))
                notification = pb.update
            except ValueError:
                notification = simple_progress_bar
                print _('Fetching feeds...')
        else:
            notification = no_progress_bar

    if len(args) != 2:
        p.print_help()
@@ -98,10 +110,14 @@ def main(args=sys.argv, notification=None):
        print args[1], 'is an invalid recipe'
        return 1

    if handler is None:
        handler = logging.StreamHandler(sys.stdout)
        handler.setLevel(logging.DEBUG if opts.debug else logging.INFO if opts.verbose else logging.WARN)
        handler.setFormatter(logging.Formatter('%(levelname)s: %(message)s'))
    logging.getLogger('feeds2disk').addHandler(handler)

    recipe = recipe(opts, p, notification)
    index = recipe.download()


    recipe.download()

    return 0

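A hedged sketch of driving main() programmatically with a custom progress callback, following the (percent, msg) convention used by simple_progress_bar above; the recipe file name is made up, and in practice the module is normally run from the command line:

    # illustrative only: plug a custom notification callable into main()
    def my_notification(percent, msg):
        print '%3d%% %s' % (int(percent*100), msg)

    # main(['feeds2disk', 'my_recipe.py'], notification=my_notification)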
@@ -1,5 +1,4 @@
#!/usr/bin/env python

## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
@@ -18,9 +17,16 @@
The backend to parse feeds and create HTML that can then be converted
to an ebook.
'''
import logging
import logging, os, cStringIO, traceback, time
import urlparse

from libprs500 import browser
from libprs500.ebooks.BeautifulSoup import BeautifulSoup
from libprs500.web.feeds import feed_from_xml, templates
from libprs500.web.fetch.simple import option_parser as web2disk_option_parser
from libprs500.web.fetch.simple import RecursiveFetcher
from libprs500.threadpool import WorkRequest, ThreadPool, NoResultsPending


class BasicNewsRecipe(object):
    '''
@@ -48,6 +54,10 @@ class BasicNewsRecipe(object):
    #: @type: integer
    delay = 0

    #: Number of simultaneous downloads. Set to 1 if the server is picky.
    #: @type: integer
    simultaneous_downloads = 5

    #: Timeout for fetching files from server in seconds
    #: @type: integer
    timeout = 10
@@ -55,7 +65,7 @@ class BasicNewsRecipe(object):
    #: The format string for the date shown on the first page
    #: By default: Day Name Day Number Month Name Year
    #: @type: string
    timefmt = ' [%a %d %b %Y]'
    timefmt = ' %a, %d %b %Y'

    #: Max number of characters in the short description.
    #: @type: integer
@@ -94,6 +104,19 @@ class BasicNewsRecipe(object):
    #: @type: list of strings
    html2lrf_options = []

    #: List of tags to be removed. Specified tags are removed from downloaded HTML.
    #: A tag is specified as a dictionary of the form::
    #:    {
    #:     name  : 'tag name',   #e.g. 'div'
    #:     attrs : a dictionary, #e.g. {class: 'advertisement'}
    #:    }
    #: All keys are optional. For a full explanation of the search criteria, see
    #: U{http://www.crummy.com/software/BeautifulSoup/documentation.html#The basic find method: findAll(name, attrs, recursive, text, limit, **kwargs)}
    #: A common example::
    #:   remove_tags = [dict(name='div', attrs={'class':'advert'})]
    #: This will remove all <div class="advert"> tags and all their children from the downloaded HTML.
    remove_tags = []

    #: List of regexp substitution rules to run on the downloaded HTML. Each element of the
    #: list should be a two element tuple. The first element of the tuple should
    #: be a compiled regular expression and the second a callable that takes
@@ -131,6 +154,25 @@ class BasicNewsRecipe(object):
        '''
        return browser()

    def preprocess_html(self, soup):
        '''
        This function is called with the source of each downloaded HTML file.
        It can be used to do arbitrarily powerful pre-processing on the HTML.
        @param soup: A U{BeautifulSoup<http://www.crummy.com/software/BeautifulSoup/documentation.html>}
                     instance containing the downloaded HTML.
        @type soup: A U{BeautifulSoup<http://www.crummy.com/software/BeautifulSoup/documentation.html>} instance
        @return: It must return soup (after having done any needed preprocessing)
        @rtype: A U{BeautifulSoup<http://www.crummy.com/software/BeautifulSoup/documentation.html>} instance
        '''
        return soup

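A hedged sketch of how a recipe might override this hook; the 'sidebar' class below is an invented example, not something defined by this commit:

    # illustrative override in a hypothetical recipe subclass
    def preprocess_html(self, soup):
        # drop a made-up 'sidebar' div before the HTML is written to disk
        for tag in soup.findAll('div', attrs={'class':'sidebar'}):
            tag.extract()
        return soup  # the hook must hand the (modified) soup back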
    def cleanup(self):
        '''
        Called after all articles have been downloaded. Use it to do any cleanup like
        logging out of subscription sites, etc.
        '''
        pass

    def __init__(self, options, parser, progress_reporter):
        '''
        Initialize the recipe.
@@ -138,9 +180,15 @@ class BasicNewsRecipe(object):
        @param parser: Command line option parser. Used to intelligently merge options.
        @param progress_reporter: A Callable that takes two arguments: progress (a number between 0 and 1) and a string message. The message should be optional.
        '''
        for attr in ('username', 'password', 'lrf'):
        for attr in ('username', 'password', 'lrf', 'output_dir', 'verbose', 'debug'):
            setattr(self, attr, getattr(options, attr))
        self.output_dir = os.path.abspath(self.output_dir)

        self.logger = logging.getLogger('feeds2disk')

        if self.debug:
            self.logger.setLevel(logging.DEBUG)
            self.verbose = True
        self.report_progress = progress_reporter

        self.username = self.password = None
@@ -160,24 +208,201 @@ class BasicNewsRecipe(object):
        if self.needs_subscription and (self.username is None or self.password is None):
            raise ValueError('The %s recipe needs a username and password.'%self.title)

        self.browser = self.get_browser()
        self.image_map, self.image_counter = {}, 1

        web2disk_cmdline = [ 'web2disk',
            '--timeout', str(self.timeout),
            '--max-recursions', str(self.recursions),
            '--delay', str(self.delay),
            '--timeout', str(self.timeout),
            ]
        if self.encoding is not None:
            web2disk_cmdline.extend(['--encoding', self.encoding])

        if self.verbose:
            web2disk_cmdline.append('--verbose')

        if self.no_stylesheets:
            web2disk_cmdline.append('--dont-download-stylesheets')

        for reg in self.match_regexps:
            web2disk_cmdline.extend(['--match-regexp', reg])

        for reg in self.filter_regexps:
            web2disk_cmdline.extend(['--filter-regexp', reg])

        self.web2disk_options = web2disk_option_parser().parse_args(web2disk_cmdline)[0]
        self.web2disk_options.remove_tags = self.remove_tags
        self.web2disk_options.preprocess_regexps = self.preprocess_regexps
        self.web2disk_options.preprocess_html = self.preprocess_html

        if self.delay > 0:
            self.simultaneous_downloads = 1

        self.navbar = templates.NavBarTemplate()

    def download(self):
        self.report_progress(0, 'Starting download...')
        return self.build_index()
        '''
        Download and pre-process all articles from the feeds in this recipe.
        This method should be called only once on a particular Recipe instance.
        Calling it more than once will lead to undefined behavior.
        @return: Path to index.html
        @rtype: string
        '''
        self.report_progress(0, _('Initialized'))
        res = self.build_index()
        self.cleanup()
        return res

    def feeds2index(self, feeds):
        templ = templates.IndexTemplate()
        return templ.generate(self.title, self.timefmt, feeds).render(doctype='xhtml')

    def feed2index(self, feed):
        if feed.image_url is not None: # Download feed image
            imgdir = os.path.join(self.output_dir, 'images')
            if not os.path.isdir(imgdir):
                os.makedirs(imgdir)

            if self.image_map.has_key(feed.image_url):
                feed.image_url = self.image_map[feed.image_url]
            else:
                bn = urlparse.urlsplit(feed.image_url).path
                if bn:
                    bn = bn.rpartition('/')[-1]
                    if bn:
                        img = os.path.join(imgdir, 'feed_image_%d%s'%(self.image_counter, os.path.splitext(bn)[-1]))
                        open(img, 'wb').write(self.browser.open(feed.image_url).read())
                        self.image_counter += 1
                        feed.image_url = img
                        self.image_map[feed.image_url] = img

        templ = templates.FeedTemplate()
        return templ.generate(feed).render(doctype='xhtml')


    def create_logger(self, feed_number, article_number):
        logger = logging.getLogger('feeds2disk.article_%d_%d'%(feed_number, article_number))
        out = cStringIO.StringIO()
        handler = logging.StreamHandler(out)
        handler.setFormatter(logging.Formatter('%(levelname)s: %(message)s'))
        handler.setLevel(logging.INFO if self.verbose else logging.WARNING)
        if self.debug:
            handler.setLevel(logging.DEBUG)
        logger.addHandler(handler)
        return logger, out

    def fetch_article(self, url, dir, logger):
        fetcher = RecursiveFetcher(self.web2disk_options, logger, self.image_map)
        fetcher.base_dir = dir
        fetcher.current_dir = dir
        fetcher.show_progress = False
        return fetcher.start_fetch(url)

    def build_index(self):
        self.parse_feeds()
        self.report_progress(0, _('Fetching feeds...'))
        feeds = self.parse_feeds()
        self.has_single_feed = len(feeds) == 1

        index = os.path.join(self.output_dir, 'index.html')

        html = self.feeds2index(feeds)
        open(index, 'wb').write(html)

        self.jobs = []
        for f, feed in enumerate(feeds):
            feed_dir = os.path.join(self.output_dir, 'feed_%d'%f)
            if not os.path.isdir(feed_dir):
                os.makedirs(feed_dir)

            for a, article in enumerate(feed):
                art_dir = os.path.join(feed_dir, 'article_%d'%a)
                if not os.path.isdir(art_dir):
                    os.makedirs(art_dir)
                logger, stream = self.create_logger(f, a)
                try:
                    url = self.print_version(article.url)
                except NotImplementedError:
                    url = article.url
                req = WorkRequest(self.fetch_article, (url, art_dir, logger),
                                  {}, (f, a), self.article_downloaded,
                                  self.error_in_article_download)
                req.stream = stream
                req.feed = feed
                req.article = article
                self.jobs.append(req)

        self.jobs_done = 0
        tp = ThreadPool(self.simultaneous_downloads)
        for req in self.jobs:
            tp.putRequest(req, block=True, timeout=0)

        self.report_progress(0, _('Starting download [%d thread(s)]...')%self.simultaneous_downloads)
        while True:
            try:
                tp.poll(True)
                time.sleep(0.1)
            except NoResultsPending:
                break

        html = self.feed2index(feed)
        open(os.path.join(feed_dir, 'index.html'), 'wb').write(html)
        self.report_progress(1, _('Feeds downloaded to %s')%index)
        return index


    def article_downloaded(self, request, result):
        index = os.path.join(os.path.dirname(result), 'index.html')
        os.rename(result, index)
        src = open(index, 'rb').read().decode('utf-8')
        f, a = request.requestID
        soup = BeautifulSoup(src)
        body = soup.find('body')
        if body is not None:
            top = self.navbar.generate(False, f, a, len(request.feed), not self.has_single_feed).render(doctype='xhtml')
            bottom = self.navbar.generate(True, f, a, len(request.feed), not self.has_single_feed).render(doctype='xhtml')
            top = BeautifulSoup(top).find('div')
            bottom = BeautifulSoup(bottom).find('div')
            body.insert(0, top)
            body.insert(len(body.contents), bottom)
            open(index, 'wb').write(unicode(soup).encode('utf-8'))

        article = request.article
        self.logger.debug(_('\nDownloaded article %s from %s\n%s')%(article.title, article.url, request.stream.getvalue()))
        article.url = result
        article.downloaded = True
        self.jobs_done += 1
        self.report_progress(float(self.jobs_done)/len(self.jobs), _('Article downloaded: %s')%article.title)

    def error_in_article_download(self, request, exc_info):
        self.jobs_done += 1
        self.logger.error(_('Failed to download article: %s from %s')%(request.article.title, request.article.url))
        self.logger.debug(''.join(traceback.format_exception(*exc_info)))
        self.logger.debug(request.stream.getvalue())
        self.logger.debug('\n')
        self.report_progress(float(self.jobs_done)/len(self.jobs), _('Article download failed: %s')%request.article.title)


    def parse_feeds(self):
        '''
        Create list of articles from a list of feeds.
        Create a list of articles from a list of feeds.
        @rtype: list
        @return: A list whose items are 2-tuples C{('feed title', articles)},
        where C{articles} is a list of dictionaries each of the form::
            {
            'title'       : article title,
            'url'         : URL of print version,
            'date'        : The publication date of the article as a string,
            'description' : A summary of the article
            'content'     : The full article (can be an empty string). This is used by FullContentProfile
            }
        @return: A list of L{Feed}s.
        '''
        feeds = self.get_feeds()
        parsed_feeds = []
        for obj in feeds:
            if isinstance(obj, basestring):
                title, url = None, obj
            else:
                title, url = obj
            self.report_progress(0, _('Fetching feed %s...'%(title if title else url)))
            parsed_feeds.append(feed_from_xml(self.browser.open(url).read(),
                                              title=title,
                                              oldest_article=self.oldest_article,
                                              max_articles_per_feed=self.max_articles_per_feed))

        return parsed_feeds

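A hedged sketch of the kind of recipe subclass this class is meant to drive. It assumes that attributes referenced above but defined outside this hunk (title, oldest_article, max_articles_per_feed) exist on BasicNewsRecipe; the class name, feed titles and URLs are placeholders:

    # illustrative recipe; names and URLs are examples only
    class ExampleRecipe(BasicNewsRecipe):
        title                 = 'Example News'
        oldest_article        = 2      # days, passed on to feed_from_xml
        max_articles_per_feed = 25
        remove_tags           = [dict(name='div', attrs={'class':'advert'})]

        def get_feeds(self):
            # plain URLs or ('title', url) tuples, as handled by parse_feeds()
            return [('Front page', 'http://example.com/rss.xml')]

        def print_version(self, url):
            # the default raises NotImplementedError, so build_index falls back to url
            return url.replace('/article/', '/print/')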
162  src/libprs500/web/feeds/templates.py  Normal file
@@ -0,0 +1,162 @@
#!/usr/bin/env python

## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

from genshi.template import MarkupTemplate

class Template(MarkupTemplate):

    STYLE = '''\
    .article_date {
        font-size: x-small; color: gray; font-family: monospace;
    }

    .article_description {
        font-size: small; font-family: sans; text-indent: 0pt;
    }

    a.article {
        font-weight: bold; font-size: large;
    }

    a.feed {
        font-weight: bold; font-size: large;
    }
'''

    def generate(self, *args, **kwargs):
        if not kwargs.has_key('style'):
            kwargs['style'] = self.STYLE
        return MarkupTemplate.generate(self, *args, **kwargs)

class NavBarTemplate(Template):

    def __init__(self):
        Template.__init__(self, '''\
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml"
      xml:lang="en"
      xmlns:xi="http://www.w3.org/2001/XInclude"
      xmlns:py="http://genshi.edgewall.org/"

>
    <body>
        <div class="navbar" style="text-align:center">
            <hr py:if="bottom" />
            <a href="../index.html#article_${str(art)}">Up one level</a>
            <py:if test="two_levels">
            | <a href="../../index.html#_${str(feed)}">Up two levels</a>
            </py:if>
            <py:if test="art != 0">
            | <a href="../article_${str(art-1)}/index.html">Previous</a>
            </py:if>
            <py:if test="art != num - 1">
            | <a href="../article_${str(art+1)}/index.html">Next</a>
            </py:if>
            <hr py:if="not bottom" />
        </div>
    </body>
</html>
''')

    def generate(self, bottom, feed, art, number_of_articles_in_feed, two_levels):
        return Template.generate(self, bottom=bottom, feed=feed, art=art, num=number_of_articles_in_feed, two_levels=two_levels)


class IndexTemplate(Template):

    def __init__(self):
        Template.__init__(self, '''\
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml"
      xml:lang="en"
      xmlns:xi="http://www.w3.org/2001/XInclude"
      xmlns:py="http://genshi.edgewall.org/"

>
    <head>
        <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
        <title>${title}</title>
        <style type="text/css">
            ${style}
        </style>
    </head>
    <body>
        <h1>${title}</h1>
        <?python
            from datetime import datetime
        ?>
        <p style="text-align:right">${datetime.now().strftime(datefmt)}</p>
        <ul>
            <py:for each="i, feed in enumerate(feeds)">
            <li id="feed_${str(i)}">
                <a class="feed" href="${'feed_%d/index.html'%i}">${feed.title}</a>
            </li>
            </py:for>
        </ul>
    </body>
</html>
''')

    def generate(self, title, datefmt, feeds):
        return Template.generate(self, title=title, datefmt=datefmt, feeds=feeds)


class FeedTemplate(Template):

    def __init__(self):
        Template.__init__(self, '''\
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml"
      xml:lang="en"
      xmlns:xi="http://www.w3.org/2001/XInclude"
      xmlns:py="http://genshi.edgewall.org/"

>
    <head>
        <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
        <title>${feed.title}</title>
        <style type="text/css">
            ${style}
        </style>
    </head>
    <body>
        <h2>${feed.title}</h2>
        <py:if test="feed.image">
        <div class="feed_image">
            <img alt="${feed.image_alt}" src="${feed.image_url}" />
        </div>
        </py:if>
        <ul>
            <py:for each="i, article in enumerate(feed)">
            <li id="${'article_%d'%i}" py:if="getattr(article, 'downloaded', False)">
                <a class="article" href="${article.url}">${article.title}</a>
                <span class="article_date">${article.localtime.strftime(" [%a, %d %b %H:%M]")}</span>
                <p class="article_description" py:if="article.summary">
                    ${Markup(article.summary)}
                </p>
            </li>
            </py:for>
        </ul>
    </body>
</html>
''')

    def generate(self, feed):
        return Template.generate(self, feed=feed)
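A hedged sketch of rendering one of these templates outside the recipe machinery; FakeFeed is invented here purely so the index template has objects with a title attribute to iterate over:

    # illustrative only: build the top-level index page for two made-up feeds
    class FakeFeed(object):
        def __init__(self, title):
            self.title = title

    feeds = [FakeFeed('World news'), FakeFeed('Technology')]
    html = IndexTemplate().generate('My feeds', ' %a, %d %b %Y', feeds).render(doctype='xhtml')
    open('index.html', 'wb').write(html)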
@@ -77,14 +77,22 @@ class RecursiveFetcher(object):
        self.stylemap = {}
        self.current_dir = self.base_dir
        self.files = 0
        self.preprocess_regexps = []
        self.preprocess_regexps = getattr(options, 'preprocess_regexps', [])
        self.remove_tags = getattr(options, 'remove_tags', [])
        self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup)
        self.download_stylesheets = not options.no_stylesheets
        self.show_progress = True


    def get_soup(self, src):
        nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
        nmassage.extend(self.preprocess_regexps)
        return BeautifulSoup(xml_to_unicode(src, self.verbose)[0], markupMassage=nmassage)
        soup = BeautifulSoup(xml_to_unicode(src, self.verbose)[0], markupMassage=nmassage)
        for kwds in self.remove_tags:
            for tag in soup.findAll(**kwds):
                tag.extract()
        return self.preprocess_html_ext(soup)


    def fetch_url(self, url):
        f = None
@@ -249,7 +257,9 @@ class RecursiveFetcher(object):
        try:
            self.current_dir = diskpath
            for tag in soup.findAll(lambda tag: tag.name.lower()=='a' and tag.has_key('href')):
                print '.',
                if self.show_progress:
                    print '.',
                    sys.stdout.flush()
                sys.stdout.flush()
                iurl = self.absurl(baseurl, tag, 'href')
                if not iurl:
@@ -301,7 +311,8 @@ class RecursiveFetcher(object):
            self.files += 1
        finally:
            self.current_dir = prev_dir
        print
        if self.show_progress:
            print
        return res

    def __del__(self):
@@ -327,7 +338,6 @@ def option_parser(usage='%prog URL\n\nWhere URL is for example http://google.com
                      help='Any link that matches this regular expression will be ignored. This option can be specified multiple times, in which case as long as any regexp matches a link, it will be ignored. By default, no links are ignored. If both --filter-regexp and --match-regexp are specified, then --filter-regexp is applied first.')
    parser.add_option('--dont-download-stylesheets', action='store_true', default=False,
                      help='Do not download CSS stylesheets.', dest='no_stylesheets')

    parser.add_option('--verbose', help='Show detailed output information. Useful for debugging',
                      default=False, action='store_true', dest='verbose')
    return parser