Initial implementation of feeds2disk

Kovid Goyal 2008-03-11 17:06:54 +00:00
parent 829267da44
commit ae28c0a164
8 changed files with 893 additions and 66 deletions

View File

@@ -1,29 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" >
<head>
<meta name="author" content="Kovid Goyal" />
<meta name="copyright" content="&copy; 2008 Kovid Goyal" />
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<title></title>
<link rel="stylesheet" type="text/css" href="styles/common.css" />
</head>
<body>
%body
<hr />
<div class="footer">
<p>
<a href="http://validator.w3.org/check?uri=referer">
<img src="images/valid.png" alt="Valid XHTML 1.1" height="31" width="88" />
</a><br />
Created by Kovid Goyal &copy; 2008
</p>
</div>
</body>
</html>

View File

@@ -190,14 +190,17 @@ class ProgressBar:
        self.cleared = 1 #: true if we haven't drawn the bar yet.

    def update(self, percent, message=''):
        if isinstance(message, unicode):
            message = message.encode('utf-8', 'ignore')
        if self.cleared:
            sys.stdout.write(self.header)
            self.cleared = 0
        n = int((self.width-10)*percent)
        msg = message.center(self.width)
        sys.stdout.write(
            self.term.BOL + self.term.UP + self.term.CLEAR_EOL +
            (self.bar % (100*percent, '='*n, '-'*(self.width-10-n))) +
-           self.term.CLEAR_EOL + message.center(self.width))
+           self.term.CLEAR_EOL + msg)

    def clear(self):
        if not self.cleared:
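
As an aside, a minimal sketch (editor's illustration, not part of the commit) of how the updated ProgressBar is driven; the import path is the one used later in main.py, and it assumes a capable terminal, since TerminalController raises ValueError on dumb ones:

import sys
from libprs500.terminfo import TerminalController, ProgressBar

term = TerminalController(sys.stdout)
pb = ProgressBar(term, u'Fetching feeds...')
pb.update(0.5, u'caf\xe9 article downloaded')  # unicode messages are now encoded to UTF-8
pb.clear()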

src/libprs500/threadpool.py (new file, 331 lines)
View File

@@ -0,0 +1,331 @@
"""Easy to use object-oriented thread pool framework.
A thread pool is an object that maintains a pool of worker threads to perform
time-consuming operations in parallel. It assigns jobs to the threads
by putting them in a work request queue, where they are picked up by the
next available thread. This then performs the requested operation in the
background and puts the results in another queue.
The thread pool object can then collect the results from all threads from
this queue as soon as they become available or after all threads have
finished their work. It's also possible to define callbacks to handle
each result as it comes in.
The basic concept and some code was taken from the book "Python in a Nutshell"
by Alex Martelli, copyright 2003, ISBN 0-596-00188-6, from section 14.5
"Threaded Program Architecture". I wrapped the main program logic in the
ThreadPool class, added the WorkRequest class and the callback system and
tweaked the code here and there. Kudos also to Florent Aide for the exception
handling mechanism.
Basic usage:
>>> pool = ThreadPool(poolsize)
>>> requests = makeRequests(some_callable, list_of_args, callback)
>>> [pool.putRequest(req) for req in requests]
>>> pool.wait()
See the end of the module code for a brief, annotated usage example.
Website : http://chrisarndt.de/en/software/python/threadpool/
"""
__all__ = [
'makeRequests',
'NoResultsPending',
'NoWorkersAvailable',
'ThreadPool',
'WorkRequest',
'WorkerThread'
]
__author__ = "Christopher Arndt"
__version__ = "1.2.3"
__revision__ = "$Revision: 1.5 $"
__date__ = "$Date: 2006/06/23 12:32:25 $"
__license__ = 'Python license'
# standard library modules
import sys
import threading
import time
import Queue
# exceptions
class NoResultsPending(Exception):
"""All work requests have been processed."""
pass
class NoWorkersAvailable(Exception):
"""No worker threads available to process remaining requests."""
pass
# classes
class WorkerThread(threading.Thread):
"""Background thread connected to the requests/results queues.
A worker thread sits in the background and picks up work requests from
one queue and puts the results in another until it is dismissed.
"""
def __init__(self, requestsQueue, resultsQueue, **kwds):
"""Set up thread in daemonic mode and start it immediatedly.
requestsQueue and resultQueue are instances of Queue.Queue passed
by the ThreadPool class when it creates a new worker thread.
"""
threading.Thread.__init__(self, **kwds)
self.setDaemon(1)
self.workRequestQueue = requestsQueue
self.resultQueue = resultsQueue
self._dismissed = threading.Event()
self.start()
def run(self):
"""Repeatedly process the job queue until told to exit."""
while not self._dismissed.isSet():
# thread blocks here, if queue empty
request = self.workRequestQueue.get()
if self._dismissed.isSet():
# if told to exit, return the work request we just picked up
self.workRequestQueue.put(request)
break # and exit
try:
self.resultQueue.put(
(request, request.callable(*request.args, **request.kwds))
)
except:
request.exception = True
self.resultQueue.put((request, sys.exc_info()))
def dismiss(self):
"""Sets a flag to tell the thread to exit when done with current job.
"""
self._dismissed.set()
class WorkRequest:
"""A request to execute a callable for putting in the request queue later.
See the module function makeRequests() for the common case
where you want to build several WorkRequests for the same callable
but with different arguments for each call.
"""
def __init__(self, callable, args=None, kwds=None, requestID=None,
callback=None, exc_callback=None):
"""Create a work request for a callable and attach callbacks.
A work request consists of a callable to be executed by a
worker thread, a list of positional arguments, and a dictionary
of keyword arguments.
A callback function can be specified that is called when the results
of the request are picked up from the result queue. It must accept
two arguments, the request object and the results of the callable,
in that order. If you want to pass additional information to the
callback, just stick it on the request object.
You can also give a callback for when an exception occurs. It should
also accept two arguments, the work request and a tuple with the
exception details as returned by sys.exc_info().
requestID, if given, must be hashable since it is used by the
ThreadPool object to store the results of that work request in a
dictionary. It defaults to the return value of id(self).
"""
if requestID is None:
self.requestID = id(self)
else:
try:
hash(requestID)
except TypeError:
raise TypeError("requestID must be hashable.")
self.requestID = requestID
self.exception = False
self.callback = callback
self.exc_callback = exc_callback
self.callable = callable
self.args = args or []
self.kwds = kwds or {}
class ThreadPool:
"""A thread pool, distributing work requests and collecting results.
See the module docstring for more information.
"""
def __init__(self, num_workers, q_size=0):
"""Set up the thread pool and start num_workers worker threads.
num_workers is the number of worker threads to start initially.
If q_size > 0, the size of the work request queue is limited and
the thread pool blocks when the queue is full and more work requests
are put into it (see the putRequest method).
"""
self.requestsQueue = Queue.Queue(q_size)
self.resultsQueue = Queue.Queue()
self.workers = []
self.workRequests = {}
self.createWorkers(num_workers)
def createWorkers(self, num_workers):
"""Add num_workers worker threads to the pool."""
for i in range(num_workers):
self.workers.append(WorkerThread(self.requestsQueue,
self.resultsQueue))
def dismissWorkers(self, num_workers):
"""Tell num_workers worker threads to quit after their current task.
"""
for i in range(min(num_workers, len(self.workers))):
worker = self.workers.pop()
worker.dismiss()
def putRequest(self, request, block=True, timeout=0):
"""Put work request into work queue and save its id for later."""
assert isinstance(request, WorkRequest)
self.requestsQueue.put(request, block, timeout)
self.workRequests[request.requestID] = request
def poll(self, block=False):
"""Process any new results in the queue."""
while True:
# still results pending?
if not self.workRequests:
raise NoResultsPending
# are there still workers to process remaining requests?
elif block and not self.workers:
raise NoWorkersAvailable
try:
# get back next results
request, result = self.resultsQueue.get(block=block)
# has an exception occurred?
if request.exception and request.exc_callback:
request.exc_callback(request, result)
# hand results to callback, if any
if request.callback and not \
(request.exception and request.exc_callback):
request.callback(request, result)
del self.workRequests[request.requestID]
except Queue.Empty:
break
def wait(self, sleep=0):
"""Wait for results, blocking until all have arrived."""
while 1:
try:
self.poll(True)
time.sleep(sleep)
except NoResultsPending:
break
# helper functions
def makeRequests(callable, args_list, callback=None, exc_callback=None):
"""Create several work requests for same callable with different arguments.
Convenience function for creating several work requests for the same
callable where each invocation of the callable receives different values
for its arguments.
args_list contains the parameters for each invocation of callable.
Each item in 'args_list' should be either a 2-item tuple of the list of
positional arguments and a dictionary of keyword arguments or a single,
non-tuple argument.
See docstring for WorkRequest for info on callback and exc_callback.
"""
requests = []
for item in args_list:
if isinstance(item, tuple):
requests.append(
WorkRequest(callable, item[0], item[1], callback=callback,
exc_callback=exc_callback)
)
else:
requests.append(
WorkRequest(callable, [item], None, callback=callback,
exc_callback=exc_callback)
)
return requests
################
# USAGE EXAMPLE
################
if __name__ == '__main__':
import random
import time
# the work the threads will have to do (rather trivial in our example)
def do_something(data):
time.sleep(random.randint(1,5))
result = round(random.random() * data, 5)
# just to show off, we throw an exception once in a while
if result > 3:
raise RuntimeError("Something extraordinary happened!")
return result
# this will be called each time a result is available
def print_result(request, result):
print "**Result: %s from request #%s" % (result, request.requestID)
# this will be called when an exception occurs within a thread
def handle_exception(request, exc_info):
print "Exception occured in request #%s: %s" % \
(request.requestID, exc_info[1])
# assemble the arguments for each job to a list...
data = [random.randint(1,10) for i in range(20)]
# ... and build a WorkRequest object for each item in data
requests = makeRequests(do_something, data, print_result, handle_exception)
# or the other form of args_list accepted by makeRequests: (args, kwargs) tuples
data = [((random.randint(1,10),), {}) for i in range(20)]
requests.extend(
makeRequests(do_something, data, print_result, handle_exception)
)
# we create a pool of 3 worker threads
main = ThreadPool(3)
# then we put the work requests in the queue...
for req in requests:
main.putRequest(req)
print "Work request #%s added." % req.requestID
# or shorter:
# [main.putRequest(req) for req in requests]
# ...and wait for the results to arrive in the result queue
# by using ThreadPool.wait(). This would block until results for
# all work requests have arrived:
# main.wait()
# instead we can poll for results while doing something else:
i = 0
while 1:
try:
main.poll()
print "Main thread working..."
time.sleep(0.5)
if i == 10:
print "Adding 3 more worker threads..."
main.createWorkers(3)
i += 1
except KeyboardInterrupt:
print "Interrupted!"
break
except NoResultsPending:
print "All results collected."
break

View File

@@ -17,4 +17,113 @@
'''
Contains the logic for parsing feeds.
'''
import time, logging
from datetime import datetime
from libprs500.web.feeds.feedparser import parse
class Article(object):
time_offset = datetime.now() - datetime.utcnow()
def __init__(self, id, title, url, summary, published, content):
self.id = id
self.title = title
self.url = url
self.summary = summary
self.content = content
self.date = published
self.utctime = datetime(*self.date[:6])
self.localtime = self.utctime + self.time_offset
def __repr__(self):
return \
(u'''\
Title : %s
URL : %s
Summary : %s
Date : %s
Has content : %s
'''%(self.title, self.url, self.summary[:20]+'...', self.localtime.strftime('%a, %d %b, %Y %H:%M'),
bool(self.content))).encode('utf-8')
def __str__(self):
return repr(self)
class Feed(object):
def __init__(self):
'''
Parse a feed into articles.
'''
self.logger = logging.getLogger('feeds2disk')
def populate_from_feed(self, feed, title=None, oldest_article=7,
max_articles_per_feed=100):
entries = feed.entries
feed = feed.feed
self.title = feed.get('title', 'Unknown feed') if not title else title
self.description = feed.get('description', '')
image = feed.get('image', {})
self.image_url = image.get('href', None)
self.image_width = image.get('width', 88)
self.image_height = image.get('height', 31)
self.image_alt = image.get('title', '')
self.articles = []
self.id_counter = 0
self.added_articles = []
self.oldest_article = oldest_article
for item in entries:
if len(self.articles) >= max_articles_per_feed:
break
self.parse_article(item)
def parse_article(self, item):
id = item.get('id', 'internal id#'+str(self.id_counter))
if id in self.added_articles:
return
published = item.get('date_parsed', time.gmtime())
self.id_counter += 1
self.added_articles.append(id)
title = item.get('title', 'Untitled article')
link = item.get('link', None)
description = item.get('summary', None)
content = '\n'.join(i.value for i in item.get('content', []))
if not content.strip():
content = None
article = Article(id, title, link, description, published, content)
delta = datetime.utcnow() - article.utctime
if delta.days*24*3600 + delta.seconds <= 24*3600*self.oldest_article:
self.articles.append(article)
else:
self.logger.debug('Skipping article %s as it is too old.'%title)
def __iter__(self):
return iter(self.articles)
def __len__(self):
return len(self.articles)
def __repr__(self):
res = [('%20s\n'%'').replace(' ', '_')+repr(art) for art in self]
return '\n'+'\n'.join(res)+'\n'
def __str__(self):
return repr(self)
def feed_from_xml(raw_xml, title=None, oldest_article=7, max_articles_per_feed=100):
feed = parse(raw_xml)
pfeed = Feed()
pfeed.populate_from_feed(feed, title=title,
oldest_article=oldest_article,
max_articles_per_feed=max_articles_per_feed)
return pfeed
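
A quick usage sketch of the new parser (editor's addition, not part of the commit); the feed file name is a placeholder and the import path matches the one used in news.py later in this commit:

from libprs500.web.feeds import feed_from_xml

raw = open('some_feed.xml', 'rb').read()   # any RSS/Atom document (placeholder name)
feed = feed_from_xml(raw, oldest_article=2, max_articles_per_feed=10)
print feed.title, '-', len(feed), 'articles kept'
for article in feed:                       # Feed iterates over its Article objects
    print article.title, article.url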

View File

@@ -17,7 +17,7 @@ from libprs500.web.feeds.news import BasicNewsRecipe
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
''''''
-import sys, os
+import sys, os, logging
from libprs500.web.recipes import get_feed, compile_recipe
from libprs500.web.fetch.simple import option_parser as _option_parser
@@ -53,26 +53,38 @@ If you specify this option, any argument to %prog is ignored and a default recip
    p.add_option('--lrf', default=False, action='store_true', help='Optimize fetching for subsequent conversion to LRF.')
    p.add_option('--recursions', default=0, type='int',
                 help=_('Number of levels of links to follow on webpages that are linked to from feeds. Default %default'))
    p.add_option('--output-dir', default=os.getcwd(),
                 help=_('The directory in which to store the downloaded feeds. Defaults to the current directory.'))
    p.add_option('--no-progress-bar', dest='progress_bar', default=True, action='store_false',
                 help=_('Dont show the progress bar'))
    p.add_option('--debug', action='store_true', default=False,
                 help='Very verbose output, useful for debugging.')
    return p

-def simple_progress_bar(*args):
-    print '%d%%'%(args[0]*100),
+def simple_progress_bar(percent, msg):
+    print '%d%%'%(percent*100),
    sys.stdout.flush()

+def no_progress_bar(percent, msg):
+    print msg

-def main(args=sys.argv, notification=None):
+def main(args=sys.argv, notification=None, handler=None):
    p = option_parser()
    opts, args = p.parse_args(args)
    if notification is None:
        from libprs500.terminfo import TerminalController, ProgressBar
        term = TerminalController(sys.stdout)
        if opts.progress_bar:
            try:
                pb = ProgressBar(term, _('Fetching feeds...'))
                notification = pb.update
            except ValueError:
                notification = simple_progress_bar
                print _('Fetching feeds...')
        else:
            notification = no_progress_bar

    if len(args) != 2:
        p.print_help()
@@ -98,10 +110,14 @@ def main(args=sys.argv, notification=None):
        print args[1], 'is an invalid recipe'
        return 1

    if handler is None:
        handler = logging.StreamHandler(sys.stdout)
        handler.setLevel(logging.DEBUG if opts.debug else logging.INFO if opts.verbose else logging.WARN)
        handler.setFormatter(logging.Formatter('%(levelname)s: %(message)s'))
        logging.getLogger('feeds2disk').addHandler(handler)

    recipe = recipe(opts, p, notification)
-   index = recipe.download()
+   recipe.download()

    return 0
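
For orientation, an editor's sketch (not part of the commit) of driving this entry point from Python; the module path and the recipe file name are assumptions, while the notification and handler parameters come from the signature above:

import sys, logging
from libprs500.web.feeds.main import main   # assumed location of this script

def my_progress(percent, msg):
    sys.stderr.write('%3d%% %s\n' % (int(percent*100), msg))

handler = logging.StreamHandler(sys.stderr)
handler.setLevel(logging.DEBUG)

# args[0] is the program name, args[1] the recipe (placeholder name here)
main(['feeds2disk', 'myrecipe.py'], notification=my_progress, handler=handler)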

View File

@@ -1,5 +1,4 @@
#!/usr/bin/env python
## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
@@ -18,9 +17,16 @@
The backend to parse feeds and create HTML that can then be converted
to an ebook.
'''
-import logging
+import logging, os, cStringIO, traceback, time
import urlparse

from libprs500 import browser
from libprs500.ebooks.BeautifulSoup import BeautifulSoup
from libprs500.web.feeds import feed_from_xml, templates
from libprs500.web.fetch.simple import option_parser as web2disk_option_parser
from libprs500.web.fetch.simple import RecursiveFetcher
from libprs500.threadpool import WorkRequest, ThreadPool, NoResultsPending

class BasicNewsRecipe(object):
    '''
@@ -48,6 +54,10 @@ class BasicNewsRecipe(object):
    #: @type: integer
    delay = 0

    #: Number of simultaneous downloads. Set to 1 if the server is picky.
    #: @type: integer
    simultaneous_downloads = 5

    #: Timeout for fetching files from server in seconds
    #: @type: integer
    timeout = 10
@@ -55,7 +65,7 @@
    #: The format string for the date shown on the first page
    #: By default: Day Name Day Number Month Name Year
    #: @type: string
-   timefmt = ' [%a %d %b %Y]'
+   timefmt = ' %a, %d %b %Y'

    #: Max number of characters in the short description.
    #: @type: integer
@@ -94,6 +104,19 @@
    #: @type: list of strings
    html2lrf_options = []

    #: List of tags to be removed. Specified tags are removed from downloaded HTML.
    #: A tag is specified as a dictionary of the form::
    #:   {
    #:    name  : 'tag name',    #e.g. 'div'
    #:    attrs : a dictionary,  #e.g. {'class': 'advertisement'}
    #:   }
    #: All keys are optional. For a full explanation of the search criteria, see
    #: U{http://www.crummy.com/software/BeautifulSoup/documentation.html#The basic find method: findAll(name, attrs, recursive, text, limit, **kwargs)}
    #: A common example::
    #:   remove_tags = [dict(name='div', attrs={'class':'advert'})]
    #: This will remove all <div class="advert"> tags and all their children from the downloaded HTML.
    remove_tags = []

    #: List of regexp substitution rules to run on the downloaded HTML. Each element of the
    #: list should be a two element tuple. The first element of the tuple should
    #: be a compiled regular expression and the second a callable that takes
@@ -131,6 +154,25 @@
        '''
        return browser()

    def preprocess_html(self, soup):
        '''
        This function is called with the source of each downloaded HTML file.
        It can be used to do arbitrarily powerful pre-processing on the HTML.
        @param soup: A U{BeautifulSoup<http://www.crummy.com/software/BeautifulSoup/documentation.html>}
                     instance containing the downloaded HTML.
        @type soup: A U{BeautifulSoup<http://www.crummy.com/software/BeautifulSoup/documentation.html>} instance
        @return: It must return soup (after having done any needed preprocessing)
        @rtype: A U{BeautifulSoup<http://www.crummy.com/software/BeautifulSoup/documentation.html>} instance
        '''
        return soup

    def cleanup(self):
        '''
        Called after all articles have been downloaded. Use it to do any cleanup, like
        logging out of subscription sites, etc.
        '''
        pass

    def __init__(self, options, parser, progress_reporter):
        '''
        Initialize the recipe.
@@ -138,9 +180,15 @@
        @param parser: Command line option parser. Used to intelligently merge options.
        @param progress_reporter: A Callable that takes two arguments: progress (a number between 0 and 1) and a string message. The message should be optional.
        '''
-       for attr in ('username', 'password', 'lrf'):
+       for attr in ('username', 'password', 'lrf', 'output_dir', 'verbose', 'debug'):
            setattr(self, attr, getattr(options, attr))
        self.output_dir = os.path.abspath(self.output_dir)
        self.logger = logging.getLogger('feeds2disk')
        if self.debug:
            self.logger.setLevel(logging.DEBUG)
            self.verbose = True
        self.report_progress = progress_reporter
        self.username = self.password = None
@@ -160,24 +208,201 @@
if self.needs_subscription and (self.username is None or self.password is None):
raise ValueError('The %s recipe needs a username and password.'%self.title)
self.browser = self.get_browser()
self.image_map, self.image_counter = {}, 1
web2disk_cmdline = [ 'web2disk',
'--timeout', str(self.timeout),
'--max-recursions', str(self.recursions),
'--delay', str(self.delay),
'--timeout', str(self.timeout),
]
if self.encoding is not None:
web2disk_cmdline.extend(['--encoding', self.encoding])
if self.verbose:
web2disk_cmdline.append('--verbose')
if self.no_stylesheets:
web2disk_cmdline.append('--dont-download-stylesheets')
for reg in self.match_regexps:
web2disk_cmdline.extend(['--match-regexp', reg])
for reg in self.filter_regexps:
web2disk_cmdline.extend(['--filter-regexp', reg])
self.web2disk_options = web2disk_option_parser().parse_args(web2disk_cmdline)[0]
self.web2disk_options.remove_tags = self.remove_tags
self.web2disk_options.preprocess_regexps = self.preprocess_regexps
self.web2disk_options.preprocess_html = self.preprocess_html
if self.delay > 0:
self.simultaneous_downloads = 1
self.navbar = templates.NavBarTemplate()
def download(self):
-self.report_progress(0, 'Starting download...')
-return self.build_index()
'''
Download and pre-process all articles from the feeds in this recipe.
This method should be called only once on a particular Recipe instance.
Calling it more than once will lead to undefined behavior.
@return: Path to index.html
@rtype: string
'''
self.report_progress(0, _('Initialized'))
res = self.build_index()
self.cleanup()
return res
def feeds2index(self, feeds):
templ = templates.IndexTemplate()
return templ.generate(self.title, self.timefmt, feeds).render(doctype='xhtml')
def feed2index(self, feed):
if feed.image_url is not None: # Download feed image
imgdir = os.path.join(self.output_dir, 'images')
if not os.path.isdir(imgdir):
os.makedirs(imgdir)
if self.image_map.has_key(feed.image_url):
feed.image_url = self.image_map[feed.image_url]
else:
bn = urlparse.urlsplit(feed.image_url).path
if bn:
bn = bn.rpartition('/')[-1]
if bn:
img = os.path.join(imgdir, 'feed_image_%d%s'%(self.image_counter, os.path.splitext(bn)[-1]))
open(img, 'wb').write(self.browser.open(feed.image_url).read())
self.image_counter += 1
self.image_map[feed.image_url] = img
feed.image_url = img
templ = templates.FeedTemplate()
return templ.generate(feed).render(doctype='xhtml')
def create_logger(self, feed_number, article_number):
logger = logging.getLogger('feeds2disk.article_%d_%d'%(feed_number, article_number))
out = cStringIO.StringIO()
handler = logging.StreamHandler(out)
handler.setFormatter(logging.Formatter('%(levelname)s: %(message)s'))
handler.setLevel(logging.INFO if self.verbose else logging.WARNING)
if self.debug:
handler.setLevel(logging.DEBUG)
logger.addHandler(handler)
return logger, out
def fetch_article(self, url, dir, logger):
fetcher = RecursiveFetcher(self.web2disk_options, logger, self.image_map)
fetcher.base_dir = dir
fetcher.current_dir = dir
fetcher.show_progress = False
return fetcher.start_fetch(url)
def build_index(self):
-self.parse_feeds()
self.report_progress(0, _('Fetching feeds...'))
feeds = self.parse_feeds()
self.has_single_feed = len(feeds) == 1
index = os.path.join(self.output_dir, 'index.html')
html = self.feeds2index(feeds)
open(index, 'wb').write(html)
self.jobs = []
for f, feed in enumerate(feeds):
feed_dir = os.path.join(self.output_dir, 'feed_%d'%f)
if not os.path.isdir(feed_dir):
os.makedirs(feed_dir)
for a, article in enumerate(feed):
art_dir = os.path.join(feed_dir, 'article_%d'%a)
if not os.path.isdir(art_dir):
os.makedirs(art_dir)
logger, stream = self.create_logger(f, a)
try:
url = self.print_version(article.url)
except NotImplementedError:
url = article.url
req = WorkRequest(self.fetch_article, (url, art_dir, logger),
{}, (f, a), self.article_downloaded,
self.error_in_article_download)
req.stream = stream
req.feed = feed
req.article = article
self.jobs.append(req)
self.jobs_done = 0
tp = ThreadPool(self.simultaneous_downloads)
for req in self.jobs:
tp.putRequest(req, block=True, timeout=0)
self.report_progress(0, _('Starting download [%d thread(s)]...')%self.simultaneous_downloads)
while True:
try:
tp.poll(True)
time.sleep(0.1)
except NoResultsPending:
break
html = self.feed2index(feed)
open(os.path.join(feed_dir, 'index.html'), 'wb').write(html)
self.report_progress(1, _('Feeds downloaded to %s')%index)
return index
def article_downloaded(self, request, result):
index = os.path.join(os.path.dirname(result), 'index.html')
os.rename(result, index)
src = open(index, 'rb').read().decode('utf-8')
f, a = request.requestID
soup = BeautifulSoup(src)
body = soup.find('body')
if body is not None:
top = self.navbar.generate(False, f, a, len(request.feed), not self.has_single_feed).render(doctype='xhtml')
bottom = self.navbar.generate(True, f, a, len(request.feed), not self.has_single_feed).render(doctype='xhtml')
top = BeautifulSoup(top).find('div')
bottom = BeautifulSoup(bottom).find('div')
body.insert(0, top)
body.insert(len(body.contents), bottom)
open(index, 'wb').write(unicode(soup).encode('utf-8'))
article = request.article
self.logger.debug(_('\nDownloaded article %s from %s\n%s')%(article.title, article.url, request.stream.getvalue()))
article.url = result
article.downloaded = True
self.jobs_done += 1
self.report_progress(float(self.jobs_done)/len(self.jobs), _('Article downloaded: %s')%article.title)
def error_in_article_download(self, request, exc_info):
self.jobs_done += 1
self.logger.error(_('Failed to download article: %s from %s')%(request.article.title, request.article.url))
self.logger.debug(''.join(traceback.format_exception(*exc_info)))
self.logger.debug(request.stream.getvalue())
self.logger.debug('\n')
self.report_progress(float(self.jobs_done)/len(self.jobs), _('Article download failed: %s')%request.article.title)
def parse_feeds(self):
'''
-Create list of articles from a list of feeds.
+Create a list of articles from a list of feeds.
@rtype: list
-@return: A list whose items are 2-tuples C{('feed title', articles)},
-where C{articles} is a list of dictionaries each of the form::
-{
-'title' : article title,
-'url' : URL of print version,
-'date' : The publication date of the article as a string,
-'description' : A summary of the article
-'content' : The full article (can be an empty string). This is used by FullContentProfile
-}
+@return: A list of L{Feed}s.
'''
feeds = self.get_feeds()
parsed_feeds = []
for obj in feeds:
if isinstance(obj, basestring):
title, url = None, obj
else:
title, url = obj
self.report_progress(0, _('Fetching feed %s...'%(title if title else url)))
parsed_feeds.append(feed_from_xml(self.browser.open(url).read(),
title=title,
oldest_article=self.oldest_article,
max_articles_per_feed=self.max_articles_per_feed))
return parsed_feeds

View File

@@ -0,0 +1,162 @@
#!/usr/bin/env python
## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
from genshi.template import MarkupTemplate
class Template(MarkupTemplate):
STYLE = '''\
.article_date {
font-size: x-small; color: gray; font-family: monospace;
}
.article_description {
font-size: small; font-family: sans; text-indent: 0pt;
}
a.article {
font-weight: bold; font-size: large;
}
a.feed {
font-weight: bold; font-size: large;
}
'''
def generate(self, *args, **kwargs):
if not kwargs.has_key('style'):
kwargs['style'] = self.STYLE
return MarkupTemplate.generate(self, *args, **kwargs)
class NavBarTemplate(Template):
def __init__(self):
Template.__init__(self, '''\
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml"
xml:lang="en"
xmlns:xi="http://www.w3.org/2001/XInclude"
xmlns:py="http://genshi.edgewall.org/"
>
<body>
<div class="navbar" style="text-align:center">
<hr py:if="bottom" />
<a href="../index.html#article_${str(art)}">Up one level</a>
<py:if test="two_levels">
| <a href="../../index.html#_${str(feed)}">Up two levels</a>
</py:if>
<py:if test="art != 0">
| <a href="../article_${str(art-1)}/index.html">Previous</a>
</py:if>
<py:if test="art != num - 1">
| <a href="../article_${str(art+1)}/index.html">Next</a>
</py:if>
<hr py:if="not bottom" />
</div>
</body>
</html>
''')
def generate(self, bottom, feed, art, number_of_articles_in_feed, two_levels):
return Template.generate(self, bottom=bottom, feed=feed, art=art, num=number_of_articles_in_feed, two_levels=two_levels)
class IndexTemplate(Template):
def __init__(self):
Template.__init__(self, '''\
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml"
xml:lang="en"
xmlns:xi="http://www.w3.org/2001/XInclude"
xmlns:py="http://genshi.edgewall.org/"
>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<title>${title}</title>
<style type="text/css">
${style}
</style>
</head>
<body>
<h1>${title}</h1>
<?python
from datetime import datetime
?>
<p style="text-align:right">${datetime.now().strftime(datefmt)}</p>
<ul>
<py:for each="i, feed in enumerate(feeds)">
<li id="feed_${str(i)}">
<a class="feed" href="${'feed_%d/index.html'%i}">${feed.title}</a>
</li>
</py:for>
</ul>
</body>
</html>
''')
def generate(self, title, datefmt, feeds):
return Template.generate(self, title=title, datefmt=datefmt, feeds=feeds)
class FeedTemplate(Template):
def __init__(self):
Template.__init__(self, '''\
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml"
xml:lang="en"
xmlns:xi="http://www.w3.org/2001/XInclude"
xmlns:py="http://genshi.edgewall.org/"
>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<title>${feed.title}</title>
<style type="text/css">
${style}
</style>
</head>
<body>
<h2>${feed.title}</h2>
<py:if test="feed.image">
<div class="feed_image">
<img alt="${feed.image_alt}" src="${feed.image_url}" />
</div>
</py:if>
<ul>
<py:for each="i, article in enumerate(feed)">
<li id="${'article_%d'%i}" py:if="getattr(article, 'downloaded', False)">
<a class="article" href="${article.url}">${article.title}</a>
<span class="article_date">${article.localtime.strftime(" [%a, %d %b %H:%M]")}</span>
<p class="article_decription" py:if="article.summary">
${Markup(article.summary)}
</p>
</li>
</py:for>
</ul>
</body>
</html>
''')
def generate(self, feed):
return Template.generate(self, feed=feed)
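
A tiny editor's sketch (not part of the commit) of rendering the index template with stand-in data; it assumes genshi is installed and that this module is importable as libprs500.web.feeds.templates, the import path used in news.py above:

from libprs500.web.feeds import templates

class FakeFeed(object):          # IndexTemplate only needs a .title attribute
    title = 'Front page'

html = templates.IndexTemplate().generate(
            'My news', ' %a, %d %b %Y', [FakeFeed()]).render(doctype='xhtml')
open('index.html', 'wb').write(html)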

View File

@@ -77,14 +77,22 @@ class RecursiveFetcher(object):
        self.stylemap = {}
        self.current_dir = self.base_dir
        self.files = 0
-       self.preprocess_regexps = []
+       self.preprocess_regexps = getattr(options, 'preprocess_regexps', [])
        self.remove_tags = getattr(options, 'remove_tags', [])
        self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup)
        self.download_stylesheets = not options.no_stylesheets
        self.show_progress = True

    def get_soup(self, src):
        nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
        nmassage.extend(self.preprocess_regexps)
-       return BeautifulSoup(xml_to_unicode(src, self.verbose)[0], markupMassage=nmassage)
+       soup = BeautifulSoup(xml_to_unicode(src, self.verbose)[0], markupMassage=nmassage)
        for kwds in self.remove_tags:
            for tag in soup.findAll(**kwds):
                tag.extract()
        return self.preprocess_html_ext(soup)

    def fetch_url(self, url):
        f = None
@@ -249,8 +257,10 @@
        try:
            self.current_dir = diskpath
            for tag in soup.findAll(lambda tag: tag.name.lower()=='a' and tag.has_key('href')):
                if self.show_progress:
                    print '.',
                    sys.stdout.flush()
                sys.stdout.flush()
                iurl = self.absurl(baseurl, tag, 'href')
                if not iurl:
                    continue
@@ -301,6 +311,7 @@
                self.files += 1
        finally:
            self.current_dir = prev_dir
        if self.show_progress:
            print
        return res
@@ -327,7 +338,6 @@ def option_parser(usage='%prog URL\n\nWhere URL is for example http://google.com
                      help='Any link that matches this regular expression will be ignored. This option can be specified multiple times, in which case as long as any regexp matches a link, it will be ignored. By default, no links are ignored. If both --filter-regexp and --match-regexp are specified, then --filter-regexp is applied first.')
    parser.add_option('--dont-download-stylesheets', action='store_true', default=False,
                      help='Do not download CSS stylesheets.', dest='no_stylesheets')
    parser.add_option('--verbose', help='Show detailed output information. Useful for debugging',
                      default=False, action='store_true', dest='verbose')
    return parser
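
Finally, an editor's sketch (not part of the commit) of what the new remove_tags pruning added to get_soup() does to a page, using BeautifulSoup directly; the HTML snippet and the 'advert' class are invented for illustration:

from libprs500.ebooks.BeautifulSoup import BeautifulSoup

remove_tags = [dict(name='div', attrs={'class':'advert'})]
soup = BeautifulSoup('<body><div class="advert">buy!</div><p>story</p></body>')
for kwds in remove_tags:
    for tag in soup.findAll(**kwds):
        tag.extract()   # the same pruning get_soup() now performs
print soup              # -> <body><p>story</p></body>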