feeds2disk improved to the point of being able to download Newsweek. Added new Newsweek recipe.

Kovid Goyal 2008-03-12 20:59:29 +00:00
parent 756de168fe
commit 2ccf260f7d
15 changed files with 477 additions and 139 deletions

View File

@ -23,13 +23,14 @@ from gettext import GNUTranslations
from math import floor
from optparse import OptionParser as _OptionParser
from optparse import IndentedHelpFormatter
from logging import Formatter
from ttfquery import findsystem, describe
from libprs500.translations.msgfmt import make
from libprs500.ebooks.chardet import detect
from libprs500.terminfo import TerminalController
terminal_controller = TerminalController()
terminal_controller = TerminalController(sys.stdout)
iswindows = 'win32' in sys.platform.lower() or 'win64' in sys.platform.lower()
isosx = 'darwin' in sys.platform.lower()
@ -51,6 +52,25 @@ __builtin__.__dict__['_'] = lambda s: s
class CommandLineError(Exception):
pass
class ColoredFormatter(Formatter):
def format(self, record):
ln = record.__dict__['levelname']
col = ''
if ln == 'CRITICAL':
col = terminal_controller.YELLOW
elif ln == 'ERROR':
col = terminal_controller.RED
elif ln in ['WARN', 'WARNING']:
col = terminal_controller.BLUE
elif ln == 'INFO':
col = terminal_controller.GREEN
elif ln == 'DEBUG':
col = terminal_controller.CYAN
record.__dict__['levelname'] = col + record.__dict__['levelname'] + terminal_controller.NORMAL
return Formatter.format(self, record)
def setup_cli_handlers(logger, level):
logger.setLevel(level)
if level == logging.WARNING:
@ -187,9 +207,9 @@ def extract(path, dir):
raise Exception('Unknown archive type')
extractor(path, dir)
def browser():
def browser(honor_time=False):
opener = mechanize.Browser()
opener.set_handle_refresh(True)
opener.set_handle_refresh(True, honor_time=honor_time)
opener.set_handle_robots(False)
opener.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; i686 Linux; en_US; rv:1.8.0.4) Gecko/20060508 Firefox/1.5.0.4')]
return opener
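
For reference, a minimal stand-alone sketch of the ColoredFormatter idea above, using hard-coded ANSI escapes instead of TerminalController (the escape codes are assumptions; the real class queries the terminfo database):

import logging

# Hard-coded ANSI escapes standing in for the TerminalController attributes.
COLORS = {'DEBUG': '\033[36m', 'INFO': '\033[32m', 'WARNING': '\033[34m',
          'ERROR': '\033[31m', 'CRITICAL': '\033[33m'}
NORMAL = '\033[0m'

class SimpleColoredFormatter(logging.Formatter):
    def format(self, record):
        # Wrap the level name in a colour escape, as ColoredFormatter does.
        record.levelname = COLORS.get(record.levelname, '') + record.levelname + NORMAL
        return logging.Formatter.format(self, record)

handler = logging.StreamHandler()
handler.setFormatter(SimpleColoredFormatter('%(levelname)s: %(message)s'))
logging.getLogger('demo').addHandler(handler)
logging.getLogger('demo').warning('colourized output')
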

View File

@ -147,7 +147,7 @@ class Delegator(object):
d.parent = self
methods = d.getMethods()
self.delegatedMethods += methods
for m in methods:
for m in methods:
setattr(self, m, getattr(d, m))
"""

View File

@ -595,6 +595,11 @@ class OPFCreator(OPF):
self.uid = mi.uid
def create_manifest(self, entries):
'''
Create <manifest>
@param entries: List of (URL, mime-type)
@type entries: list of 2-tuples
'''
doc = dom.parseString(self.soup.__str__('UTF-8').strip())
package = doc.documentElement
manifest = doc.createElement('manifest')
@ -616,6 +621,11 @@ class OPFCreator(OPF):
def create_spine(self, entries):
'''
Create the <spine> element. Must first call L{create_manifest}.
@param entries: List of paths
@type entries: list of strings
'''
doc = dom.parseString(self.soup.__str__('UTF-8').strip())
package = doc.documentElement
spine = doc.createElement('spine')
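
A hedged usage sketch of the two calls documented above; the paths and metadata are illustrative, and the imports mirror the ones used elsewhere in this commit:

from libprs500.ebooks.metadata import MetaInformation
from libprs500.ebooks.metadata.opf import OPFCreator

mi = MetaInformation('Newsweek [Wed, 12 Mar 2008]', ['libprs500'])
opf = OPFCreator(mi)
entries = ['index.html', 'feed_0/index.html', 'feed_0/article_0/index.html']
opf.create_manifest([(e, 'text/html') for e in entries])  # (URL, mime-type) pairs
opf.create_spine(entries)                                 # must follow create_manifest
opf.write(open('index.opf', 'wb'))
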

View File

@ -74,6 +74,34 @@ def options(option_parser):
opts.extend(opt._long_opts)
return opts
def opts_and_words(name, op, words):
opts = ' '.join(options(op))
words = [repr(w) for w in words]
words = ' '.join(words)
return '_'+name+'()'+\
'''
{
local cur prev opts
COMPREPLY=()
cur="${COMP_WORDS[COMP_CWORD]}"
opts="%s"
words="%s"
case "${cur}" in
-* )
COMPREPLY=( $(compgen -W "${opts}" -- ${cur}) )
return 0
;;
* )
COMPREPLY=( $(compgen -W "${words}" -- ${cur}) )
return 0
;;
esac
}
complete -F _'''%(opts, words) + name + ' ' + name +"\n\n"
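
A hedged, self-contained sketch of the two strings opts_and_words() interpolates into the bash function above, with a toy OptionParser standing in for the real feeds2disk parser:

from optparse import OptionParser

def long_opts(op):
    # Simplified version of the options() helper defined earlier in this file.
    opts = []
    for opt in op.option_list:
        opts.extend(opt._long_opts)
    return opts

p = OptionParser()
p.add_option('--verbose', action='store_true')
p.add_option('--test', action='store_true')

# Switches are offered when the current word starts with '-', recipe titles otherwise.
print(' '.join(long_opts(p)))                    # --help --verbose --test
print(' '.join(repr(t) for t in ['Newsweek']))   # 'Newsweek'
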
def opts_and_exts(name, op, exts):
opts = ' '.join(options(op))
exts.extend([i.upper() for i in exts])
@ -135,6 +163,8 @@ def setup_completion(fatal_errors):
from libprs500.gui2.lrf_renderer.main import option_parser as lrfviewerop
from libprs500.ebooks.lrf.pdf.reflow import option_parser as pdfhtmlop
from libprs500.ebooks.mobi.reader import option_parser as mobioeb
from libprs500.web.feeds.main import option_parser as feeds2disk
from libprs500.web.feeds.recipes import titles as feed_titles
f = open_file('/etc/bash_completion.d/libprs500')
@ -159,6 +189,7 @@ def setup_completion(fatal_errors):
f.write(opts_and_exts('lrfviewer', lrfviewerop, ['lrf']))
f.write(opts_and_exts('pdfrelow', pdfhtmlop, ['pdf']))
f.write(opts_and_exts('mobi2oeb', mobioeb, ['mobi', 'prc']))
f.write(opts_and_words('feeds2disk', feeds2disk, feed_titles))
f.write('''
_prs500_ls()
{

View File

@ -15,7 +15,7 @@
'''
Used to run jobs in parallel in separate processes.
'''
import re, sys, tempfile, os, cPickle, cStringIO, traceback, atexit, binascii, time, subprocess
import re, sys, tempfile, os, cPickle, traceback, atexit, binascii, time, subprocess
from functools import partial

View File

@ -106,7 +106,7 @@ class TerminalController:
except: return
# If the stream isn't a tty, then assume it has no capabilities.
if not term_stream.isatty(): return
if not hasattr(term_stream, 'isatty') or not term_stream.isatty(): return
# Check the terminal type. If we fail, then assume that the
# terminal has no capabilities.

View File

@ -97,7 +97,8 @@ class WorkerThread(threading.Thread):
)
except:
request.exception = True
self.resultQueue.put((request, sys.exc_info()))
import traceback
self.resultQueue.put((request, traceback.format_exc()))
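
The change above hands a pre-formatted traceback string to the result queue instead of a live sys.exc_info() tuple; a minimal stand-alone sketch of the pattern (stdlib only, not the project's ThreadPool):

import threading, traceback
try:
    from Queue import Queue        # Python 2
except ImportError:
    from queue import Queue        # Python 3

results = Queue()

def worker(job):
    try:
        results.put(('ok', job()))
    except Exception:
        # Format inside the worker: a plain string crosses threads safely and
        # does not keep the exception's frames alive.
        results.put(('error', traceback.format_exc()))

t = threading.Thread(target=worker, args=(lambda: 1 / 0,))
t.start()
t.join()
status, payload = results.get()
print(status)     # 'error'
print(payload)    # formatted ZeroDivisionError traceback
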
def dismiss(self):
"""Sets a flag to tell the thread to exit when done with current job.

View File

@ -27,6 +27,7 @@ class Article(object):
time_offset = datetime.now() - datetime.utcnow()
def __init__(self, id, title, url, summary, published, content):
self.downloaded = False
self.id = id
self.title = title
self.url = url
@ -103,7 +104,7 @@ class Feed(object):
if delta.days*24*3600 + delta.seconds <= 24*3600*self.oldest_article:
self.articles.append(article)
else:
self.logger.debug('Skipping article %s as it is too old.'%title)
self.logger.debug('Skipping article %s (%s) from feed %s as it is too old.'%(title, article.localtime.strftime('%a, %d %b, %Y %H:%M'), self.title))
def __iter__(self):
return iter(self.articles)
@ -118,6 +119,12 @@ class Feed(object):
def __str__(self):
return repr(self)
def __bool__(self):
for article in self:
if getattr(article, 'downloaded', False):
return True
return False
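
One note on the truth test just added: Python 2 consults __nonzero__ rather than __bool__, so a version of the same idea that works on both interpreter lines would alias the two. A minimal sketch (the class here is a stand-in, not the real Feed):

class FeedSketch(object):
    def __init__(self, articles=()):
        self.articles = list(articles)

    def __iter__(self):
        return iter(self.articles)

    def __bool__(self):
        # A feed is "truthy" only if at least one article was downloaded.
        return any(getattr(a, 'downloaded', False) for a in self.articles)

    __nonzero__ = __bool__   # Python 2 name for the same hook

The index template later in this commit uses the same truth test (py:if="feed") to skip feeds that downloaded nothing.
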
def feed_from_xml(raw_xml, title=None, oldest_article=7, max_articles_per_feed=100):

View File

@ -18,21 +18,24 @@ from libprs500.web.feeds.news import BasicNewsRecipe
''''''
import sys, os, logging
from libprs500.web.recipes import get_feed, compile_recipe
from libprs500.web.feeds.recipes import get_builtin_recipe, compile_recipe, titles
from libprs500.web.fetch.simple import option_parser as _option_parser
def option_parser(usage='''\
%prog [options] ARG
%%prog [options] ARG
%prog parses an online source of articles, like an RSS or ATOM feed and
%%prog parses an online source of articles, like an RSS or ATOM feed and
fetches the article contents organized in a nice hierarchy.
ARG can be one of:
file name - %prog will try to load a recipe from the file
builtin recipe title - %prog will load the builtin recipe and use it to fetch the feed. For e.g. Newsweek or "The BBC" or "The New York Times"
recipe as a string - $prog will load the recipe directly from the string arg.
'''):
file name - %%prog will try to load a recipe from the file
builtin recipe title - %%prog will load the builtin recipe and use it to fetch the feed. For e.g. Newsweek or "The BBC" or "The New York Times"
recipe as a string - %%prog will load the recipe directly from the string arg.
Available builtin recipes are:
%s
'''%(unicode(list(titles))[1:-1])):
p = _option_parser(usage=usage)
p.remove_option('--max-recursions')
p.remove_option('--base-dir')
@ -86,7 +89,7 @@ def main(args=sys.argv, notification=None, handler=None):
else:
notification = no_progress_bar
if len(args) != 2:
if len(args) != 2 and opts.feeds is None:
p.print_help()
return 1
@ -96,11 +99,16 @@ def main(args=sys.argv, notification=None, handler=None):
else:
try:
if os.access(args[1], os.R_OK):
recipe = compile_recipe(open(args[1]).read())
try:
recipe = compile_recipe(open(args[1]).read())
except:
import traceback
traceback.print_exc()
return 1
else:
raise Exception('')
raise Exception('not file')
except:
recipe = get_feed(args[1])
recipe = get_builtin_recipe(args[1])
if recipe is None:
recipe = compile_recipe(args[1])
@ -111,9 +119,10 @@ def main(args=sys.argv, notification=None, handler=None):
return 1
if handler is None:
from libprs500 import ColoredFormatter
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.DEBUG if opts.debug else logging.INFO if opts.verbose else logging.WARN)
handler.setFormatter(logging.Formatter('%(levelname)s: %(message)s'))
handler.setFormatter(ColoredFormatter('%(levelname)s: %(message)s\n')) # The trailing newline is needed because of the progress bar
logging.getLogger('feeds2disk').addHandler(handler)
recipe = recipe(opts, p, notification)
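
To summarize the lookup order implemented above, a hedged sketch; the two helpers are passed in rather than imported, since their real home is libprs500.web.feeds.recipes:

import os

def resolve_recipe(arg, compile_recipe, get_builtin_recipe):
    # 1. a readable file containing recipe source,
    # 2. the title of a builtin recipe,
    # 3. recipe source passed directly as a string.
    if os.access(arg, os.R_OK):
        return compile_recipe(open(arg).read())
    recipe = get_builtin_recipe(arg)
    if recipe is None:
        recipe = compile_recipe(arg)
    return recipe
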

View File

@ -17,11 +17,13 @@
The backend to parse feeds and create HTML that can then be converted
to an ebook.
'''
import logging, os, cStringIO, traceback, time
import logging, os, cStringIO, time, itertools, traceback
import urlparse
from libprs500 import browser
from libprs500 import browser, __appname__
from libprs500.ebooks.BeautifulSoup import BeautifulSoup
from libprs500.ebooks.metadata.opf import OPFCreator
from libprs500.ebooks.metadata import MetaInformation
from libprs500.web.feeds import feed_from_xml, templates
from libprs500.web.fetch.simple import option_parser as web2disk_option_parser
from libprs500.web.fetch.simple import RecursiveFetcher
@ -35,7 +37,10 @@ class BasicNewsRecipe(object):
#: The title to use for the ebook
#: @type: string
title = 'Unknown News Source'
title = _('Unknown News Source')
#: The author of this recipe
__author__ = _('Unknown')
#: Maximum number of articles to download from each feed
#: @type: integer
@ -55,17 +60,18 @@ class BasicNewsRecipe(object):
delay = 0
#: Number of simultaneous downloads. Set to 1 if the server is picky.
#: Automatically reduced to 1 if L{delay} > 0
#: @type: integer
simultaneous_downloads = 5
#: Timeout for fetching files from server in seconds
#: @type: integer
timeout = 10
timeout = 120
#: The format string for the date shown on the first page
#: By default: Day Name Day Number Month Name Year
#: @type: string
timefmt = ' %a, %d %b %Y'
timefmt = ' [%a, %d %b %Y]'
#: Max number of characters in the short description.
#: @type: integer
@ -102,7 +108,7 @@ class BasicNewsRecipe(object):
#: List of options to pass to html2lrf, to customize generation of LRF ebooks.
#: @type: list of strings
html2lrf_options = []
html2lrf_options = ['--page-break-before', '$']
#: List of tags to be removed. Specified tags are removed from downloaded HTML.
#: A tag is specified as a dictionary of the form::
@ -114,9 +120,23 @@ class BasicNewsRecipe(object):
#: U{http://www.crummy.com/software/BeautifulSoup/documentation.html#The basic find method: findAll(name, attrs, recursive, text, limit, **kwargs)}
#: A common example::
#: remove_tags = [dict(name='div', attrs={'class':'advert'})]
#: This will remove all <div class="advert"> tags and all their children from the downloaded HTML.
#: This will remove all <div class="advert"> tags and all their children from the downloaded HTML.
#: @type: list
remove_tags = []
#: Remove all tags that occur after the specified tag.
#: For the format for specifying a tag see L{remove_tags}.
#: For example, C{remove_tags_after = dict(id='content')} will remove all
#: tags after the element with id C{content}.
remove_tags_after = None
#: Keep only the specified tags and their children.
#: For the format for specifying tags see L{remove_tags}.
#: If this list is not empty, then the <body> element will be emptied and re-filled with
#: the tags that match the entries in this list.
#: @type: list
keep_only_tags = []
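
A hedged example of how these tag options might be combined in a recipe (as the fetcher changes later in this commit show, each entry is a BeautifulSoup findAll specification); the site, class names and ids are hypothetical:

from libprs500.web.feeds.news import BasicNewsRecipe

class ExampleRecipe(BasicNewsRecipe):
    title = 'Example News Source'
    feeds = [('Front page', 'http://example.com/rss.xml')]

    # Empty <body> and keep only the article container and its children.
    keep_only_tags = [dict(name='div', id='article-body')]
    # Drop everything that follows the byline block.
    remove_tags_after = dict(id='byline')
    # Remove ad containers wherever they appear.
    remove_tags = [dict(name='div', attrs={'class': 'advert'})]
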
#: List of regexp substitution rules to run on the downloaded HTML. Each element of the
#: list should be a two element tuple. The first element of the tuple should
#: be a compiled regular expression and the second a callable that takes
@ -126,6 +146,13 @@ class BasicNewsRecipe(object):
# See the built-in profiles for examples of these settings.
def get_cover_url(self):
'''
Return a URL to the cover image for this issue or None.
@rtype: string or None
'''
return getattr(self, 'cover_url', None)
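
Since get_cover_url() above simply falls back to a cover_url attribute, a recipe with a static cover can just declare one; a hypothetical recipe, with a placeholder URL:

from libprs500.web.feeds.news import BasicNewsRecipe

class StaticCoverRecipe(BasicNewsRecipe):
    title = 'Example News Source'
    cover_url = 'http://example.com/current-cover.jpg'   # picked up by get_cover_url()
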
def get_feeds(self):
'''
Return a list of RSS feeds to fetch for this profile. Each element of the list
@ -156,7 +183,21 @@ class BasicNewsRecipe(object):
def preprocess_html(self, soup):
'''
This function is called with the source of each downloaded HTML file.
This function is called with the source of each downloaded HTML file, before
it is parsed for links and images.
It can be used to do arbitrarily powerful pre-processing on the HTML.
@param soup: A U{BeautifulSoup<http://www.crummy.com/software/BeautifulSoup/documentation.html>}
instance containing the downloaded HTML.
@type soup: A U{BeautifulSoup<http://www.crummy.com/software/BeautifulSoup/documentation.html>} instance
@return: It must return soup (after having done any needed preprocessing)
@rtype: A U{BeautifulSoup<http://www.crummy.com/software/BeautifulSoup/documentation.html>} instance
'''
return soup
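
For instance, a hedged override that strips share widgets before the fetcher resolves links and images ('shareTools' is a placeholder class name):

from libprs500.web.feeds.news import BasicNewsRecipe

class CleaningRecipe(BasicNewsRecipe):
    title = 'Example News Source'

    def preprocess_html(self, soup):
        # Runs on each page before its links and images are downloaded.
        for div in soup.findAll('div', attrs={'class': 'shareTools'}):
            div.extract()
        return soup
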
def postprocess_html(self, soup):
'''
This function is called with the source of each downloaded HTML file, after
it is parsed for links and images.
It can be used to do arbitrarily powerful post-processing on the HTML.
@param soup: A U{BeautifulSoup<http://www.crummy.com/software/BeautifulSoup/documentation.html>}
instance containing the downloaded HTML.
@ -210,6 +251,7 @@ class BasicNewsRecipe(object):
self.browser = self.get_browser()
self.image_map, self.image_counter = {}, 1
self.css_map = {}
web2disk_cmdline = [ 'web2disk',
'--timeout', str(self.timeout),
@ -233,14 +275,18 @@ class BasicNewsRecipe(object):
web2disk_cmdline.extend(['--filter-regexp', reg])
self.web2disk_options = web2disk_option_parser().parse_args(web2disk_cmdline)[0]
self.web2disk_options.remove_tags = self.remove_tags
self.web2disk_options.preprocess_regexps = self.preprocess_regexps
self.web2disk_options.preprocess_html = self.preprocess_html
for extra in ('keep_only_tags', 'remove_tags', 'preprocess_regexps',
'preprocess_html', 'remove_tags_after', 'postprocess_html'):
setattr(self.web2disk_options, extra, getattr(self, extra))
if self.delay > 0:
self.simultaneous_downloads = 1
self.navbar = templates.NavBarTemplate()
self.max_articles_per_feed -= 1
self.html2lrf_options.append('--use-spine')
self.failed_downloads = []
self.partial_failures = []
def download(self):
'''
@ -250,9 +296,26 @@ class BasicNewsRecipe(object):
@return: Path to index.html
@rtype: string
'''
self.report_progress(0, _('Initialized'))
self.report_progress(0, _('Trying to download cover...'))
self.download_cover()
res = self.build_index()
self.cleanup()
self.report_progress(1, _('Download finished'))
if self.failed_downloads:
self.logger.warning(_('Failed to download the following articles:'))
for feed, article, debug in self.failed_downloads:
self.logger.warning(article.title+_(' from ')+feed)
self.logger.debug(article.url)
self.logger.debug(debug)
if self.partial_failures:
self.logger.warning(_('Failed to download parts of the following articles:'))
for feed, atitle, aurl, debug in self.partial_failures:
self.logger.warning(atitle + _(' from ') + feed)
self.logger.debug(aurl)
self.logger.warning(_('\tFailed links:'))
for l, tb in debug:
self.logger.warning(l)
self.logger.debug(tb)
return res
def feeds2index(self, feeds):
@ -294,11 +357,14 @@ class BasicNewsRecipe(object):
return logger, out
def fetch_article(self, url, dir, logger):
fetcher = RecursiveFetcher(self.web2disk_options, logger, self.image_map)
fetcher = RecursiveFetcher(self.web2disk_options, logger, self.image_map, self.css_map)
fetcher.base_dir = dir
fetcher.current_dir = dir
fetcher.show_progress = False
return fetcher.start_fetch(url)
res, path, failures = fetcher.start_fetch(url), fetcher.downloaded_paths, fetcher.failed_links
if not res:
raise Exception(_('Could not fetch article. Run with --debug to see the reason'))
return res, path, failures
def build_index(self):
self.report_progress(0, _('Fetching feeds...'))
@ -331,58 +397,111 @@ class BasicNewsRecipe(object):
req.stream = stream
req.feed = feed
req.article = article
req.feed_dir = feed_dir
self.jobs.append(req)
self.jobs_done = 0
tp = ThreadPool(self.simultaneous_downloads)
for req in self.jobs:
tp.putRequest(req, block=True, timeout=0)
self.report_progress(0, _('Starting download [%d thread(s)]...')%self.simultaneous_downloads)
while True:
try:
tp.poll(True)
tp.poll()
time.sleep(0.1)
except NoResultsPending:
break
html = self.feed2index(feed)
open(os.path.join(feed_dir, 'index.html'), 'wb').write(html)
for f, feed in enumerate(feeds):
html = self.feed2index(feed)
feed_dir = os.path.join(self.output_dir, 'feed_%d'%f)
open(os.path.join(feed_dir, 'index.html'), 'wb').write(html)
self.create_opf(feeds)
self.report_progress(1, _('Feeds downloaded to %s')%index)
return index
def download_cover(self):
self.cover_path = None
try:
cu = self.get_cover_url()
except Exception, err:
cu = None
self.logger.error(_('Could not download cover: %s')%str(err))
self.logger.debug(traceback.format_exc())
if cu is not None:
ext = cu.rpartition('.')[-1]
ext = ext.lower() if ext else 'jpg'
self.report_progress(1, _('Downloading cover from %s')%cu)
cpath = os.path.join(self.output_dir, 'cover.'+ext)
cfile = open(cpath, 'wb')
cfile.write(self.browser.open(cu).read())
self.cover_path = cpath
def create_opf(self, feeds, dir=None):
if dir is None:
dir = self.output_dir
mi = MetaInformation(self.title + time.strftime(self.timefmt), [__appname__])
opf = OPFCreator(mi)
opf_path = os.path.join(dir, 'index.opf')
cpath = getattr(self, 'cover_path', None)
if cpath is not None and os.access(cpath, os.R_OK):
opf.cover = cpath
entries = ['index.html']
for i, f in enumerate(feeds):
entries.append('feed_%d/index.html'%i)
for j, a in enumerate(f):
if getattr(a, 'downloaded', False):
adir = 'feed_%d/article_%d/'%(i, j)
entries.append('%sindex.html'%adir)
for sp in a.sub_pages:
prefix = os.path.commonprefix([opf_path, sp])
relp = sp[len(prefix):]
entries.append(relp.replace(os.sep, '/'))
opf.create_manifest(itertools.izip(entries, itertools.repeat('text/html')))
opf.create_spine(entries)
opf.write(open(opf_path, 'wb'))
def article_downloaded(self, request, result):
index = os.path.join(os.path.dirname(result), 'index.html')
os.rename(result, index)
index = os.path.join(os.path.dirname(result[0]), 'index.html')
os.rename(result[0], index)
src = open(index, 'rb').read().decode('utf-8')
f, a = request.requestID
soup = BeautifulSoup(src)
body = soup.find('body')
if body is not None:
top = self.navbar.generate(False, f, a, len(request.feed), not self.has_single_feed).render(doctype='xhtml')
bottom = self.navbar.generate(True, f, a, len(request.feed), not self.has_single_feed).render(doctype='xhtml')
top = BeautifulSoup(top).find('div')
bottom = BeautifulSoup(bottom).find('div')
body.insert(0, top)
body.insert(len(body.contents), bottom)
open(index, 'wb').write(unicode(soup).encode('utf-8'))
article = request.article
self.logger.debug(_('\nDownloaded article %s from %s\n%s')%(article.title, article.url, request.stream.getvalue()))
article.url = result
self.logger.debug(_('\nDownloaded article %s from %s\n%s')%(article.title, article.url, request.stream.getvalue().decode('utf-8', 'ignore')))
article.url = result[0]
article.downloaded = True
article.sub_pages = result[1][1:]
self.jobs_done += 1
self.report_progress(float(self.jobs_done)/len(self.jobs), _('Article downloaded: %s')%article.title)
if result[2]:
self.partial_failures.append((request.feed.title, article.title, article.url, result[2]))
def error_in_article_download(self, request, exc_info):
def error_in_article_download(self, request, traceback):
self.jobs_done += 1
self.logger.error(_('Failed to download article: %s from %s')%(request.article.title, request.article.url))
self.logger.debug(traceback.format_exc(*exc_info))
self.logger.debug(request.stream.getvalue())
self.logger.error(_('Failed to download article: %s from %s\n')%(request.article.title, request.article.url))
debug = request.stream.getvalue().decode('utf-8', 'ignore')
self.logger.debug(debug)
self.logger.debug(traceback)
self.logger.debug('\n')
self.report_progress(float(self.jobs_done)/len(self.jobs), _('Article download failed: %s')%request.article.title)
self.failed_downloads.append((request.feed.title, request.article, debug))
def parse_feeds(self):
'''
@ -404,5 +523,3 @@ class BasicNewsRecipe(object):
max_articles_per_feed=self.max_articles_per_feed))
return parsed_feeds

View File

@ -0,0 +1,79 @@
#!/usr/bin/env python
## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''
Builtin recipes.
'''
recipes = ['newsweek']
import re
from libprs500.web.feeds.news import BasicNewsRecipe
from libprs500.ebooks.lrf.web.profiles import DefaultProfile, FullContentProfile
from libprs500.ebooks.lrf.web import available_profiles
basic_recipes = (BasicNewsRecipe, DefaultProfile, FullContentProfile)
basic_recipe_names = (i.__name__ for i in basic_recipes)
#: Compiled builtin recipe/profile classes
def load_recipe(module, package='libprs500.web.feeds.recipes'):
module = __import__(package+'.'+module, fromlist=[''])
for attr in dir(module):
obj = getattr(module, attr)
if type(obj) is not type:
continue
recipe = False
for b in obj.__bases__:
if b in basic_recipes:
recipe = True
break
if not recipe:
continue
if obj not in basic_recipes:
return obj
recipes = [load_recipe(i) for i in recipes]
def compile_recipe(src):
'''
Compile the code in src and return the first object that is a recipe or profile.
@return: Recipe/Profile class or None, if no such class was found in C{src}
'''
locals = {}
exec src in globals(), locals
for obj in locals.values():
if type(obj) is type and obj.__name__ not in basic_recipe_names:
for base in obj.__bases__:
if base in basic_recipes:
return obj
return None
def get_builtin_recipe(title):
'''
Return a builtin recipe/profile class whose title == C{title} or None if no such
recipe exists.
@type title: string
@rtype: class or None
'''
for r in recipes:
if r.title == title:
return r
titles = set([r.title for r in recipes])
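
A hedged usage sketch of the helpers above; the recipe source mirrors the old _feeds placeholder that this commit removes:

from libprs500.web.feeds.recipes import compile_recipe, get_builtin_recipe, titles

src = 'class Temp(BasicNewsRecipe):\n\ttitle = "temp"'
temp_class = compile_recipe(src)           # first recipe class defined in src
newsweek = get_builtin_recipe('Newsweek')  # None if no builtin has that title
print(sorted(titles))                      # titles of all builtin recipes
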

View File

@ -0,0 +1,90 @@
#!/usr/bin/env python
## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
import re
from libprs500.web.feeds.news import BasicNewsRecipe
from libprs500.ebooks.BeautifulSoup import BeautifulSoup
class Newsweek(BasicNewsRecipe):
title = 'Newsweek'
__author__ = 'Kovid Goyal'
feeds = [
('Top News', 'http://feeds.newsweek.com/newsweek/TopNews',),
'http://feeds.newsweek.com/newsweek/columnists/StevenLevy',
('Politics', 'http://feeds.newsweek.com/headlines/politics'),
('Health', 'http://feeds.newsweek.com/headlines/health'),
('Business', 'http://feeds.newsweek.com/headlines/business'),
('Science and Technology', 'http://feeds.newsweek.com/headlines/technology/science'),
('National News', 'http://feeds.newsweek.com/newsweek/NationalNews'),
('World News', 'http://feeds.newsweek.com/newsweek/WorldNews'),
'http://feeds.newsweek.com/newsweek/Columnists/ChristopherDickey',
'http://feeds.newsweek.com/newsweek/Columnists/FareedZakaria',
('Iraq', 'http://feeds.newsweek.com/newsweek/iraq'),
('Society', 'http://feeds.newsweek.com/newsweek/society'),
('Entertainment', 'http://feeds.newsweek.com/newsweek/entertainment'),
'http://feeds.newsweek.com/newsweek/columnists/GeorgeFWill',
'http://feeds.newsweek.com/newsweek/columnists/AnnaQuindlen',
]
# For testing
feeds = feeds[:2]
max_articles_per_feed = 1
keep_only_tags = [dict(name='div', id='content')]
remove_tags = [
dict(name=['script', 'noscript']),
dict(name='div', attrs={'class':['ad', 'SocialLinks', 'SocialLinksDiv', 'channel', 'bot', 'nav', 'top', 'EmailArticleBlock']}),
dict(name='div', attrs={'class':re.compile('box')}),
dict(id=['ToolBox', 'EmailMain', 'EmailArticle', ])
]
recursions = 1
match_regexps = [r'http://www.newsweek.com/id/\S+/page/\d+']
def postprocess_html(self, soup):
divs = list(soup.findAll('div', 'pagination'))
divs[0].extract()
if len(divs) > 1:
soup.find('body')['style'] = 'page-break-after:avoid'
divs[1].extract()
h1 = soup.find('h1')
if h1:
h1.extract()
ai = soup.find('div', 'articleInfo')
ai.extract()
else:
soup.find('body')['style'] = 'page-break-before:always; page-break-after:avoid;'
return soup
def get_current_issue(self):
from urllib2 import urlopen # For some reason mechanize fails
home = urlopen('http://www.newsweek.com').read()
soup = BeautifulSoup(home)
img = soup.find('img', alt='Current Magazine')
if img and img.parent.has_key('href'):
return urlopen(img.parent['href']).read()
def get_cover_url(self):
ci = self.get_current_issue()
if ci is not None:
soup = BeautifulSoup(ci)
img = soup.find(alt='Cover')
if img is not None and img.has_key('src'):
small = img['src']
return small.replace('coversmall', 'coverlarge')
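
A hedged sketch of exercising the new recipe through feeds2disk's main() rather than the command line; the exact argument convention is an assumption based on the len(args) != 2 check in main.py above:

from libprs500.web.feeds.main import main

# Roughly equivalent to running: feeds2disk Newsweek
main(['feeds2disk', 'Newsweek'])
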

View File

@ -104,7 +104,7 @@ class IndexTemplate(Template):
<p style="text-align:right">${datetime.now().strftime(datefmt)}</p>
<ul>
<py:for each="i, feed in enumerate(feeds)">
<li id="feed_${str(i)}">
<li py:if="feed" id="feed_${str(i)}">
<a class="feed" href="${'feed_%d/index.html'%i}">${feed.title}</a>
</li>
</py:for>
@ -136,7 +136,7 @@ class FeedTemplate(Template):
${style}
</style>
</head>
<body>
<body style="page-break-before:always">
<h2>${feed.title}</h2>
<py:if test="feed.image">
<div class="feed_image">
@ -144,7 +144,7 @@ class FeedTemplate(Template):
</div>
</py:if>
<ul>
<py:for each="i, article in enumerate(feed)">
<py:for each="i, article in enumerate(feed.articles)">
<li id="${'article_%d'%i}" py:if="getattr(article, 'downloaded', False)">
<a class="article" href="${article.url}">${article.title}</a>
<span class="article_date">${article.localtime.strftime(" [%a, %d %b %H:%M]")}</span>

View File

@ -17,12 +17,12 @@ Fetch a webpage and its links recursively. The webpages are saved to disk in
UTF-8 encoding with any charset declarations removed.
'''
from __future__ import with_statement
import sys, socket, os, urlparse, codecs, logging, re, time, copy, urllib2, threading
import sys, socket, os, urlparse, codecs, logging, re, time, copy, urllib2, threading, traceback
from urllib import url2pathname
from httplib import responses
from libprs500 import setup_cli_handlers, browser, sanitize_file_name, OptionParser
from libprs500.ebooks.BeautifulSoup import BeautifulSoup
from libprs500.ebooks.BeautifulSoup import BeautifulSoup, Tag
from libprs500.ebooks.chardet import xml_to_unicode
class FetchError(Exception):
@ -37,10 +37,11 @@ def basename(url):
return res
def save_soup(soup, target):
for meta in soup.findAll('meta', content=True):
nm = Tag(soup, '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
for meta in soup.find('meta', content=True):
if 'charset' in meta['content']:
meta.extract()
f = codecs.open(target, 'w', 'utf8')
meta.replaceWith(nm)
f = codecs.open(target, 'w', 'utf-8')
f.write(unicode(soup))
f.close()
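
The charset normalisation can be illustrated stand-alone with the bundled BeautifulSoup 3; in this sketch the content attribute is rewritten in place, which differs from the tag replacement above:

from libprs500.ebooks.BeautifulSoup import BeautifulSoup

html = ('<html><head><meta http-equiv="Content-Type" '
        'content="text/html; charset=iso-8859-1" /></head><body>x</body></html>')
soup = BeautifulSoup(html)
for meta in soup.findAll('meta', content=True):
    if 'charset' in meta['content'].lower():
        meta['content'] = 'text/html; charset=UTF-8'
print(unicode(soup))   # the declaration now advertises UTF-8
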
@ -55,7 +56,7 @@ class RecursiveFetcher(object):
# )
CSS_IMPORT_PATTERN = re.compile(r'\@import\s+url\((.*?)\)', re.IGNORECASE)
def __init__(self, options, logger, image_map={}):
def __init__(self, options, logger, image_map={}, css_map={}):
self.logger = logger
self.base_dir = os.path.abspath(os.path.expanduser(options.dir))
if not os.path.exists(self.base_dir):
@ -74,20 +75,44 @@ class RecursiveFetcher(object):
self.filemap = {}
self.imagemap = image_map
self.imagemap_lock = threading.RLock()
self.stylemap = {}
self.stylemap = css_map
self.stylemap_lock = threading.RLock()
self.downloaded_paths = []
self.current_dir = self.base_dir
self.files = 0
self.preprocess_regexps = getattr(options, 'preprocess_regexps', [])
self.remove_tags = getattr(options, 'remove_tags', [])
self.remove_tags_after = getattr(options, 'remove_tags_after', None)
self.keep_only_tags = getattr(options, 'keep_only_tags', [])
self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup)
self.postprocess_html_ext= getattr(options, 'postprocess_html', lambda soup: soup)
self.download_stylesheets = not options.no_stylesheets
self.show_progress = True
self.failed_links = []
def get_soup(self, src):
nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
nmassage.extend(self.preprocess_regexps)
soup = BeautifulSoup(xml_to_unicode(src, self.verbose)[0], markupMassage=nmassage)
if self.keep_only_tags:
body = Tag(soup, 'body')
for spec in self.keep_only_tags:
for tag in soup.find('body').findAll(**spec):
body.insert(len(body.contents), tag)
soup.find('body').replaceWith(body)
if self.remove_tags_after is not None:
tag = soup.find(**self.remove_tags_after)
while tag is not None and tag.name != 'body':
after = tag.nextSibling
while after is not None:
ns = after.nextSibling
after.extract()
after = ns
tag = tag.parent
for kwds in self.remove_tags:
for tag in soup.findAll(**kwds):
tag.extract()
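
A self-contained illustration of the keep_only_tags body rebuild performed above, run against a toy page with the bundled BeautifulSoup 3 (tag names and ids are made up):

from libprs500.ebooks.BeautifulSoup import BeautifulSoup, Tag

html = ('<html><body><div id="nav">navigation</div>'
        '<div id="content">story text</div>'
        '<div class="comments">comments</div></body></html>')
soup = BeautifulSoup(html)

# New <body> filled only with the matching tags, then swapped into the tree.
body = Tag(soup, 'body')
for tag in soup.find('body').findAll(id='content'):
    body.insert(len(body.contents), tag)
soup.find('body').replaceWith(body)

print(unicode(soup))   # only <div id="content"> survives inside <body>
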
@ -105,7 +130,12 @@ class RecursiveFetcher(object):
except urllib2.URLError, err:
if hasattr(err, 'code') and responses.has_key(err.code):
raise FetchError, responses[err.code]
raise err
if err.reason[0] == 104: # Connection reset by peer
self.logger.debug('Connection reset by peer, retrying in 1 second.')
time.sleep(1)
f = self.browser.open(url)
else:
raise err
finally:
self.last_fetch_at = time.time()
return f
@ -146,9 +176,10 @@ class RecursiveFetcher(object):
iurl = tag['href']
if not urlparse.urlsplit(iurl).scheme:
iurl = urlparse.urljoin(baseurl, iurl, False)
if self.stylemap.has_key(iurl):
tag['href'] = self.stylemap[iurl]
continue
with self.stylemap_lock:
if self.stylemap.has_key(iurl):
tag['href'] = self.stylemap[iurl]
continue
try:
f = self.fetch_url(iurl)
except Exception, err:
@ -157,7 +188,8 @@ class RecursiveFetcher(object):
continue
c += 1
stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
self.stylemap[iurl] = stylepath
with self.stylemap_lock:
self.stylemap[iurl] = stylepath
open(stylepath, 'wb').write(f.read())
tag['href'] = stylepath
else:
@ -168,9 +200,10 @@ class RecursiveFetcher(object):
iurl = m.group(1)
if not urlparse.urlsplit(iurl).scheme:
iurl = urlparse.urljoin(baseurl, iurl, False)
if self.stylemap.has_key(iurl):
ns.replaceWith(src.replace(m.group(1), self.stylemap[iurl]))
continue
with self.stylemap_lock:
if self.stylemap.has_key(iurl):
ns.replaceWith(src.replace(m.group(1), self.stylemap[iurl]))
continue
try:
f = self.fetch_url(iurl)
except Exception, err:
@ -179,7 +212,8 @@ class RecursiveFetcher(object):
continue
c += 1
stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
self.stylemap[iurl] = stylepath
with self.stylemap_lock:
self.stylemap[iurl] = stylepath
open(stylepath, 'wb').write(f.read())
ns.replaceWith(src.replace(m.group(1), stylepath))
@ -214,7 +248,7 @@ class RecursiveFetcher(object):
open(imgpath, 'wb').write(f.read())
tag['src'] = imgpath
def absurl(self, baseurl, tag, key):
def absurl(self, baseurl, tag, key, filter=True):
iurl = tag[key]
parts = urlparse.urlsplit(iurl)
if not parts.netloc and not parts.path:
@ -224,7 +258,7 @@ class RecursiveFetcher(object):
if not self.is_link_ok(iurl):
self.logger.debug('Skipping invalid link: %s', iurl)
return None
if not self.is_link_wanted(iurl):
if filter and not self.is_link_wanted(iurl):
self.logger.debug('Filtered link: '+iurl)
return None
return iurl
@ -256,12 +290,12 @@ class RecursiveFetcher(object):
prev_dir = self.current_dir
try:
self.current_dir = diskpath
for tag in soup.findAll(lambda tag: tag.name.lower()=='a' and tag.has_key('href')):
for tag in soup.findAll('a', href=True):
if self.show_progress:
print '.',
sys.stdout.flush()
sys.stdout.flush()
iurl = self.absurl(baseurl, tag, 'href')
iurl = self.absurl(baseurl, tag, 'href', filter=recursion_level != 0)
if not iurl:
continue
nurl = self.normurl(iurl)
@ -293,6 +327,7 @@ class RecursiveFetcher(object):
self.process_stylesheets(soup, f.geturl())
res = os.path.join(linkdiskpath, basename(iurl))
self.downloaded_paths.append(res)
self.filemap[nurl] = res
if recursion_level < self.max_recursions:
self.logger.debug('Processing links...')
@ -301,9 +336,11 @@ class RecursiveFetcher(object):
self.process_return_links(soup, iurl)
self.logger.debug('Recursion limit reached. Skipping links in %s', iurl)
save_soup(soup, res)
save_soup(self.postprocess_html_ext(soup), res)
self.localize_link(tag, 'href', res)
except Exception, err:
self.failed_links.append((iurl, traceback.format_exc()))
self.logger.warning('Could not fetch link %s', iurl)
self.logger.debug('Error: %s', str(err), exc_info=True)
finally:

View File

@ -1,63 +0,0 @@
#!/usr/bin/env python
## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''
Contains recipes for various common news sources and websites.
'''
import re
from libprs500.web.feeds.news import BasicNewsRecipe
_basic_recipes = (BasicNewsRecipe,)
_basic_recipe_names = (i.__name__ for i in _basic_recipes)
def compile_recipe(src):
'''
Compile the code in src and return the first object that is
'''
locals = {}
exec src in globals(), locals
for obj in locals.values():
if type(obj) is type and obj.__name__ not in _basic_recipe_names:
for base in obj.__bases__:
if base in _basic_recipes:
return obj
return None
def get_feed(title):
'''
Return a builtin recipe class whose title == C{title} or None if no such
recipe exists.
@type title: string
@rtype: class or None
'''
if isinstance(_feeds[0], basestring):
for i, val in enumerate(_feeds):
recipe = compile_recipe(val)
if recipe is None:
raise RuntimeError('The builtin Recipe #%d is invalid.'%i)
_feeds[i] = recipe
for recipe in _feeds:
if recipe.title == title:
return recipe
return None
#: Recipes to be used with feeds2disk
_feeds = ['class Temp(BasicNewsRecipe):\n\ttitle="temp"']