feeds2disk improved to the point of being able to download Newsweek. Added new Newsweek recipe.

Kovid Goyal 2008-03-12 20:59:29 +00:00
parent 756de168fe
commit 2ccf260f7d
15 changed files with 477 additions and 139 deletions

View File

@@ -23,13 +23,14 @@ from gettext import GNUTranslations
 from math import floor
 from optparse import OptionParser as _OptionParser
 from optparse import IndentedHelpFormatter
+from logging import Formatter
 from ttfquery import findsystem, describe
 from libprs500.translations.msgfmt import make
 from libprs500.ebooks.chardet import detect
 from libprs500.terminfo import TerminalController
-terminal_controller = TerminalController()
+terminal_controller = TerminalController(sys.stdout)
 iswindows = 'win32' in sys.platform.lower() or 'win64' in sys.platform.lower()
 isosx = 'darwin' in sys.platform.lower()
@@ -51,6 +52,25 @@ __builtin__.__dict__['_'] = lambda s: s
 class CommandLineError(Exception):
     pass
+class ColoredFormatter(Formatter):
+    def format(self, record):
+        ln = record.__dict__['levelname']
+        col = ''
+        if ln == 'CRITICAL':
+            col = terminal_controller.YELLOW
+        elif ln == 'ERROR':
+            col = terminal_controller.RED
+        elif ln in ['WARN', 'WARNING']:
+            col = terminal_controller.BLUE
+        elif ln == 'INFO':
+            col = terminal_controller.GREEN
+        elif ln == 'DEBUG':
+            col = terminal_controller.CYAN
+        record.__dict__['levelname'] = col + record.__dict__['levelname'] + terminal_controller.NORMAL
+        return Formatter.format(self, record)
 def setup_cli_handlers(logger, level):
     logger.setLevel(level)
     if level == logging.WARNING:
@@ -187,9 +207,9 @@ def extract(path, dir):
         raise Exception('Unknown archive type')
     extractor(path, dir)
-def browser():
+def browser(honor_time=False):
     opener = mechanize.Browser()
-    opener.set_handle_refresh(True)
+    opener.set_handle_refresh(True, honor_time=honor_time)
     opener.set_handle_robots(False)
     opener.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; i686 Linux; en_US; rv:1.8.0.4) Gecko/20060508 Firefox/1.5.0.4')]
     return opener
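
For reference, the new ColoredFormatter is wired up the same way feeds2disk does further down in this commit; a minimal sketch (logger name and format string taken from main.py below):

    import logging, sys
    from libprs500 import ColoredFormatter

    # Colorize the level name on a tty; when stdout is not a terminal the
    # TerminalController(sys.stdout) capabilities are empty, so output stays plain.
    handler = logging.StreamHandler(sys.stdout)
    handler.setFormatter(ColoredFormatter('%(levelname)s: %(message)s'))
    log = logging.getLogger('feeds2disk')
    log.addHandler(handler)
    log.warning('printed with a colored WARNING prefix on a tty')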

View File

@@ -147,7 +147,7 @@ class Delegator(object):
             d.parent = self
             methods = d.getMethods()
             self.delegatedMethods += methods
             for m in methods:
                 setattr(self, m, getattr(d, m))
     """

View File

@@ -595,6 +595,11 @@ class OPFCreator(OPF):
         self.uid = mi.uid
     def create_manifest(self, entries):
+        '''
+        Create <manifest>
+        @param entries: List of (URL, mime-type)
+        @type entries: list of 2-tuples
+        '''
         doc = dom.parseString(self.soup.__str__('UTF-8').strip())
         package = doc.documentElement
         manifest = doc.createElement('manifest')
@@ -616,6 +621,11 @@ class OPFCreator(OPF):
     def create_spine(self, entries):
+        '''
+        Create the <spine> element. Must first call L{create_manifest}.
+        @param: List of paths
+        @type param: list of strings
+        '''
         doc = dom.parseString(self.soup.__str__('UTF-8').strip())
         package = doc.documentElement
         spine = doc.createElement('spine')
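
The new docstrings describe the calling convention used by create_opf() in news.py later in this commit; a minimal sketch, with placeholder entry paths assumed to be relative to the OPF file:

    from libprs500.ebooks.metadata import MetaInformation
    from libprs500.ebooks.metadata.opf import OPFCreator

    mi = MetaInformation('Newsweek [Wed, 12 Mar 2008]', ['libprs500'])
    opf = OPFCreator(mi)
    entries = ['index.html', 'feed_0/index.html', 'feed_0/article_0/index.html']
    # create_manifest() takes (URL, mime-type) 2-tuples; create_spine() takes the
    # bare paths and must be called after create_manifest().
    opf.create_manifest([(e, 'text/html') for e in entries])
    opf.create_spine(entries)
    opf.write(open('index.opf', 'wb'))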

View File

@@ -74,6 +74,34 @@ def options(option_parser):
         opts.extend(opt._long_opts)
     return opts
+def opts_and_words(name, op, words):
+    opts = ' '.join(options(op))
+    words = [repr(w) for w in words]
+    words = ' '.join(words)
+    return '_'+name+'()'+\
+'''
+{
+    local cur prev opts
+    COMPREPLY=()
+    cur="${COMP_WORDS[COMP_CWORD]}"
+    opts="%s"
+    words="%s"
+    case "${cur}" in
+        -* )
+            COMPREPLY=( $(compgen -W "${opts}" -- ${cur}) )
+            return 0
+            ;;
+        * )
+            COMPREPLY=( $(compgen -W "${words}" -- ${cur}) )
+            return 0
+            ;;
+    esac
+}
+complete -F _'''%(opts, words) + name + ' ' + name +"\n\n"
 def opts_and_exts(name, op, exts):
     opts = ' '.join(options(op))
     exts.extend([i.upper() for i in exts])
@@ -135,6 +163,8 @@ def setup_completion(fatal_errors):
         from libprs500.gui2.lrf_renderer.main import option_parser as lrfviewerop
         from libprs500.ebooks.lrf.pdf.reflow import option_parser as pdfhtmlop
         from libprs500.ebooks.mobi.reader import option_parser as mobioeb
+        from libprs500.web.feeds.main import option_parser as feeds2disk
+        from libprs500.web.feeds.recipes import titles as feed_titles
         f = open_file('/etc/bash_completion.d/libprs500')
@@ -159,6 +189,7 @@ def setup_completion(fatal_errors):
         f.write(opts_and_exts('lrfviewer', lrfviewerop, ['lrf']))
         f.write(opts_and_exts('pdfrelow', pdfhtmlop, ['pdf']))
         f.write(opts_and_exts('mobi2oeb', mobioeb, ['mobi', 'prc']))
+        f.write(opts_and_words('feeds2disk', feeds2disk, feed_titles))
         f.write('''
 _prs500_ls()
 {

View File

@@ -15,7 +15,7 @@
 '''
 Used to run jobs in parallel in separate processes.
 '''
-import re, sys, tempfile, os, cPickle, cStringIO, traceback, atexit, binascii, time, subprocess
+import re, sys, tempfile, os, cPickle, traceback, atexit, binascii, time, subprocess
 from functools import partial

View File

@@ -106,7 +106,7 @@ class TerminalController:
         except: return
         # If the stream isn't a tty, then assume it has no capabilities.
-        if not term_stream.isatty(): return
+        if not hasattr(term_stream, 'isatty') or not term_stream.isatty(): return
         # Check the terminal type. If we fail, then assume that the
         # terminal has no capabilities.

View File

@@ -97,7 +97,8 @@ class WorkerThread(threading.Thread):
                 )
             except:
                 request.exception = True
-                self.resultQueue.put((request, sys.exc_info()))
+                import traceback
+                self.resultQueue.put((request, traceback.format_exc()))
     def dismiss(self):
         """Sets a flag to tell the thread to exit when done with current job.

View File

@@ -27,6 +27,7 @@ class Article(object):
     time_offset = datetime.now() - datetime.utcnow()
     def __init__(self, id, title, url, summary, published, content):
+        self.downloaded = False
         self.id = id
         self.title = title
         self.url = url
@@ -103,7 +104,7 @@ class Feed(object):
         if delta.days*24*3600 + delta.seconds <= 24*3600*self.oldest_article:
             self.articles.append(article)
         else:
-            self.logger.debug('Skipping article %s as it is too old.'%title)
+            self.logger.debug('Skipping article %s (%s) from feed %s as it is too old.'%(title, article.localtime.strftime('%a, %d %b, %Y %H:%M'), self.title))
     def __iter__(self):
         return iter(self.articles)
@@ -118,6 +119,12 @@ class Feed(object):
     def __str__(self):
         return repr(self)
+    def __bool__(self):
+        for article in self:
+            if getattr(article, 'downloaded', False):
+                return True
+        return False
 def feed_from_xml(raw_xml, title=None, oldest_article=7, max_articles_per_feed=100):
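
To illustrate how this module is driven (parse_feeds() in news.py below does exactly this), a small sketch; the feed URL is just a placeholder:

    from libprs500 import browser
    from libprs500.web.feeds import feed_from_xml

    raw = browser().open('http://feeds.newsweek.com/newsweek/TopNews').read()  # placeholder feed
    feed = feed_from_xml(raw, title='Top News', oldest_article=7, max_articles_per_feed=100)
    # Articles older than oldest_article days are dropped with the debug message above;
    # article.downloaded starts out False and is flipped by the news downloader.
    for article in feed:
        print article.title, article.url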

View File

@@ -18,21 +18,24 @@ from libprs500.web.feeds.news import BasicNewsRecipe
 '''
 import sys, os, logging
-from libprs500.web.recipes import get_feed, compile_recipe
+from libprs500.web.feeds.recipes import get_builtin_recipe, compile_recipe, titles
 from libprs500.web.fetch.simple import option_parser as _option_parser
 def option_parser(usage='''\
-%prog [options] ARG
+%%prog [options] ARG
-%prog parsers an online source of articles, like an RSS or ATOM feed and
+%%prog parsers an online source of articles, like an RSS or ATOM feed and
 fetches the article contents organized in a nice hierarchy.
 ARG can be one of:
-file name - %prog will try to load a recipe from the file
+file name - %%prog will try to load a recipe from the file
-builtin recipe title - %prog will load the builtin recipe and use it to fetch the feed. For e.g. Newsweek or "The BBC" or "The New York Times"
+builtin recipe title - %%prog will load the builtin recipe and use it to fetch the feed. For e.g. Newsweek or "The BBC" or "The New York Times"
-recipe as a string - $prog will load the recipe directly from the string arg.
+recipe as a string - %%prog will load the recipe directly from the string arg.
-'''):
+Available builtin recipes are:
+%s
+'''%(unicode(list(titles))[1:-1])):
     p = _option_parser(usage=usage)
     p.remove_option('--max-recursions')
     p.remove_option('--base-dir')
@@ -86,7 +89,7 @@ def main(args=sys.argv, notification=None, handler=None):
     else:
         notification = no_progress_bar
-    if len(args) != 2:
+    if len(args) != 2 and opts.feeds is None:
         p.print_help()
         return 1
@@ -96,11 +99,16 @@ def main(args=sys.argv, notification=None, handler=None):
     else:
         try:
             if os.access(args[1], os.R_OK):
-                recipe = compile_recipe(open(args[1]).read())
+                try:
+                    recipe = compile_recipe(open(args[1]).read())
+                except:
+                    import traceback
+                    traceback.print_exc()
+                    return 1
             else:
-                raise Exception('')
+                raise Exception('not file')
         except:
-            recipe = get_feed(args[1])
+            recipe = get_builtin_recipe(args[1])
             if recipe is None:
                 recipe = compile_recipe(args[1])
@@ -111,9 +119,10 @@ def main(args=sys.argv, notification=None, handler=None):
         return 1
     if handler is None:
+        from libprs500 import ColoredFormatter
         handler = logging.StreamHandler(sys.stdout)
         handler.setLevel(logging.DEBUG if opts.debug else logging.INFO if opts.verbose else logging.WARN)
-        handler.setFormatter(logging.Formatter('%(levelname)s: %(message)s'))
+        handler.setFormatter(ColoredFormatter('%(levelname)s: %(message)s\n')) # The trailing newline is need because of the progress bar
         logging.getLogger('feeds2disk').addHandler(handler)
     recipe = recipe(opts, p, notification)
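
Putting the pieces together, invoking the downloader from Python is equivalent to running feeds2disk from a shell; a minimal sketch (the '--verbose'/'--debug' option names are assumed from the opts.verbose/opts.debug checks above):

    import sys
    from libprs500.web.feeds.main import main

    # Same as running `feeds2disk Newsweek` (or `feeds2disk my_recipe.py` for a
    # recipe file); prepend '--verbose' or '--debug' to the argument list for more logging.
    sys.exit(main(['feeds2disk', 'Newsweek']))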

View File

@@ -17,11 +17,13 @@
 The backend to parse feeds and create HTML that can then be converted
 to an ebook.
 '''
-import logging, os, cStringIO, traceback, time
+import logging, os, cStringIO, time, itertools, traceback
 import urlparse
-from libprs500 import browser
+from libprs500 import browser, __appname__
 from libprs500.ebooks.BeautifulSoup import BeautifulSoup
+from libprs500.ebooks.metadata.opf import OPFCreator
+from libprs500.ebooks.metadata import MetaInformation
 from libprs500.web.feeds import feed_from_xml, templates
 from libprs500.web.fetch.simple import option_parser as web2disk_option_parser
 from libprs500.web.fetch.simple import RecursiveFetcher
@@ -35,7 +37,10 @@ class BasicNewsRecipe(object):
     #: The title to use for the ebook
     #: @type: string
-    title = 'Unknown News Source'
+    title = _('Unknown News Source')
+    #: The author of this recipe
+    __author__ = _('Unknown')
     #: Maximum number of articles to download from each feed
     #: @type: integer
@@ -55,17 +60,18 @@ class BasicNewsRecipe(object):
     delay = 0
     #: Number of simultaneous downloads. Set to 1 if the server is picky.
+    #: Automatically reduced to 1 if L{delay} > 0
     #: @type: integer
     simultaneous_downloads = 5
     #: Timeout for fetching files from server in seconds
     #: @type: integer
-    timeout = 10
+    timeout = 120
     #: The format string for the date shown on the first page
     #: By default: Day Name Day Number Month Name Year
     #: @type: string
-    timefmt = ' %a, %d %b %Y'
+    timefmt = ' [%a, %d %b %Y]'
     #: Max number of characters in the short description.
     #: @type: integer
@@ -102,7 +108,7 @@ class BasicNewsRecipe(object):
     #: List of options to pass to html2lrf, to customize generation of LRF ebooks.
     #: @type: list of strings
-    html2lrf_options = []
+    html2lrf_options = ['--page-break-before', '$']
     #: List of tags to be removed. Specified tags are removed from downloaded HTML.
     #: A tag is specified as a dictionary of the form::
@@ -114,9 +120,23 @@ class BasicNewsRecipe(object):
     #: U{http://www.crummy.com/software/BeautifulSoup/documentation.html#The basic find method: findAll(name, attrs, recursive, text, limit, **kwargs)}
     #: A common example::
     #: remove_tags = [dict(name='div', attrs={'class':'advert'})]
     #: This will remove all <div class="advert"> tags and all their children from the downloaded HTML.
+    #: @type: list
     remove_tags = []
+    #: Remove all tags that occur after the specified tag.
+    #: For the format for specifying a tag see L{remove_tags}.
+    #: For example, C{remove_tags_after = [dict(id='content')]} will remove all
+    #: tags after the element with id C{content}.
+    remove_tags_after = None
+    #: Keep only the specified tags and their children.
+    #: For the format for specifying tags see L{remove_tags}.
+    #: If this list is not empty, then the <body> element will be emptied and re-filled with
+    #: the tags that match the entries in this list.
+    #: @type: list
+    keep_only_tags = []
     #: List of regexp substitution rules to run on the downloaded HTML. Each element of the
     #: list should be a two element tuple. The first element of the tuple should
     #: be a compiled regular expression and the second a callable that takes
@@ -126,6 +146,13 @@ class BasicNewsRecipe(object):
     # See the built-in profiles for examples of these settings.
+    def get_cover_url(self):
+        '''
+        Return a URL to the cover image for this issue or None.
+        @rtype: string or None
+        '''
+        return getattr(self, 'cover_url', None)
     def get_feeds(self):
         '''
         Return a list of RSS feeds to fetch for this profile. Each element of the list
@@ -156,7 +183,21 @@ class BasicNewsRecipe(object):
     def preprocess_html(self, soup):
         '''
-        This function is called with the source of each downloaded HTML file.
+        This function is called with the source of each downloaded HTML file, before
+        it is parsed for links and images.
+        It can be used to do arbitrarily powerful pre-processing on the HTML.
+        @param soup: A U{BeautifulSoup<http://www.crummy.com/software/BeautifulSoup/documentation.html>}
+            instance containing the downloaded HTML.
+        @type soup: A U{BeautifulSoup<http://www.crummy.com/software/BeautifulSoup/documentation.html>} instance
+        @return: It must return soup (after having done any needed preprocessing)
+        @rtype: A U{BeautifulSoup<http://www.crummy.com/software/BeautifulSoup/documentation.html>} instance
+        '''
+        return soup
+    def postprocess_html(self, soup):
+        '''
+        This function is called with the source of each downloaded HTML file, after
+        it is parsed for links and images.
         It can be used to do arbitrarily powerful pre-processing on the HTML.
         @param soup: A U{BeautifulSoup<http://www.crummy.com/software/BeautifulSoup/documentation.html>}
             instance containing the downloaded HTML.
@@ -210,6 +251,7 @@ class BasicNewsRecipe(object):
         self.browser = self.get_browser()
         self.image_map, self.image_counter = {}, 1
+        self.css_map = {}
         web2disk_cmdline = [ 'web2disk',
             '--timeout', str(self.timeout),
@@ -233,14 +275,18 @@ class BasicNewsRecipe(object):
             web2disk_cmdline.extend(['--filter-regexp', reg])
         self.web2disk_options = web2disk_option_parser().parse_args(web2disk_cmdline)[0]
-        self.web2disk_options.remove_tags = self.remove_tags
-        self.web2disk_options.preprocess_regexps = self.preprocess_regexps
-        self.web2disk_options.preprocess_html = self.preprocess_html
+        for extra in ('keep_only_tags', 'remove_tags', 'preprocess_regexps',
+                      'preprocess_html', 'remove_tags_after', 'postprocess_html'):
+            setattr(self.web2disk_options, extra, getattr(self, extra))
         if self.delay > 0:
             self.simultaneous_downloads = 1
         self.navbar = templates.NavBarTemplate()
+        self.max_articles_per_feed -= 1
+        self.html2lrf_options.append('--use-spine')
+        self.failed_downloads = []
+        self.partial_failures = []
     def download(self):
         '''
@@ -250,9 +296,26 @@ class BasicNewsRecipe(object):
         @return: Path to index.html
         @rtype: string
         '''
-        self.report_progress(0, _('Initialized'))
+        self.report_progress(0, _('Trying to download cover...'))
+        self.download_cover()
         res = self.build_index()
         self.cleanup()
+        self.report_progress(1, _('Download finished'))
+        if self.failed_downloads:
+            self.logger.warning(_('Failed to download the following articles:'))
+            for feed, article, debug in self.failed_downloads:
+                self.logger.warning(article.title+_(' from ')+feed.title)
+                self.logger.debug(article.url)
+                self.logger.debug(debug)
+        if self.partial_failures:
+            self.logger.warning(_('Failed to download parts of the following articles:'))
+            for feed, atitle, aurl, debug in self.partial_failures:
+                self.logger.warning(atitle + _(' from ') + feed)
+                self.logger.debug(aurl)
+                self.logger.warning(_('\tFailed links:'))
+                for l, tb in debug:
+                    self.logger.warning(l)
+                    self.logger.debug(tb)
         return res
     def feeds2index(self, feeds):
@@ -294,11 +357,14 @@ class BasicNewsRecipe(object):
         return logger, out
     def fetch_article(self, url, dir, logger):
-        fetcher = RecursiveFetcher(self.web2disk_options, logger, self.image_map)
+        fetcher = RecursiveFetcher(self.web2disk_options, logger, self.image_map, self.css_map)
         fetcher.base_dir = dir
         fetcher.current_dir = dir
         fetcher.show_progress = False
-        return fetcher.start_fetch(url)
+        res, path, failures = fetcher.start_fetch(url), fetcher.downloaded_paths, fetcher.failed_links
+        if not res:
+            raise Exception(_('Could not fetch article. Run with --debug to see the reason'))
+        return res, path, failures
     def build_index(self):
         self.report_progress(0, _('Fetching feeds...'))
@@ -331,58 +397,111 @@
                 req.stream = stream
                 req.feed = feed
                 req.article = article
+                req.feed_dir = feed_dir
                 self.jobs.append(req)
         self.jobs_done = 0
         tp = ThreadPool(self.simultaneous_downloads)
         for req in self.jobs:
             tp.putRequest(req, block=True, timeout=0)
         self.report_progress(0, _('Starting download [%d thread(s)]...')%self.simultaneous_downloads)
         while True:
             try:
-                tp.poll(True)
+                tp.poll()
                 time.sleep(0.1)
             except NoResultsPending:
                 break
-        html = self.feed2index(feed)
-        open(os.path.join(feed_dir, 'index.html'), 'wb').write(html)
+        for f, feed in enumerate(feeds):
+            html = self.feed2index(feed)
+            feed_dir = os.path.join(self.output_dir, 'feed_%d'%f)
+            open(os.path.join(feed_dir, 'index.html'), 'wb').write(html)
+        self.create_opf(feeds)
         self.report_progress(1, _('Feeds downloaded to %s')%index)
         return index
+    def download_cover(self):
+        self.cover_path = None
+        try:
+            cu = self.get_cover_url()
+        except Exception, err:
+            cu = None
+            self.logger.error(_('Could not download cover: %s')%str(err))
+            self.logger.debug(traceback.format_exc())
+        if cu is not None:
+            ext = cu.rpartition('.')[-1]
+            ext = ext.lower() if ext else 'jpg'
+            self.report_progress(1, _('Downloading cover from %s')%cu)
+            cpath = os.path.join(self.output_dir, 'cover.'+ext)
+            cfile = open(cpath, 'wb')
+            cfile.write(self.browser.open(cu).read())
+            self.cover_path = cpath
+    def create_opf(self, feeds, dir=None):
+        if dir is None:
+            dir = self.output_dir
+        mi = MetaInformation(self.title + time.strftime(self.timefmt), [__appname__])
+        opf = OPFCreator(mi)
+        opf_path = os.path.join(dir, 'index.opf')
+        cpath = getattr(self, 'cover_path', None)
+        if cpath is not None and os.access(cpath, os.R_OK):
+            opf.cover = cpath
+        entries = ['index.html']
+        for i, f in enumerate(feeds):
+            entries.append('feed_%d/index.html'%i)
+            for j, a in enumerate(f):
+                if getattr(a, 'downloaded', False):
+                    adir = 'feed_%d/article_%d/'%(i, j)
+                    entries.append('%sindex.html'%adir)
+                    for sp in a.sub_pages:
+                        prefix = os.path.commonprefix([opf_path, sp])
+                        relp = sp[len(prefix):]
+                        entries.append(relp.replace(os.sep, '/'))
+        opf.create_manifest(itertools.izip(entries, itertools.repeat('text/html')))
+        opf.create_spine(entries)
+        opf.write(open(opf_path, 'wb'))
     def article_downloaded(self, request, result):
-        index = os.path.join(os.path.dirname(result), 'index.html')
-        os.rename(result, index)
+        index = os.path.join(os.path.dirname(result[0]), 'index.html')
+        os.rename(result[0], index)
         src = open(index, 'rb').read().decode('utf-8')
         f, a = request.requestID
         soup = BeautifulSoup(src)
         body = soup.find('body')
         if body is not None:
             top = self.navbar.generate(False, f, a, len(request.feed), not self.has_single_feed).render(doctype='xhtml')
+            bottom = self.navbar.generate(True, f, a, len(request.feed), not self.has_single_feed).render(doctype='xhtml')
             top = BeautifulSoup(top).find('div')
+            bottom = BeautifulSoup(bottom).find('div')
             body.insert(0, top)
+            body.insert(len(body.contents), bottom)
             open(index, 'wb').write(unicode(soup).encode('utf-8'))
         article = request.article
-        self.logger.debug(_('\nDownloaded article %s from %s\n%s')%(article.title, article.url, request.stream.getvalue()))
+        self.logger.debug(_('\nDownloaded article %s from %s\n%s')%(article.title, article.url, request.stream.getvalue().decode('utf-8', 'ignore')))
-        article.url = result
+        article.url = result[0]
         article.downloaded = True
+        article.sub_pages = result[1][1:]
         self.jobs_done += 1
         self.report_progress(float(self.jobs_done)/len(self.jobs), _('Article downloaded: %s')%article.title)
+        if result[2]:
+            self.partial_failures.append((request.feed.title, article.title, article.url, result[2]))
-    def error_in_article_download(self, request, exc_info):
+    def error_in_article_download(self, request, traceback):
         self.jobs_done += 1
-        self.logger.error(_('Failed to download article: %s from %s')%(request.article.title, request.article.url))
+        self.logger.error(_('Failed to download article: %s from %s\n')%(request.article.title, request.article.url))
-        self.logger.debug(traceback.format_exc(*exc_info))
+        debug = request.stream.getvalue().decode('utf-8', 'ignore')
-        self.logger.debug(request.stream.getvalue())
+        self.logger.debug(debug)
+        self.logger.debug(traceback)
         self.logger.debug('\n')
         self.report_progress(float(self.jobs_done)/len(self.jobs), _('Article download failed: %s')%request.article.title)
+        self.failed_downloads.append((request.feed.title, request.article, debug))
     def parse_feeds(self):
         '''
@@ -404,5 +523,3 @@ class BasicNewsRecipe(object):
                                         max_articles_per_feed=self.max_articles_per_feed))
         return parsed_feeds
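
As a usage illustration of the attributes documented above, a minimal custom recipe might look like the sketch below; the site name, feed URL, tag ids and cover URL are placeholders, not a real profile:

    from libprs500.web.feeds.news import BasicNewsRecipe

    class ExampleNews(BasicNewsRecipe):
        title          = 'Example News'                          # used for the ebook title
        feeds          = [('Front Page', 'http://example.com/rss.xml')]
        keep_only_tags = [dict(name='div', id='content')]        # empty <body>, keep only these
        remove_tags    = [dict(name='div', attrs={'class':'advert'})]
        cover_url      = 'http://example.com/cover.jpg'          # returned by get_cover_url()

        def postprocess_html(self, soup):
            # runs after links and images have been processed; must return soup
            return soup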

View File

@@ -0,0 +1,79 @@
#!/usr/bin/env python
## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''
Builtin recipes.
'''
recipes = ['newsweek']
import re
from libprs500.web.feeds.news import BasicNewsRecipe
from libprs500.ebooks.lrf.web.profiles import DefaultProfile, FullContentProfile
from libprs500.ebooks.lrf.web import available_profiles
basic_recipes = (BasicNewsRecipe, DefaultProfile, FullContentProfile)
basic_recipe_names = (i.__name__ for i in basic_recipes)
#: Compiled builtin recipe/profile classes
def load_recipe(module, package='libprs500.web.feeds.recipes'):
module = __import__(package+'.'+module, fromlist=[''])
for attr in dir(module):
obj = getattr(module, attr)
if type(obj) is not type:
continue
recipe = False
for b in obj.__bases__:
if b in basic_recipes:
recipe = True
break
if not recipe:
continue
if obj not in basic_recipes:
return obj
recipes = [load_recipe(i) for i in recipes]
def compile_recipe(src):
'''
Compile the code in src and return the first object that is a recipe or profile.
@return: Recipe/Profile class or None, if no such class was found in C{src}
'''
locals = {}
exec src in globals(), locals
for obj in locals.values():
if type(obj) is type and obj.__name__ not in basic_recipe_names:
for base in obj.__bases__:
if base in basic_recipes:
return obj
return None
def get_builtin_recipe(title):
'''
Return a builtin recipe/profile class whoose title == C{title} or None if no such
recipe exists.
@type title: string
@rtype: class or None
'''
for r in recipes:
if r.title == title:
return r
titles = set([r.title for r in recipes])
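
For illustration, compile_recipe() is the mechanism feeds2disk uses when ARG is a recipe file or a literal recipe string; the sample source below is the old builtin test recipe string from the removed web/recipes.py:

    from libprs500.web.feeds.recipes import compile_recipe, get_builtin_recipe

    src = 'class Temp(BasicNewsRecipe):\n\ttitle="temp"'
    recipe_class = compile_recipe(src)      # None if src defines no recipe class
    print recipe_class.title                # temp
    print get_builtin_recipe('Newsweek')    # the Newsweek class added below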

View File

@@ -0,0 +1,90 @@
#!/usr/bin/env python
## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
import re
from libprs500.web.feeds.news import BasicNewsRecipe
from libprs500.ebooks.BeautifulSoup import BeautifulSoup
class Newsweek(BasicNewsRecipe):
title = 'Newsweek'
__author__ = 'Kovid Goyal'
feeds = [
('Top News', 'http://feeds.newsweek.com/newsweek/TopNews',),
'http://feeds.newsweek.com/newsweek/columnists/StevenLevy',
('Politics', 'http://feeds.newsweek.com/headlines/politics'),
('Health', 'http://feeds.newsweek.com/headlines/health'),
('Business', 'http://feeds.newsweek.com/headlines/business'),
('Science and Technology', 'http://feeds.newsweek.com/headlines/technology/science'),
('National News', 'http://feeds.newsweek.com/newsweek/NationalNews'),
('World News', 'http://feeds.newsweek.com/newsweek/WorldNews'),
'http://feeds.newsweek.com/newsweek/Columnists/ChristopherDickey',
'http://feeds.newsweek.com/newsweek/Columnists/FareedZakaria',
('Iraq', 'http://feeds.newsweek.com/newsweek/iraq'),
('Society', 'http://feeds.newsweek.com/newsweek/society'),
('Entertainment', 'http://feeds.newsweek.com/newsweek/entertainment'),
'http://feeds.newsweek.com/newsweek/columnists/GeorgeFWill',
'http://feeds.newsweek.com/newsweek/columnists/AnnaQuindlen',
]
# For testing
feeds = feeds[:2]
max_articles_per_feed = 1
keep_only_tags = [dict(name='div', id='content')]
remove_tags = [
dict(name=['script', 'noscript']),
dict(name='div', attrs={'class':['ad', 'SocialLinks', 'SocialLinksDiv', 'channel', 'bot', 'nav', 'top', 'EmailArticleBlock']}),
dict(name='div', attrs={'class':re.compile('box')}),
dict(id=['ToolBox', 'EmailMain', 'EmailArticle', ])
]
recursions = 1
match_regexps = [r'http://www.newsweek.com/id/\S+/page/\d+']
def postprocess_html(self, soup):
divs = list(soup.findAll('div', 'pagination'))
divs[0].extract()
if len(divs) > 1:
soup.find('body')['style'] = 'page-break-after:avoid'
divs[1].extract()
h1 = soup.find('h1')
if h1:
h1.extract()
ai = soup.find('div', 'articleInfo')
ai.extract()
else:
soup.find('body')['style'] = 'page-break-before:always; page-break-after:avoid;'
return soup
def get_current_issue(self):
from urllib2 import urlopen # For some reason mechanize fails
home = urlopen('http://www.newsweek.com').read()
soup = BeautifulSoup(home)
img = soup.find('img', alt='Current Magazine')
if img and img.parent.has_key('href'):
return urlopen(img.parent['href']).read()
def get_cover_url(self):
ci = self.get_current_issue()
if ci is not None:
soup = BeautifulSoup(ci)
img = soup.find(alt='Cover')
if img is not None and img.has_key('src'):
small = img['src']
return small.replace('coversmall', 'coverlarge')
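
A hedged sketch of driving just this recipe from Python, mirroring what main() in feeds/main.py does; the default options and the notification callback signature are assumptions based on the report_progress() calls in news.py:

    from libprs500.web.feeds.main import option_parser
    from libprs500.web.feeds.recipes import get_builtin_recipe

    parser = option_parser()
    opts = parser.parse_args([])[0]                     # default web2disk/feeds2disk options
    recipe = get_builtin_recipe('Newsweek')(opts, parser, lambda fraction, msg: None)
    print recipe.download()                             # presumably the path to index.html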

View File

@@ -104,7 +104,7 @@ class IndexTemplate(Template):
                 <p style="text-align:right">${datetime.now().strftime(datefmt)}</p>
                 <ul>
                 <py:for each="i, feed in enumerate(feeds)">
-                    <li id="feed_${str(i)}">
+                    <li py:if="feed" id="feed_${str(i)}">
                         <a class="feed" href="${'feed_%d/index.html'%i}">${feed.title}</a>
                     </li>
                 </py:for>
@@ -136,7 +136,7 @@ class FeedTemplate(Template):
                 ${style}
                 </style>
             </head>
-            <body>
+            <body style="page-break-before:always">
                 <h2>${feed.title}</h2>
                 <py:if test="feed.image">
                     <div class="feed_image">
@@ -144,7 +144,7 @@ class FeedTemplate(Template):
                     </div>
                 </py:if>
                 <ul>
-                <py:for each="i, article in enumerate(feed)">
+                <py:for each="i, article in enumerate(feed.articles)">
                     <li id="${'article_%d'%i}" py:if="getattr(article, 'downloaded', False)">
                         <a class="article" href="${article.url}">${article.title}</a>
                         <span class="article_date">${article.localtime.strftime(" [%a, %d %b %H:%M]")}</span>

View File

@@ -17,12 +17,12 @@ Fetch a webpage and its links recursively. The webpages are saved to disk in
 UTF-8 encoding with any charset declarations removed.
 '''
 from __future__ import with_statement
-import sys, socket, os, urlparse, codecs, logging, re, time, copy, urllib2, threading
+import sys, socket, os, urlparse, codecs, logging, re, time, copy, urllib2, threading, traceback
 from urllib import url2pathname
 from httplib import responses
 from libprs500 import setup_cli_handlers, browser, sanitize_file_name, OptionParser
-from libprs500.ebooks.BeautifulSoup import BeautifulSoup
+from libprs500.ebooks.BeautifulSoup import BeautifulSoup, Tag
 from libprs500.ebooks.chardet import xml_to_unicode
 class FetchError(Exception):
@@ -37,10 +37,11 @@ def basename(url):
     return res
 def save_soup(soup, target):
-    for meta in soup.findAll('meta', content=True):
+    nm = Tag(soup, '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
+    for meta in soup.find('meta', content=True):
         if 'charset' in meta['content']:
-            meta.extract()
-    f = codecs.open(target, 'w', 'utf8')
+            meta.replaceWith(nm)
+    f = codecs.open(target, 'w', 'utf-8')
     f.write(unicode(soup))
     f.close()
@@ -55,7 +56,7 @@ class RecursiveFetcher(object):
     #               )
     CSS_IMPORT_PATTERN = re.compile(r'\@import\s+url\((.*?)\)', re.IGNORECASE)
-    def __init__(self, options, logger, image_map={}):
+    def __init__(self, options, logger, image_map={}, css_map={}):
         self.logger = logger
         self.base_dir = os.path.abspath(os.path.expanduser(options.dir))
         if not os.path.exists(self.base_dir):
@@ -74,20 +75,44 @@ class RecursiveFetcher(object):
         self.filemap = {}
         self.imagemap = image_map
         self.imagemap_lock = threading.RLock()
-        self.stylemap = {}
+        self.stylemap = css_map
+        self.stylemap_lock = threading.RLock()
+        self.downloaded_paths = []
         self.current_dir = self.base_dir
         self.files = 0
         self.preprocess_regexps = getattr(options, 'preprocess_regexps', [])
         self.remove_tags = getattr(options, 'remove_tags', [])
+        self.remove_tags_after = getattr(options, 'remove_tags_after', None)
+        self.keep_only_tags = getattr(options, 'keep_only_tags', [])
         self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup)
+        self.postprocess_html_ext= getattr(options, 'postprocess_html', lambda soup: soup)
         self.download_stylesheets = not options.no_stylesheets
         self.show_progress = True
+        self.failed_links = []
     def get_soup(self, src):
         nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
         nmassage.extend(self.preprocess_regexps)
         soup = BeautifulSoup(xml_to_unicode(src, self.verbose)[0], markupMassage=nmassage)
+        if self.keep_only_tags:
+            body = Tag(soup, 'body')
+            for spec in self.keep_only_tags:
+                for tag in soup.find('body').findAll(**spec):
+                    body.insert(len(body.contents), tag)
+            soup.find('body').replaceWith(body)
+        if self.remove_tags_after is not None:
+            tag = soup.find(**self.remove_tags_after)
+            while tag is not None and tag.name != 'body':
+                after = tag.nextSibling
+                while after is not None:
+                    ns = after.nextSibling
+                    after.extract()
+                    after = ns
+                tag = tag.parent
         for kwds in self.remove_tags:
             for tag in soup.findAll(**kwds):
                 tag.extract()
@@ -105,7 +130,12 @@ class RecursiveFetcher(object):
         except urllib2.URLError, err:
             if hasattr(err, 'code') and responses.has_key(err.code):
                 raise FetchError, responses[err.code]
-            raise err
+            if err.reason[0] == 104: # Connection reset by peer
+                self.logger.debug('Connection reset by peer retrying in 1 second.')
+                time.sleep(1)
+                f = self.browser.open(url)
+            else:
+                raise err
         finally:
             self.last_fetch_at = time.time()
         return f
@@ -146,9 +176,10 @@ class RecursiveFetcher(object):
                 iurl = tag['href']
                 if not urlparse.urlsplit(iurl).scheme:
                     iurl = urlparse.urljoin(baseurl, iurl, False)
-                if self.stylemap.has_key(iurl):
-                    tag['href'] = self.stylemap[iurl]
-                    continue
+                with self.stylemap_lock:
+                    if self.stylemap.has_key(iurl):
+                        tag['href'] = self.stylemap[iurl]
+                        continue
                 try:
                     f = self.fetch_url(iurl)
                 except Exception, err:
@@ -157,7 +188,8 @@ class RecursiveFetcher(object):
                     continue
                 c += 1
                 stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
-                self.stylemap[iurl] = stylepath
+                with self.stylemap_lock:
+                    self.stylemap[iurl] = stylepath
                 open(stylepath, 'wb').write(f.read())
                 tag['href'] = stylepath
             else:
@@ -168,9 +200,10 @@ class RecursiveFetcher(object):
                     iurl = m.group(1)
                     if not urlparse.urlsplit(iurl).scheme:
                         iurl = urlparse.urljoin(baseurl, iurl, False)
-                    if self.stylemap.has_key(iurl):
-                        ns.replaceWith(src.replace(m.group(1), self.stylemap[iurl]))
-                        continue
+                    with self.stylemap_lock:
+                        if self.stylemap.has_key(iurl):
+                            ns.replaceWith(src.replace(m.group(1), self.stylemap[iurl]))
+                            continue
                     try:
                         f = self.fetch_url(iurl)
                     except Exception, err:
@@ -179,7 +212,8 @@ class RecursiveFetcher(object):
                         continue
                     c += 1
                     stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
-                    self.stylemap[iurl] = stylepath
+                    with self.stylemap_lock:
+                        self.stylemap[iurl] = stylepath
                     open(stylepath, 'wb').write(f.read())
                     ns.replaceWith(src.replace(m.group(1), stylepath))
@@ -214,7 +248,7 @@ class RecursiveFetcher(object):
                     open(imgpath, 'wb').write(f.read())
                     tag['src'] = imgpath
-    def absurl(self, baseurl, tag, key):
+    def absurl(self, baseurl, tag, key, filter=True):
         iurl = tag[key]
         parts = urlparse.urlsplit(iurl)
         if not parts.netloc and not parts.path:
@@ -224,7 +258,7 @@ class RecursiveFetcher(object):
         if not self.is_link_ok(iurl):
             self.logger.debug('Skipping invalid link: %s', iurl)
             return None
-        if not self.is_link_wanted(iurl):
+        if filter and not self.is_link_wanted(iurl):
             self.logger.debug('Filtered link: '+iurl)
             return None
         return iurl
@@ -256,12 +290,12 @@ class RecursiveFetcher(object):
         prev_dir = self.current_dir
         try:
             self.current_dir = diskpath
-            for tag in soup.findAll(lambda tag: tag.name.lower()=='a' and tag.has_key('href')):
+            for tag in soup.findAll('a', href=True):
                 if self.show_progress:
                     print '.',
                     sys.stdout.flush()
                 sys.stdout.flush()
-                iurl = self.absurl(baseurl, tag, 'href')
+                iurl = self.absurl(baseurl, tag, 'href', filter=recursion_level != 0)
                 if not iurl:
                     continue
                 nurl = self.normurl(iurl)
@@ -293,6 +327,7 @@ class RecursiveFetcher(object):
                         self.process_stylesheets(soup, f.geturl())
                     res = os.path.join(linkdiskpath, basename(iurl))
+                    self.downloaded_paths.append(res)
                     self.filemap[nurl] = res
                     if recursion_level < self.max_recursions:
                         self.logger.debug('Processing links...')
@@ -301,9 +336,11 @@ class RecursiveFetcher(object):
                             self.process_return_links(soup, iurl)
                             self.logger.debug('Recursion limit reached. Skipping links in %s', iurl)
-                    save_soup(soup, res)
+                    save_soup(self.postprocess_html_ext(soup), res)
                     self.localize_link(tag, 'href', res)
                 except Exception, err:
+                    self.failed_links.append((iurl, traceback.format_exc()))
                     self.logger.warning('Could not fetch link %s', iurl)
                     self.logger.debug('Error: %s', str(err), exc_info=True)
                 finally:
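
The keep_only_tags and remove_tags specs handled in get_soup() above are passed straight to BeautifulSoup's findAll(), so a spec behaves as in this standalone sketch with inline placeholder HTML:

    from libprs500.ebooks.BeautifulSoup import BeautifulSoup

    soup = BeautifulSoup('<body><div id="content">keep</div><div class="ad">drop</div></body>')
    print soup.findAll(**dict(name='div', id='content'))            # the div a recipe keeps
    print soup.findAll(**dict(name='div', attrs={'class':'ad'}))    # the div remove_tags extracts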

View File

@ -1,63 +0,0 @@
#!/usr/bin/env python
## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''
Contains recipes for various common news sources and websites.
'''
import re
from libprs500.web.feeds.news import BasicNewsRecipe
_basic_recipes = (BasicNewsRecipe,)
_basic_recipe_names = (i.__name__ for i in _basic_recipes)
def compile_recipe(src):
'''
Compile the code in src and return the first object that is
'''
locals = {}
exec src in globals(), locals
for obj in locals.values():
if type(obj) is type and obj.__name__ not in _basic_recipe_names:
for base in obj.__bases__:
if base in _basic_recipes:
return obj
return None
def get_feed(title):
'''
Return a builtin recipe class whoose title == C{title} or None if no such
recipe exists.
@type title: string
@rtype: class or None
'''
if isinstance(_feeds[0], basestring):
for i, val in enumerate(_feeds):
recipe = compile_recipe(val)
if recipe is None:
raise RuntimeError('The builtin Recipe #%d is invalid.'%i)
_feeds[i] = recipe
for recipe in _feeds:
if recipe.title == title:
return recipe
return None
#: Recipes to be used with feeds2disk
_feeds = ['class Temp(BasicNewsRecipe):\n\ttitle="temp"']