Add support for user profiles in web2lrf

Kovid Goyal 2007-10-27 00:34:58 +00:00
parent 275b59a2e7
commit 8799a6f3f2
11 changed files with 500 additions and 535 deletions

View File

@@ -13,134 +13,3 @@
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
import os, time, calendar, operator, re
from libprs500 import iswindows
from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup
from htmlentitydefs import name2codepoint
DAY_MAP = dict(Sun=0, Mon=1, Tue=2, Wed=3, Thu=4, Fri=5, Sat=6)
MONTH_MAP = dict(Jan=1, Feb=2, Mar=3, Apr=4, May=5, Jun=6, Jul=7, Aug=8, Sep=9, Oct=10, Nov=11, Dec=12)
FULL_MONTH_MAP = dict(January=1, February=2, March=3, April=4, May=5, June=6,
July=7, August=8, September=9, October=10,
November=11, December=12)
def strptime(src):
src = src.strip().split()
src[0] = str(DAY_MAP[src[0][:-1]])+','
try:
src[2] = str(MONTH_MAP[src[2]])
except KeyError:
src[2] = str(FULL_MONTH_MAP[src[2]])
return time.strptime(' '.join(src), '%w, %d %m %Y %H:%M:%S %Z')
def process_html_description(tag):
src = '\n'.join(tag.contents)
replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo' ]
for e in replaced_entities:
ent = '&'+e+';'
src = src.replace(ent, unichr(name2codepoint[e]))
return re.compile(r'<a.*?</a>', re.IGNORECASE|re.DOTALL).sub('', src)
def parse_feeds(feeds, browser, print_version,
max_articles_per_feed=10,
html_description=False,
oldest_article=7):
'''
@param print_version: Callable that takes a url string and returns the url to
printable version of the article pointed to by the original url.
@param max_articles_per_feed: Maximum number of articles to download from each feed
@param html_description: If True the articles' descriptions are processed as HTML
@param oldest_article: A number in days. No articles older than now - oldest_article
will be downloaded.
'''
articles = {}
for title, url in feeds:
try:
src = browser.open(url).read()
except Exception, err:
print 'Could not fetch feed: %s\nError: %s'%(url, err)
continue
articles[title] = []
soup = BeautifulStoneSoup(src)
for item in soup.findAll('item'):
try:
pubdate = item.find('pubdate').string
if not pubdate:
continue
pubdate = pubdate.replace('+0000', 'GMT')
d = {
'title' : item.find('title').string,
'url' : print_version(item.find('guid').string),
'timestamp': calendar.timegm(strptime(pubdate)),
'date' : pubdate
}
delta = time.time() - d['timestamp']
if delta > oldest_article*3600*24:
continue
except Exception, err:
continue
try:
desc = item.find('description')
d['description'] = process_html_description(desc) if html_description else desc.string
except:
d['description'] = ''
articles[title].append(d)
articles[title].sort(key=operator.itemgetter('timestamp'), reverse=True)
articles[title][max_articles_per_feed:] = []
for item in articles[title]:
item.pop('timestamp')
if not articles[title]:
articles.pop(title)
return articles
def build_index(title, articles, dir):
'''Build an RSS based index.html'''
def build_sub_index(title, items):
ilist = ''
li = u'<li><a href="%(url)s">%(title)s</a> <span style="font-size: x-small">[%(date)s]</span><br/>\n'+\
u'<div style="font-size:small; font-family:sans">%(description)s<br /></div></li>\n'
for item in items:
ilist += li%item
return u'''\
<html>
<body>
<h2>%(title)s</h2>
<ul>
%(items)s
</ul>
</body>
</html>
'''%dict(title=title, items=ilist.rstrip())
cnum = 0
clist = ''
categories = articles.keys()
categories.sort()
for category in categories:
cnum += 1
cfile = os.path.join(dir, 'category'+str(cnum)+'.html')
prefix = 'file:' if iswindows else ''
clist += u'<li><a href="%s">%s</a></li>\n'%(prefix+cfile, category)
src = build_sub_index(category, articles[category])
open(cfile, 'wb').write(src.encode('utf-8'))
src = '''\
<html>
<body>
<h1>%(title)s</h1>
<div style='text-align: right; font-weight: bold'>%(date)s</div>
<ul>
%(categories)s
</ul>
</body>
</html>
'''%dict(date=time.strftime('%a, %d %B, %Y', time.localtime()),
categories=clist, title=title)
index = os.path.join(dir, 'index.html')
open(index, 'wb').write(src.encode('utf-8'))
return index
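
A rough usage sketch of the helpers above (which this commit folds into the new DefaultProfile class); the feed title, URL and print_version rewrite are illustrative only:

import tempfile
from libprs500 import browser
from libprs500.ebooks.lrf.web import parse_feeds, build_index

br = browser()
feeds = [('Front Page', 'http://example.com/rss.xml')]   # (title, url) pairs
articles = parse_feeds(feeds, br, lambda url: url + '?print=yes',
                       max_articles_per_feed=5, oldest_article=2)
# articles maps each feed title to a list of dicts with 'title', 'url',
# 'date' and 'description' keys, newest first
index = build_index('Example Site', articles, tempfile.mkdtemp())
print index    # path to the generated index.html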

View File

@@ -1,53 +0,0 @@
## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
import tempfile, shutil, os
from libprs500.ebooks.lrf.web import build_index, parse_feeds
RSS = 'http://news.bbc.co.uk/1/hi/help/3223484.stm'
from libprs500 import __appname__, iswindows, browser
from libprs500.ebooks.BeautifulSoup import BeautifulSoup
def get_feeds(browser):
src = browser.open(RSS).read()
soup = BeautifulSoup(src[src.index('<html'):])
feeds = []
ul = soup.find('ul', attrs={'class':'rss'})
for link in ul.findAll('a'):
feeds.append((link.string, link['href']))
return feeds
def initialize(profile):
profile['temp dir'] = tempfile.mkdtemp(prefix=__appname__+'_')
profile['browser'] = browser()
feeds = get_feeds(profile['browser'])
articles = parse_feeds(feeds, profile['browser'], lambda x: x.replace('http://', 'http://newsvote.bbc.co.uk/mpapps/pagetools/print/'))
index = build_index('The BBC', articles, profile['temp dir'])
profile['url'] = 'file:'+ ('' if iswindows else '//') + index
profile['timefmt'] = ' [%a, %d %b, %Y]'
profile['max_recursions'] = 2
profile['title'] = 'The BBC'
profile['no_stylesheets'] = True
def finalize(profile):
if os.path.isdir(profile['temp dir']):
shutil.rmtree(profile['temp dir'])

View File

@@ -14,43 +14,48 @@
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''Convert known websites into LRF files.'''
import sys, time, tempfile, shutil, os, logging
import sys, time, tempfile, shutil, os, logging, imp, inspect
from urlparse import urlsplit
from libprs500 import __appname__, setup_cli_handlers, CommandLineError
from libprs500.ebooks.lrf import option_parser as lrf_option_parser
from libprs500.ebooks.lrf.html.convert_from import process_file
from libprs500.ebooks.lrf.web.profiles import profiles
from libprs500.web.fetch.simple import create_fetcher
available_profiles = profiles.keys()
available_profiles.remove('default')
available_profiles = ' '.join(available_profiles)
from libprs500.ebooks.lrf.web.profiles import DefaultProfile
from libprs500.ebooks.lrf.web.profiles.nytimes import NYTimes
from libprs500.ebooks.lrf.web.profiles.bbc import BBC
from libprs500.ebooks.lrf.web.profiles.newsweek import Newsweek
builtin_profiles = [NYTimes, BBC, Newsweek]
available_profiles = [i.__module__.rpartition('.')[2] for i in builtin_profiles]
def option_parser():
parser = lrf_option_parser(usage='''%prog [options] website_profile\n\n'''
'''%prog downloads a site from the web and converts it '''
'''into an LRF file for use with the SONY Reader. '''
'''website_profile is one of '''+available_profiles+\
'''website_profile is one of '''+str(available_profiles)+\
''' If you specify a website_profile of default or do not specify '''
'''it, you must specify the --url option.'''
)
parser.add_option('-u', '--url', dest='url', default=None,
help='The URL to download. You only need to specify this if you are not specifying a website_profile.')
parser.add_option('--user-profile', default=None,
help='Path to a python file containing a user created profile.')
parser.add_option('--username', dest='username', default=None,
help='Specify the username to be used while downloading. Only used if the profile supports it.')
parser.add_option('--password', dest='password', default=None,
help='Specify the password to be used while downloading. Only used if the profile supports it.')
parser.add_option('--timeout', help='Timeout in seconds to wait for a response from the server. Default: %(timeout)s s'%profiles['default'],
parser.add_option('--timeout', help='Timeout in seconds to wait for a response from the server. Default: %d s'%DefaultProfile.timeout,
default=None, type='int', dest='timeout')
parser.add_option('-r', '--max-recursions', help='Maximum number of levels to recurse i.e. depth of links to follow. Default %(max_recursions)s'%profiles['default'],
parser.add_option('-r', '--max-recursions', help='Maximum number of levels to recurse i.e. depth of links to follow. Default %d'%DefaultProfile.max_recursions,
default=None, type='int', dest='max_recursions')
parser.add_option('-n', '--max-files', default=None, type='int', dest='max_files',
help='The maximum number of files to download. This only applies to files from <a href> tags. Default is %(max_files)s'%profiles['default'])
help='The maximum number of files to download. This only applies to files from <a href> tags. Default is %d'%DefaultProfile.max_files)
parser.add_option('--delay', default=None, dest='delay', type='int',
help='Minimum interval in seconds between consecutive fetches. Default is %(delay)s s'%profiles['default'])
help='Minimum interval in seconds between consecutive fetches. Default is %d s'%DefaultProfile.delay)
parser.add_option('--dont-download-stylesheets', action='store_true', default=None,
help='Do not download CSS stylesheets.', dest='no_stylesheets')
@@ -85,45 +90,58 @@ def process_profile(args, options, logger=None):
level = logging.DEBUG if options.verbose else logging.INFO
logger = logging.getLogger('web2lrf')
setup_cli_handlers(logger, level)
index = -1
if options.user_profile is not None:
path = os.path.abspath(options.user_profile)
name = os.path.splitext(os.path.basename(path))[0]
res = imp.find_module(name, [os.path.dirname(path)])
module = imp.load_module(name, *res)
classes = inspect.getmembers(module,
lambda x : inspect.isclass(x) and issubclass(x, DefaultProfile)\
and x is not DefaultProfile)
if not classes:
raise CommandLineError('Invalid user profile '+path)
builtin_profiles.append(classes[0][1])
available_profiles.append(name)
if len(args) < 2:
args.append('')
args[1] = name
if len(args) == 2:
if not profiles.has_key(args[1]):
raise CommandLineError('Unknown profile: %s\nValid profiles: %s'%(args[1], profiles.keys()))
profile = profiles[args[1]] if len(args) == 2 else profiles['default']
profile['username'] = options.username
profile['password'] = options.password
if profile.has_key('initialize'):
profile['initialize'](profile)
if profile.has_key('browser'):
options.browser = profile['browser']
try:
index = available_profiles.index(args[1])
except ValueError:
raise CommandLineError('Unknown profile: %s\nValid profiles: %s'%(args[1], available_profiles))
profile = DefaultProfile if index == -1 else builtin_profiles[index]
profile = profile(options.username, options.password)
if profile.browser is not None:
options.browser = profile.browser
for opt in ('url', 'timeout', 'max_recursions', 'max_files', 'delay', 'no_stylesheets'):
val = getattr(options, opt)
if val is None:
setattr(options, opt, profile[opt])
setattr(options, opt, getattr(profile, opt))
if not options.url:
options.url = profile['url']
options.url = profile.url
if not options.url:
raise CommandLineError('You must specify the --url option or a profile from one of: %s'%(available_profiles,))
if not options.title:
title = profile['title']
title = profile.title
if not title:
title = urlsplit(options.url).netloc
options.title = title + time.strftime(profile['timefmt'], time.localtime())
options.title = title + time.strftime(profile.timefmt, time.localtime())
options.match_regexps += profile['match_regexps']
options.preprocess_regexps = profile['preprocess_regexps']
options.filter_regexps += profile['filter_regexps']
options.match_regexps += profile.match_regexps
options.preprocess_regexps = profile.preprocess_regexps
options.filter_regexps += profile.filter_regexps
if len(args) == 2 and args[1] != 'default':
options.anchor_ids = False
htmlfile, tdir = fetch_website(options, logger)
create_lrf(htmlfile, options, logger)
finally:
if profile.has_key('finalize'):
profile['finalize'](profile)
if tdir and os.path.isdir(tdir):
shutil.rmtree(tdir)
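
A minimal sketch of a user profile file that the new --user-profile option could load; process_profile imports the file and picks the first DefaultProfile subclass it finds. The module name, class name, feed URL and print rewrite below are all hypothetical:

# my_profile.py -- run with something like: web2lrf --user-profile my_profile.py
from libprs500.ebooks.lrf.web.profiles import DefaultProfile

class MySite(DefaultProfile):
    title = 'My Site'
    timefmt = ' [%d %b %Y]'
    max_recursions = 2

    def get_feeds(self):
        # Return (title, url) pairs; this URL is illustrative only
        return [('Front Page', 'http://example.com/rss.xml')]

    def print_version(self, url):
        # Hypothetical rewrite to the printer friendly version of an article
        return url + '?print=yes'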

View File

@@ -1,81 +0,0 @@
## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
import tempfile, shutil, os
from libprs500.ebooks.lrf.web import build_index, parse_feeds
RSS = 'http://economist.com/rss/'
TITLES = [
'The world this week',
'Letters',
'Briefings',
'Special reports',
'Britain',
'Europe',
'United States',
'The Americas',
'Middle East and Africa',
'Asia',
'International',
'Business',
'Finance and economics',
'Science and technology',
'Books and arts',
'Indicators'
]
from libprs500 import __appname__, iswindows, browser
from libprs500.ebooks.BeautifulSoup import BeautifulSoup
def print_version(url):
return url.replace('displaystory', 'PrinterFriendly').replace('&fsrc=RSS', '')
def get_feeds(browser):
src = browser.open(RSS).read()
soup = BeautifulSoup(src)
feeds = []
for ul in soup.findAll('ul'):
lis = ul.findAll('li')
try:
title, link = lis[0], lis[1]
except IndexError:
continue
title = title.string
if title:
title = title.strip()
if title not in TITLES:
continue
a = link.find('a')
feeds.append((title, a['href'].strip()))
return feeds
def initialize(profile):
profile['temp dir'] = tempfile.mkdtemp(prefix=__appname__+'_')
profile['browser'] = browser()
feeds = get_feeds(profile['browser'])
articles = parse_feeds(feeds, profile['browser'], print_version, max_articles_per_feed=20)
index = build_index('The Economist', articles, profile['temp dir'])
profile['url'] = 'file:'+ ('' if iswindows else '//') + index
profile['timefmt'] = ' [%d %b %Y]'
profile['max_recursions'] = 3
profile['title'] = 'The Economist'
profile.pop('browser') # Needed as for some reason using the same browser instance causes timeouts
def finalize(profile):
if os.path.isdir(profile['temp dir']):
shutil.rmtree(profile['temp dir'])

View File

@@ -1,73 +0,0 @@
## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''New York Times from RSS feeds.'''
import os, tempfile, shutil
from libprs500 import __appname__, iswindows, browser
from libprs500.ebooks.BeautifulSoup import BeautifulSoup
from libprs500.ebooks.lrf.web import build_index, parse_feeds
RSS = 'http://www.nytimes.com/services/xml/rss/index.html'
LOGIN = 'http://www.nytimes.com/auth/login'
def get_feeds(browser):
src = browser.open(RSS).read()
soup = BeautifulSoup(src[src.index('<html'):])
feeds = []
for link in soup.findAll('link', attrs={'type':'application/rss+xml'}):
if link['title'] not in ['NYTimes.com Homepage', 'Obituaries', 'Pogue\'s Posts',
'Dining & Wine', 'Home & Garden', 'Multimedia',
'Most E-mailed Articles',
'Automobiles', 'Fashion & Style', 'Television News',
'Education']:
feeds.append((link['title'], link['href']))
#else: print link['title']
return feeds
def initialize(profile):
profile['temp dir'] = tempfile.mkdtemp(prefix=__appname__+'_')
profile['browser'] = login(profile)
feeds = get_feeds(profile['browser'])
articles = parse_feeds(feeds, profile['browser'], lambda x: x + '?&pagewanted=print',
oldest_article=2)
index = build_index('The New York Times', articles, profile['temp dir'])
profile['url'] = 'file:'+ ('' if iswindows else '//') + index
profile['timefmt'] = ' [%a, %d %b, %Y]'
profile['max_recursions'] = 2
profile['title'] = 'The New York Times'
def finalize(profile):
if os.path.isdir(profile['temp dir']):
shutil.rmtree(profile['temp dir'])
def login(profile):
br = browser()
if profile['username'] and profile['password']:
br.open(LOGIN)
br.select_form(name='login')
br['USERID'] = profile['username']
br['PASSWORD'] = profile['password']
br.submit()
return br
if __name__ == '__main__':
feeds = get_feeds()
articles = parse_feeds(feeds)
print articles

View File

@@ -1,136 +0,0 @@
## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''Profiles for known websites.'''
import re
from libprs500.ebooks.lrf.web.newsweek import initialize as newsweek_initialize
from libprs500.ebooks.lrf.web.newsweek import finalize as newsweek_finalize
from libprs500.ebooks.lrf.web.nytimes import initialize as nytimes_initialize
from libprs500.ebooks.lrf.web.nytimes import finalize as nytimes_finalize
from libprs500.ebooks.lrf.web.bbc import initialize as bbc_initialize
from libprs500.ebooks.lrf.web.bbc import finalize as bbc_finalize
from libprs500.ebooks.lrf.web.economist import initialize as economist_initialize
from libprs500.ebooks.lrf.web.economist import finalize as economist_finalize
profiles = {
'default' : {
'url' : '', # The URL of the website
'title' : '', # The title to use for the LRF file
'max_recursions' : 1, # Number of levels of links to follow
'max_files' : 1000, # Maximum number of files to download
'delay' : 0, # Delay between consecutive downloads
'timeout' : 10, # Timeout for fetching files from server in seconds
'timefmt' : ' [%a %d %b %Y]',
'no_stylesheets' : False, # Download stylesheets
'match_regexps' : [], # List of regular expressions that determines which links to follow
'filter_regexps' : [], # List of regular expressions that determines which links to ignore
# Only one of match_regexps or filter_regexps should be defined
'html2lrf_options' : [], # List of options to pass to html2lrf
'preprocess_regexps': [], # List of regexp substitution rules to run on the downloaded HTML before running html2lrf
# See the profiles below for examples of these settings.
},
'nytimes' : {
'initialize' : nytimes_initialize,
'finalize' : nytimes_finalize,
'preprocess_regexps' :
[ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
[
# Remove header bar
(r'(<body.*?>).*?<h1', lambda match: match.group(1)+'<h1'),
(r'<div class="articleTools">.*></ul>', lambda match : ''),
# Remove footer bar
(r'<\!-- end \#article -->.*', lambda match : '</body></html>'),
(r'<div id="footer">.*', lambda match : '</body></html>'),
]
],
},
'bbc' : {
'initialize' : bbc_initialize,
'finalize' : bbc_finalize,
'preprocess_regexps' :
[ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
[
# Remove footer from individual stories
(r'<div class=.footer.>.*?Published',
lambda match : '<p></p><div class="footer">Published'),
# Add some style info in place of disabled stylesheet
(r'<link.*?type=.text/css.*?>', lambda match :
'''<style type="text/css">
.headline {font-size: x-large;}
.fact { padding-top: 10pt }
</style>'''),
]
],
},
'newsweek' : {
'initialize' : newsweek_initialize,
'finalize' : newsweek_finalize,
'preprocess_regexps' :
[ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
[
# Make fonts larger
(r'<style.*?\.copyright.*?</style>',
lambda match : \
'''<style type="text/css">'''
'''updateTime{font:small Arial;color:#000000;}'''
'''.credit{font:small Arial;color:#999999;}'''
'''.head{font:bold 18pt x-large;color:#CC0000;}'''
'''.abstract{font:14pt large Verdana;color:#000000;}'''
'''.title{font:bold;color:#000000;}'''
'''.source{font:bold small Verdana;color:#CC0000;}'''
'''.footerLink{font:bold Verdana;color:#000000;}'''
'''.caption{font: Verdana;color:#000000;}'''
'''.textBodyBlack, .copyright{font: Verdana;color:#000000;}'''
'''.copyright{font-style:italic;}'''
'''</style>'''
),
]
],
},
'economist' : {
'initialize' : economist_initialize,
'finalize' : economist_finalize,
'preprocess_regexps' :
[ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
[
# Remove advert
(r'<noscript.*?</noscript>', lambda match: ''),
]
],
},
}
for key in profiles.keys():
if key == 'default':
continue
newd = profiles['default'].copy()
newd.update(profiles[key])
profiles[key] = newd
def profile_to_command_line_options(profile):
args = []
args.append('--max-recursions='+str(profile['max_recursions']))
args.append('--delay='+str(profile['delay']))
for i in profile['match_regexps']:
args.append('--match-regexp="'+i+'"')
for i in profile['filter_regexps']:
args.append('--filter-regexp="'+i+'"')
return args

View File

@@ -0,0 +1,227 @@
## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''
The DefaultProfile class that built-in and user website profiles subclass.
'''
import tempfile, time, calendar, re, operator
from htmlentitydefs import name2codepoint
from libprs500 import __appname__, iswindows, browser
from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup
class DefaultProfile(object):
url = '' # The URL of the website
title = 'Default Profile' # The title to use for the LRF file
max_articles_per_feed = 10 # Maximum number of articles to download from each feed
html_description = False # If True process the <description> element of the feed as HTML
oldest_article = 7 # How many days old should the oldest article downloaded from the feeds be?
max_recursions = 1 # Number of levels of links to follow
max_files = 3000 # Maximum number of files to download
delay = 0 # Delay between consecutive downloads
timeout = 10 # Timeout for fetching files from server in seconds
timefmt = ' [%a %d %b %Y]' # The format of the date shown on the first page
no_stylesheets = False # Download stylesheets only if False
match_regexps = [] # List of regular expressions that determines which links to follow
filter_regexps = [] # List of regular expressions that determines which links to ignore
# Only one of match_regexps or filter_regexps should be defined
html2lrf_options = [] # List of options to pass to html2lrf
# List of regexp substitution rules to run on the downloaded HTML. Each element of the
# list should be a two element tuple. The first element of the tuple should
# be a compiled regular expression and the second a callable that takes
# a single match object and returns a string to replace the match.
preprocess_regexps = []
# See the built-in profiles for examples of these settings.
def get_feeds(self):
'''
Return a list of RSS feeds to fetch for this profile. Each element of the list
must be a 2-element tuple of the form (title, url).
'''
raise NotImplementedError
@classmethod
def print_version(cls, url):
'''
Take a URL pointing to an article and return the URL pointing to the
print version of the article.
'''
return url
@classmethod
def get_browser(cls):
'''
Return a browser instance used to fetch documents from the web.
If your profile requires that you login first, override this method
in your subclass. See for example the nytimes profile.
'''
return browser()
########################################################################
###################### End of customizable portion #####################
########################################################################
def __init__(self, username=None, password=None):
self.username = username
self.password = password
self.temp_dir = tempfile.mkdtemp(prefix=__appname__+'_')
self.browser = self.get_browser()
self.url = 'file:'+ ('' if iswindows else '//') + self.build_index()
def __del__(self):
import os, shutil
if os.path.isdir(self.temp_dir):
shutil.rmtree(self.temp_dir)
def build_index(self):
'''Build an RSS based index.html'''
import os
articles = self.parse_feeds()
def build_sub_index(title, items):
ilist = ''
li = u'<li><a href="%(url)s">%(title)s</a> <span style="font-size: x-small">[%(date)s]</span><br/>\n'+\
u'<div style="font-size:small; font-family:sans">%(description)s<br /></div></li>\n'
for item in items:
ilist += li%item
return u'''\
<html>
<body>
<h2>%(title)s</h2>
<ul>
%(items)s
</ul>
</body>
</html>
'''%dict(title=title, items=ilist.rstrip())
cnum = 0
clist = ''
categories = articles.keys()
categories.sort()
for category in categories:
cnum += 1
cfile = os.path.join(self.temp_dir, 'category'+str(cnum)+'.html')
prefix = 'file:' if iswindows else ''
clist += u'<li><a href="%s">%s</a></li>\n'%(prefix+cfile, category)
src = build_sub_index(category, articles[category])
open(cfile, 'wb').write(src.encode('utf-8'))
src = '''\
<html>
<body>
<h1>%(title)s</h1>
<div style='text-align: right; font-weight: bold'>%(date)s</div>
<ul>
%(categories)s
</ul>
</body>
</html>
'''%dict(date=time.strftime('%a, %d %B, %Y', time.localtime()),
categories=clist, title=self.title)
index = os.path.join(self.temp_dir, 'index.html')
open(index, 'wb').write(src.encode('utf-8'))
return index
def parse_feeds(self):
feeds = self.get_feeds()
articles = {}
for title, url in feeds:
try:
src = self.browser.open(url).read()
except Exception, err:
print 'Could not fetch feed: %s\nError: %s'%(url, err)
continue
articles[title] = []
soup = BeautifulStoneSoup(src)
for item in soup.findAll('item'):
try:
pubdate = item.find('pubdate').string
if not pubdate:
continue
pubdate = pubdate.replace('+0000', 'GMT')
d = {
'title' : item.find('title').string,
'url' : self.print_version(item.find('guid').string),
'timestamp': calendar.timegm(self.strptime(pubdate)),
'date' : pubdate
}
delta = time.time() - d['timestamp']
if delta > self.oldest_article*3600*24:
continue
except Exception, err:
continue
try:
desc = item.find('description')
d['description'] = self.process_html_description(desc) if self.html_description else desc.string
except:
d['description'] = ''
articles[title].append(d)
articles[title].sort(key=operator.itemgetter('timestamp'), reverse=True)
articles[title][self.max_articles_per_feed:] = []
for item in articles[title]:
item.pop('timestamp')
if not articles[title]:
articles.pop(title)
return articles
@classmethod
def process_html_description(cls, tag):
src = '\n'.join(tag.contents)
replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo' ]
for e in replaced_entities:
ent = '&'+e+';'
src = src.replace(ent, unichr(name2codepoint[e]))
return re.compile(r'<a.*?</a>', re.IGNORECASE|re.DOTALL).sub('', src)
DAY_MAP = dict(Sun=0, Mon=1, Tue=2, Wed=3, Thu=4, Fri=5, Sat=6)
MONTH_MAP = dict(Jan=1, Feb=2, Mar=3, Apr=4, May=5, Jun=6, Jul=7, Aug=8, Sep=9, Oct=10, Nov=11, Dec=12)
FULL_MONTH_MAP = dict(January=1, February=2, March=3, April=4, May=5, June=6,
July=7, August=8, September=9, October=10,
November=11, December=12)
@classmethod
def strptime(cls, src):
src = src.strip().split()
src[0] = str(cls.DAY_MAP[src[0][:-1]])+','
try:
src[2] = str(cls.MONTH_MAP[src[2]])
except KeyError:
src[2] = str(cls.FULL_MONTH_MAP[src[2]])
return time.strptime(' '.join(src), '%w, %d %m %Y %H:%M:%S %Z')
def command_line_options(self):
args = []
args.append('--max-recursions='+str(self.max_recursions))
args.append('--delay='+str(self.delay))
args.append('--max-files='+str(self.max_files))
for i in self.match_regexps:
args.append('--match-regexp="'+i+'"')
for i in self.filter_regexps:
args.append('--filter-regexp="'+i+'"')
return args
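
In rough terms, a profile does its work at construction time: __init__ creates a temporary directory, fetches and parses the feeds, writes index.html and exposes it as a file: URL that web2lrf then hands to the fetcher. A hedged usage sketch using the BBC profile defined below (requires network access):

from libprs500.ebooks.lrf.web.profiles.bbc import BBC

profile = BBC()                       # fetches the feeds and builds the index
print profile.url                     # file: URL of the generated index.html
print profile.command_line_options()  # e.g. ['--max-recursions=2', '--delay=0', ...]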

View File

@@ -0,0 +1,57 @@
## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''
Fetch the BBC.
'''
import re
from libprs500.ebooks.lrf.web.profiles import DefaultProfile
from libprs500.ebooks.BeautifulSoup import BeautifulSoup
class BBC(DefaultProfile):
title = 'The BBC'
max_recursions = 2
timefmt = ' [%a, %d %b, %Y]'
no_stylesheets = True
preprocess_regexps = \
[ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
[
# Remove footer from individual stories
(r'<div class=.footer.>.*?Published',
lambda match : '<p></p><div class="footer">Published'),
# Add some style info in place of disabled stylesheet
(r'<link.*?type=.text/css.*?>', lambda match :
'''<style type="text/css">
.headline {font-size: x-large;}
.fact { padding-top: 10pt }
</style>'''),
]
]
def print_version(self, url):
return url.replace('http://', 'http://newsvote.bbc.co.uk/mpapps/pagetools/print/')
def get_feeds(self):
src = self.browser.open('http://news.bbc.co.uk/1/hi/help/3223484.stm').read()
soup = BeautifulSoup(src[src.index('<html'):])
feeds = []
ul = soup.find('ul', attrs={'class':'rss'})
for link in ul.findAll('a'):
feeds.append((link.string, link['href']))
return feeds
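
The preprocess_regexps pairs above are substitution rules run on each downloaded page before conversion (per the comment in DefaultProfile); a rough illustration on a made-up HTML fragment:

from libprs500.ebooks.lrf.web.profiles.bbc import BBC

html = ('<html><link rel="stylesheet" type="text/css" href="nol.css">'
        '<div class="footer">Most popular now ... Published 2007/10/27</div></html>')
for pattern, replacement in BBC.preprocess_regexps:
    html = pattern.sub(replacement, html)
# The <link> tag is now replaced by the inline <style> block, and everything in
# the footer before 'Published' has been dropped.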

View File

@@ -0,0 +1,81 @@
## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''
Fetch The Economist.
'''
import re
from libprs500.ebooks.lrf.web.profiles import DefaultProfile
from libprs500.ebooks.BeautifulSoup import BeautifulSoup
class Economist(DefaultProfile):
title = 'The Economist'
timefmt = ' [%d %b %Y]'
max_recursions = 3
TITLES = [
'The world this week',
'Letters',
'Briefings',
'Special reports',
'Britain',
'Europe',
'United States',
'The Americas',
'Middle East and Africa',
'Asia',
'International',
'Business',
'Finance and economics',
'Science and technology',
'Books and arts',
'Indicators'
]
preprocess_regexps = \
[ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
[
# Remove advert
(r'<noscript.*?</noscript>', lambda match: ''),
]
]
def __init__(self, username=None, password=None):
DefaultProfile.__init__(self, username, password)
self.browser = None # Needed as otherwise there are timeouts while fetching actual articles
def print_version(self, url):
return url.replace('displaystory', 'PrinterFriendly').replace('&fsrc=RSS', '')
def get_feeds(self):
src = self.browser.open('http://economist.com/rss/').read()
soup = BeautifulSoup(src)
feeds = []
for ul in soup.findAll('ul'):
lis = ul.findAll('li')
try:
title, link = lis[0], lis[1]
except IndexError:
continue
title = title.string
if title:
title = title.strip()
if title not in self.__class__.TITLES:
continue
a = link.find('a')
feeds.append((title, a['href'].strip()))
return feeds

View File

@@ -12,14 +12,27 @@
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''Logic to create a Newsweek HTML aggregator from RSS feeds'''
'''
Profile to download Newsweek
'''
from libprs500.ebooks.lrf.web.profiles import DefaultProfile
import tempfile, os, shutil
from libprs500.ebooks.lrf.web import build_index, parse_feeds
from libprs500 import __appname__, iswindows, browser
RSS_FEEDS = [
class Newsweek(DefaultProfile):
title = 'Newsweek'
max_recursions = 2
timefmt = ' [%d %b %Y]'
html_description = True
oldest_article = 15
def print_version(self, url):
if not url.endswith('/'):
url += '/'
return url + 'output/print'
def get_feeds(self):
return [
('Top News', 'http://feeds.newsweek.com/newsweek/TopNews',),
('Periscope', 'http://feeds.newsweek.com/newsweek/periscope'),
('Politics', 'http://feeds.newsweek.com/headlines/politics'),
@@ -32,28 +45,5 @@ RSS_FEEDS = [
('Society', 'http://feeds.newsweek.com/newsweek/society'),
('Entertainment', 'http://feeds.newsweek.com/newsweek/entertainment'),
]
def print_version(url):
if '?' in url:
url = url[:url.index('?')]
if not url.endswith('/'):
url += '/'
return url + 'output/print'
def initialize(profile):
profile['temp dir'] = tempfile.mkdtemp(prefix=__appname__+'_')
profile['browser'] = browser()
articles = parse_feeds(RSS_FEEDS, profile['browser'], print_version,
max_articles_per_feed=20, oldest_article=15,
html_description=True)
index = build_index('Newsweek', articles, profile['temp dir'])
profile['url'] = 'file:'+ ('' if iswindows else '//') + index
profile['timefmt'] = ' [%d %b %Y]'
profile['max_recursions'] = 2
profile['title'] = 'Newsweek'
profile['url'] = 'file:'+ ('' if iswindows else '//') +index
def finalize(profile):
if os.path.isdir(profile['temp dir']):
shutil.rmtree(profile['temp dir'])

View File

@@ -0,0 +1,66 @@
## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''
Profile to download the New York Times
'''
import re
from libprs500.ebooks.lrf.web.profiles import DefaultProfile
from libprs500.ebooks.BeautifulSoup import BeautifulSoup
class NYTimes(DefaultProfile):
title = 'The New York Times'
timefmt = ' [%a, %d %b, %Y]'
max_recursions = 2
preprocess_regexps = \
[ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
[
# Remove header bar
(r'(<body.*?>).*?<h1', lambda match: match.group(1)+'<h1'),
(r'<div class="articleTools">.*></ul>', lambda match : ''),
# Remove footer bar
(r'<\!-- end \#article -->.*', lambda match : '</body></html>'),
(r'<div id="footer">.*', lambda match : '</body></html>'),
]
]
def get_browser(self):
br = DefaultProfile.get_browser()
if self.username is not None and self.password is not None:
br.open('http://www.nytimes.com/auth/login')
br.select_form(name='login')
br['USERID'] = self.username
br['PASSWORD'] = self.password
br.submit()
return br
def get_feeds(self):
src = self.browser.open('http://www.nytimes.com/services/xml/rss/index.html').read()
soup = BeautifulSoup(src[src.index('<html'):])
feeds = []
for link in soup.findAll('link', attrs={'type':'application/rss+xml'}):
if link['title'] not in ['NYTimes.com Homepage', 'Obituaries', 'Pogue\'s Posts',
'Dining & Wine', 'Home & Garden', 'Multimedia',
'Most E-mailed Articles',
'Automobiles', 'Fashion & Style', 'Television News',
'Education']:
feeds.append((link['title'], link['href']))
return feeds
def print_version(self, url):
return url + '?&pagewanted=print'
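
The login in get_browser only happens when credentials are supplied; web2lrf passes --username and --password straight to the profile constructor (see process_profile earlier). A sketch with placeholder credentials:

from libprs500.ebooks.lrf.web.profiles.nytimes import NYTimes

# Placeholder credentials; requires a valid NYTimes.com account and network access
profile = NYTimes(username='someone@example.com', password='secret')
print profile.url   # file: URL of an index built with the logged-in browser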