	Add support for user profiles in web2lrf
This commit is contained in:
parent 275b59a2e7
commit 8799a6f3f2
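The headline feature is a new --user-profile switch for web2lrf: point it at a python file defining a DefaultProfile subclass and that profile is loaded and used in place of a built-in one. As a minimal sketch of what such a file could look like (the class name, feed URL, and print-URL rule below are hypothetical; the get_feeds()/print_version() contract comes from the new profiles/__init__.py later in this diff):

# myprofile.py -- hypothetical user profile; invoke with:
#   web2lrf --user-profile /path/to/myprofile.py
from libprs500.ebooks.lrf.web.profiles import DefaultProfile

class ExampleNews(DefaultProfile):  # hypothetical site

    title          = 'Example News'
    max_recursions = 2

    def get_feeds(self):
        # get_feeds() must return a list of (title, url) 2-tuples
        return [('Front Page', 'http://example.com/rss/front.xml')]

    @classmethod
    def print_version(cls, url):
        # Rewrite an article URL into its printer-friendly version (made-up rule)
        return url + '?pagewanted=print'

web2lrf uses the first DefaultProfile subclass it finds in the file, so one class per file is the simplest layout.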
@@ -13,134 +13,3 @@
 ##    with this program; if not, write to the Free Software Foundation, Inc.,
 ##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 
-import os, time, calendar, operator, re
-
-from libprs500 import iswindows
-from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup
-from htmlentitydefs import name2codepoint
-
-DAY_MAP   = dict(Sun=0, Mon=1, Tue=2, Wed=3, Thu=4, Fri=5, Sat=6)
-MONTH_MAP = dict(Jan=1, Feb=2, Mar=3, Apr=4, May=5, Jun=6, Jul=7, Aug=8, Sep=9, Oct=10, Nov=11, Dec=12)
-FULL_MONTH_MAP = dict(January=1, February=2, March=3, April=4, May=5, June=6,
-                      July=7, August=8, September=9, October=10,
-                      November=11, December=12)
-
-def strptime(src):
-    src = src.strip().split()
-    src[0] = str(DAY_MAP[src[0][:-1]])+','
-    try:
-        src[2] = str(MONTH_MAP[src[2]])
-    except KeyError:
-        src[2] = str(FULL_MONTH_MAP[src[2]])
-    return time.strptime(' '.join(src), '%w, %d %m %Y %H:%M:%S %Z')
-
-def process_html_description(tag):
-        src = '\n'.join(tag.contents)
-        replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo' ]
-        for e in replaced_entities:
-            ent = '&'+e+';'
-            src = src.replace(ent, unichr(name2codepoint[e]))
-        return re.compile(r'<a.*?</a>', re.IGNORECASE|re.DOTALL).sub('', src)
-
-def parse_feeds(feeds, browser, print_version,
-                max_articles_per_feed=10,
-                html_description=False,
-                oldest_article=7):
-    '''
-    @param print_version: Callable that takes a url string and returns the url to
-                          printable version of the article pointed to by the original url.
-    @param max_articles_per_feed: Maximum number of articles to download from each feed
-    @param html_description: If true, the articles' descriptions are processed as HTML
-    @param oldest_article: A number in days. No articles older than now - oldest_article
-                           will be downloaded.
-    '''
-    articles = {}
-    for title, url in feeds:
-        try:
-            src = browser.open(url).read()
-        except Exception, err:
-            print 'Could not fetch feed: %s\nError: %s'%(url, err)
-            continue
-
-        articles[title] = []
-        soup = BeautifulStoneSoup(src)
-        for item in soup.findAll('item'):
-            try:
-                pubdate = item.find('pubdate').string
-                if not pubdate:
-                    continue
-                pubdate = pubdate.replace('+0000', 'GMT')
-                d = {
-                    'title'    : item.find('title').string,
-                    'url'      : print_version(item.find('guid').string),
-                    'timestamp': calendar.timegm(strptime(pubdate)),
-                    'date'     : pubdate
-                    }
-                delta = time.time() - d['timestamp']
-                if delta > oldest_article*3600*24:
-                    continue
-
-            except Exception, err:
-                continue
-            try:
-                desc = item.find('description')
-                d['description'] = process_html_description(desc) if html_description else desc.string
-            except:
-                d['description'] = ''
-            articles[title].append(d)
-        articles[title].sort(key=operator.itemgetter('timestamp'), reverse=True)
-        articles[title][max_articles_per_feed:] = []
-        for item in articles[title]:
-            item.pop('timestamp')
-        if not articles[title]:
-            articles.pop(title)
-    return articles
-
-
-def build_index(title, articles, dir):
-    '''Build an RSS based index.html'''
-
-    def build_sub_index(title, items):
-        ilist = ''
-        li = u'<li><a href="%(url)s">%(title)s</a> <span style="font-size: x-small">[%(date)s]</span><br/>\n'+\
-            u'<div style="font-size:small; font-family:sans">%(description)s<br /></div></li>\n'
-        for item in items:
-            ilist += li%item
-        return u'''\
-        <html>
-        <body>
-        <h2>%(title)s</h2>
-        <ul>
-        %(items)s
-        </ul>
-        </body>
-        </html>
-        '''%dict(title=title, items=ilist.rstrip())
-
-    cnum = 0
-    clist = ''
-    categories = articles.keys()
-    categories.sort()
-    for category in categories:
-        cnum  += 1
-        cfile = os.path.join(dir, 'category'+str(cnum)+'.html')
-        prefix = 'file:' if iswindows else ''
-        clist += u'<li><a href="%s">%s</a></li>\n'%(prefix+cfile, category)
-        src = build_sub_index(category, articles[category])
-        open(cfile, 'wb').write(src.encode('utf-8'))
-
-    src = '''\
-    <html>
-    <body>
-    <h1>%(title)s</h1>
-    <div style='text-align: right; font-weight: bold'>%(date)s</div>
-    <ul>
-    %(categories)s
-    </ul>
-    </body>
-    </html>
-    '''%dict(date=time.strftime('%a, %d %B, %Y', time.localtime()),
-             categories=clist, title=title)
-    index = os.path.join(dir, 'index.html')
-    open(index, 'wb').write(src.encode('utf-8'))
-    return index
@@ -1,53 +0,0 @@
-##    Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
-##    This program is free software; you can redistribute it and/or modify
-##    it under the terms of the GNU General Public License as published by
-##    the Free Software Foundation; either version 2 of the License, or
-##    (at your option) any later version.
-##
-##    This program is distributed in the hope that it will be useful,
-##    but WITHOUT ANY WARRANTY; without even the implied warranty of
-##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-##    GNU General Public License for more details.
-##
-##    You should have received a copy of the GNU General Public License along
-##    with this program; if not, write to the Free Software Foundation, Inc.,
-##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-
-import tempfile, shutil, os
-from libprs500.ebooks.lrf.web import build_index, parse_feeds
-
-RSS = 'http://news.bbc.co.uk/1/hi/help/3223484.stm'
-
-from libprs500 import __appname__, iswindows, browser
-from libprs500.ebooks.BeautifulSoup import BeautifulSoup
-
-
-def get_feeds(browser):
-    src = browser.open(RSS).read()
-    soup = BeautifulSoup(src[src.index('<html'):])
-    feeds = []
-    ul =  soup.find('ul', attrs={'class':'rss'})
-    for link in ul.findAll('a'):
-        feeds.append((link.string, link['href']))
-    return feeds
-
-def initialize(profile):
-    profile['temp dir'] = tempfile.mkdtemp(prefix=__appname__+'_')
-    profile['browser'] = browser()
-    feeds = get_feeds(profile['browser'])
-    articles = parse_feeds(feeds, profile['browser'], lambda x: x.replace('http://', 'http://newsvote.bbc.co.uk/mpapps/pagetools/print/'))
-    index = build_index('The BBC', articles, profile['temp dir'])
-    profile['url'] = 'file:'+ ('' if iswindows else '//') + index
-    profile['timefmt'] = ' [%a, %d %b, %Y]'
-    profile['max_recursions'] =  2
-    profile['title']          = 'The BBC'
-    profile['no_stylesheets'] = True
-
-def finalize(profile):
-    if os.path.isdir(profile['temp dir']):
-        shutil.rmtree(profile['temp dir'])
-
-
-
-
@@ -14,43 +14,48 @@
 ##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 '''Convert known websites into LRF files.'''
 
-import sys, time, tempfile, shutil, os, logging
+import sys, time, tempfile, shutil, os, logging, imp, inspect
 from urlparse import urlsplit
 
 from libprs500 import __appname__, setup_cli_handlers, CommandLineError
 from libprs500.ebooks.lrf import option_parser as lrf_option_parser
 from libprs500.ebooks.lrf.html.convert_from import process_file
-from libprs500.ebooks.lrf.web.profiles import profiles
 from libprs500.web.fetch.simple import create_fetcher
 
-available_profiles = profiles.keys()
-available_profiles.remove('default')
-available_profiles = ' '.join(available_profiles)
+from libprs500.ebooks.lrf.web.profiles import DefaultProfile
+from libprs500.ebooks.lrf.web.profiles.nytimes import NYTimes
+from libprs500.ebooks.lrf.web.profiles.bbc import BBC
+from libprs500.ebooks.lrf.web.profiles.newsweek import Newsweek
+
+builtin_profiles   = [NYTimes, BBC, Newsweek]
+available_profiles = [i.__module__.rpartition('.')[2] for i in builtin_profiles]
 
 def option_parser():
     parser = lrf_option_parser(usage='''%prog [options] website_profile\n\n'''
                           '''%prog downloads a site from the web and converts it '''
                           '''into a LRF file for use with the SONY Reader. '''
-                          '''website_profile is one of '''+available_profiles+\
+                          '''website_profile is one of '''+str(available_profiles)+\
                           ''' If you specify a website_profile of default or do not specify '''
                           '''it, you must specify the --url option.'''
                           )
 
     parser.add_option('-u', '--url', dest='url', default=None,
                       help='The URL to download. You only need to specify this if you are not specifying a website_profile.')
-
+    parser.add_option('--user-profile', default=None,
+                      help='Path to a python file containing a user created profile.')
     parser.add_option('--username', dest='username', default=None,
                       help='Specify the username to be used while downloading. Only used if the profile supports it.')
     parser.add_option('--password', dest='password', default=None,
                       help='Specify the password to be used while downloading. Only used if the profile supports it.')
-    parser.add_option('--timeout', help='Timeout in seconds to wait for a response from the server. Default: %(timeout)s s'%profiles['default'],
+    parser.add_option('--timeout', help='Timeout in seconds to wait for a response from the server. Default: %d s'%DefaultProfile.timeout,
                       default=None, type='int', dest='timeout')
-    parser.add_option('-r', '--max-recursions', help='Maximum number of levels to recurse i.e. depth of links to follow. Default %(max_recursions)s'%profiles['default'],
+    parser.add_option('-r', '--max-recursions', help='Maximum number of levels to recurse i.e. depth of links to follow. Default %d'%DefaultProfile.max_recursions,
                       default=None, type='int', dest='max_recursions')
     parser.add_option('-n', '--max-files', default=None, type='int', dest='max_files',
-                      help='The maximum number of files to download. This only applies to files from <a href> tags. Default is %(max_files)s'%profiles['default'])
+                      help='The maximum number of files to download. This only applies to files from <a href> tags. Default is %d'%DefaultProfile.max_files)
     parser.add_option('--delay', default=None, dest='delay', type='int',
-                      help='Minimum interval in seconds between consecutive fetches. Default is %(delay)s s'%profiles['default'])
+                      help='Minimum interval in seconds between consecutive fetches. Default is %d s'%DefaultProfile.delay)
     parser.add_option('--dont-download-stylesheets', action='store_true', default=None,
                       help='Do not download CSS stylesheets.', dest='no_stylesheets')
 
@@ -85,45 +90,58 @@ def process_profile(args, options, logger=None):
             level = logging.DEBUG if options.verbose else logging.INFO
             logger = logging.getLogger('web2lrf')
             setup_cli_handlers(logger, level)
+        index = -1
+        if options.user_profile is not None:
+            path = os.path.abspath(options.user_profile)
+            name = os.path.splitext(os.path.basename(path))[0]
+            res = imp.find_module(name, [os.path.dirname(path)])
+            module = imp.load_module(name, *res)
+            classes = inspect.getmembers(module,
+                lambda x : inspect.isclass(x) and issubclass(x, DefaultProfile)\
+                           and x is not DefaultProfile)
+            if not classes:
+                raise CommandLineError('Invalid user profile '+path)
+            builtin_profiles.append(classes[0][1])
+            available_profiles.append(name)
+            if len(args) < 2:
+                args.append('')
+            args[1] = name
         if len(args) == 2:
-            if not profiles.has_key(args[1]):
-                raise CommandLineError('Unknown profile: %s\nValid profiles: %s'%(args[1], profiles.keys()))
-        profile = profiles[args[1]] if len(args) == 2 else profiles['default']
-        profile['username'] = options.username
-        profile['password'] = options.password
-        if profile.has_key('initialize'):
-            profile['initialize'](profile)
-        if profile.has_key('browser'):
-            options.browser = profile['browser']
+            try:
+                index = available_profiles.index(args[1])
+            except ValueError:
+                raise CommandLineError('Unknown profile: %s\nValid profiles: %s'%(args[1], available_profiles))
+        profile = DefaultProfile if index == -1 else builtin_profiles[index]
+        profile = profile(options.username, options.password)
+        if profile.browser is not None:
+            options.browser = profile.browser
 
         for opt in ('url', 'timeout', 'max_recursions', 'max_files', 'delay', 'no_stylesheets'):
            val = getattr(options, opt)
            if val is None:
-                setattr(options, opt, profile[opt])
+                setattr(options, opt, getattr(profile, opt))
 
         if not options.url:
-            options.url = profile['url']
+            options.url = profile.url
 
         if not options.url:
             raise CommandLineError('You must specify the --url option or a profile from one of: %s'%(available_profiles,))
 
         if not options.title:
-            title = profile['title']
+            title = profile.title
             if not title:
                 title = urlsplit(options.url).netloc
-            options.title = title + time.strftime(profile['timefmt'], time.localtime())
+            options.title = title + time.strftime(profile.timefmt, time.localtime())
 
-        options.match_regexps += profile['match_regexps']
-        options.preprocess_regexps = profile['preprocess_regexps']
-        options.filter_regexps += profile['filter_regexps']
+        options.match_regexps += profile.match_regexps
+        options.preprocess_regexps = profile.preprocess_regexps
+        options.filter_regexps += profile.filter_regexps
         if len(args) == 2 and args[1] != 'default':
             options.anchor_ids = False
 
         htmlfile, tdir = fetch_website(options, logger)
         create_lrf(htmlfile, options, logger)
     finally:
-        if profile.has_key('finalize'):
-            profile['finalize'](profile)
         if tdir and os.path.isdir(tdir):
             shutil.rmtree(tdir)
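The --user-profile machinery above imports an arbitrary python file by path and scans it for DefaultProfile subclasses. A standalone sketch of that imp/inspect pattern (the function name and error message are illustrative, not part of the commit):

import imp, inspect, os

def load_first_subclass(path, base):
    '''Load the first subclass of `base` defined in the python file at `path`.'''
    name = os.path.splitext(os.path.basename(path))[0]
    # imp.find_module returns a (file, pathname, description) tuple
    found = imp.find_module(name, [os.path.dirname(path)])
    # imp.load_module executes the file under the given module name
    module = imp.load_module(name, *found)
    # Keep only classes deriving from `base`, excluding `base` itself
    classes = inspect.getmembers(module,
        lambda x: inspect.isclass(x) and issubclass(x, base) and x is not base)
    if not classes:
        raise ValueError('No subclass of %s found in %s' % (base.__name__, path))
    return classes[0][1]  # getmembers yields (name, value) pairs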
@@ -1,81 +0,0 @@
-##    Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
-##    This program is free software; you can redistribute it and/or modify
-##    it under the terms of the GNU General Public License as published by
-##    the Free Software Foundation; either version 2 of the License, or
-##    (at your option) any later version.
-##
-##    This program is distributed in the hope that it will be useful,
-##    but WITHOUT ANY WARRANTY; without even the implied warranty of
-##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-##    GNU General Public License for more details.
-##
-##    You should have received a copy of the GNU General Public License along
-##    with this program; if not, write to the Free Software Foundation, Inc.,
-##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-
-import tempfile, shutil, os
-from libprs500.ebooks.lrf.web import build_index, parse_feeds
-
-RSS = 'http://economist.com/rss/'
-TITLES = [
-          'The world this week',
-          'Letters',
-          'Briefings',
-          'Special reports',
-          'Britain',
-          'Europe',
-          'United States',
-          'The Americas',
-          'Middle East and Africa',
-          'Asia',
-          'International',
-          'Business',
-          'Finance and economics',
-          'Science and technology',
-          'Books and arts',
-          'Indicators'
-          ]
-
-from libprs500 import __appname__, iswindows, browser
-from libprs500.ebooks.BeautifulSoup import BeautifulSoup
-
-def print_version(url):
-    return url.replace('displaystory', 'PrinterFriendly').replace('&fsrc=RSS', '')
-
-def get_feeds(browser):
-    src = browser.open(RSS).read()
-    soup = BeautifulSoup(src)
-    feeds = []
-    for ul in soup.findAll('ul'):
-        lis =  ul.findAll('li')
-        try:
-            title, link = lis[0], lis[1]
-        except IndexError:
-            continue
-        title = title.string
-        if title:
-            title = title.strip()
-        if title not in TITLES:
-            continue
-        a = link.find('a')
-        feeds.append((title, a['href'].strip()))
-
-    return feeds
-
-def initialize(profile):
-    profile['temp dir'] = tempfile.mkdtemp(prefix=__appname__+'_')
-    profile['browser'] = browser()
-    feeds = get_feeds(profile['browser'])
-    articles = parse_feeds(feeds, profile['browser'], print_version, max_articles_per_feed=20)
-    index = build_index('The Economist', articles, profile['temp dir'])
-    profile['url'] = 'file:'+ ('' if iswindows else '//') + index
-    profile['timefmt'] = ' [%d %b %Y]'
-    profile['max_recursions'] =  3
-    profile['title']          = 'The Economist'
-    profile.pop('browser') # Needed as for some reason using the same browser instance causes timeouts
-
-def finalize(profile):
-    if os.path.isdir(profile['temp dir']):
-        shutil.rmtree(profile['temp dir'])
-
@@ -1,73 +0,0 @@
-##    Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
-##    This program is free software; you can redistribute it and/or modify
-##    it under the terms of the GNU General Public License as published by
-##    the Free Software Foundation; either version 2 of the License, or
-##    (at your option) any later version.
-##
-##    This program is distributed in the hope that it will be useful,
-##    but WITHOUT ANY WARRANTY; without even the implied warranty of
-##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-##    GNU General Public License for more details.
-##
-##    You should have received a copy of the GNU General Public License along
-##    with this program; if not, write to the Free Software Foundation, Inc.,
-##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-'''New York Times from RSS feeds.'''
-import os, tempfile, shutil
-
-from libprs500 import __appname__, iswindows, browser
-from libprs500.ebooks.BeautifulSoup import BeautifulSoup
-from libprs500.ebooks.lrf.web import build_index, parse_feeds
-
-RSS = 'http://www.nytimes.com/services/xml/rss/index.html'
-LOGIN = 'http://www.nytimes.com/auth/login'
-
-def get_feeds(browser):
-    src = browser.open(RSS).read()
-    soup = BeautifulSoup(src[src.index('<html'):])
-    feeds = []
-    for link in soup.findAll('link', attrs={'type':'application/rss+xml'}):
-        if link['title'] not in ['NYTimes.com Homepage', 'Obituaries', 'Pogue\'s Posts',
-                                 'Dining & Wine', 'Home & Garden', 'Multimedia',
-                                 'Most E-mailed Articles',
-                                 'Automobiles', 'Fashion & Style', 'Television News',
-                                 'Education']:
-            feeds.append((link['title'], link['href']))
-        #else: print link['title']
-
-    return feeds
-
-def initialize(profile):
-    profile['temp dir'] = tempfile.mkdtemp(prefix=__appname__+'_')
-    profile['browser'] = login(profile)
-    feeds = get_feeds(profile['browser'])
-    articles = parse_feeds(feeds, profile['browser'], lambda x: x + '?&pagewanted=print',
-                           oldest_article=2)
-    index = build_index('The New York Times', articles, profile['temp dir'])
-    profile['url'] = 'file:'+ ('' if iswindows else '//') + index
-    profile['timefmt'] = ' [%a, %d %b, %Y]'
-    profile['max_recursions'] =  2
-    profile['title']          = 'The New York Times'
-
-
-def finalize(profile):
-    if os.path.isdir(profile['temp dir']):
-        shutil.rmtree(profile['temp dir'])
-
-
-def login(profile):
-    br = browser()
-    if profile['username'] and profile['password']:
-        br.open(LOGIN)
-        br.select_form(name='login')
-        br['USERID']   = profile['username']
-        br['PASSWORD'] = profile['password']
-        br.submit()
-    return br
-
-
-if __name__ == '__main__':
-    feeds = get_feeds()
-    articles = parse_feeds(feeds)
-    print articles
-
@@ -1,136 +0,0 @@
-##    Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
-##    This program is free software; you can redistribute it and/or modify
-##    it under the terms of the GNU General Public License as published by
-##    the Free Software Foundation; either version 2 of the License, or
-##    (at your option) any later version.
-##
-##    This program is distributed in the hope that it will be useful,
-##    but WITHOUT ANY WARRANTY; without even the implied warranty of
-##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-##    GNU General Public License for more details.
-##
-##    You should have received a copy of the GNU General Public License along
-##    with this program; if not, write to the Free Software Foundation, Inc.,
-##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-'''Profiles for known websites.'''
-import re
-
-from libprs500.ebooks.lrf.web.newsweek import initialize as newsweek_initialize
-from libprs500.ebooks.lrf.web.newsweek import finalize as newsweek_finalize
-from libprs500.ebooks.lrf.web.nytimes import initialize as nytimes_initialize
-from libprs500.ebooks.lrf.web.nytimes import finalize as nytimes_finalize
-from libprs500.ebooks.lrf.web.bbc import initialize as bbc_initialize
-from libprs500.ebooks.lrf.web.bbc import finalize as bbc_finalize
-from libprs500.ebooks.lrf.web.economist import initialize as economist_initialize
-from libprs500.ebooks.lrf.web.economist import finalize as economist_finalize
-
-
-profiles = {
-            'default' : {
-                         'url'               : '',    # The URL of the website
-                         'title'             : '',    # The title to use for the LRF file
-                         'max_recursions'    : 1,     # Number of levels of links to follow
-                         'max_files'         : 1000,  # Maximum number of files to download
-                         'delay'             : 0,     # Delay between consecutive downloads
-                         'timeout'           : 10,    # Timeout for fetching files from server in seconds
-                         'timefmt'           : ' [%a %d %b %Y]',
-                         'no_stylesheets'    : False, # Download stylesheets
-                         'match_regexps'     : [],    # List of regular expressions that determines which links to follow
-                         'filter_regexps'    : [],    # List of regular expressions that determines which links to ignore
-                         # Only one of match_regexps or filter_regexps should be defined
-                         'html2lrf_options'  : [],    # List of options to pass to html2lrf
-                         'preprocess_regexps': [],    # List of regexp substitution rules to run on the downloaded HTML before running html2lrf
-                         # See the profiles below for examples of these settings.
-                       },
-
-            'nytimes' : {
-                         'initialize'          : nytimes_initialize,
-                         'finalize'            : nytimes_finalize,
-
-                         'preprocess_regexps' :
-                         [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
-                          [
-                           # Remove header bar
-                           (r'(<body.*?>).*?<h1', lambda match: match.group(1)+'<h1'),
-                           (r'<div class="articleTools">.*></ul>', lambda match : ''),
-                           # Remove footer bar
-                           (r'<\!--  end \#article -->.*', lambda match : '</body></html>'),
-                           (r'<div id="footer">.*', lambda match : '</body></html>'),
-                           ]
-                          ],
-                         },
-
-            'bbc'     : {
-                          'initialize'          : bbc_initialize,
-                          'finalize'            : bbc_finalize,
-                          'preprocess_regexps' :
-                         [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
-                          [
-                           # Remove footer from individual stories
-                           (r'<div class=.footer.>.*?Published',
-                            lambda match : '<p></p><div class="footer">Published'),
-                           # Add some style info in place of disabled stylesheet
-                           (r'<link.*?type=.text/css.*?>', lambda match :
-                            '''<style type="text/css">
-                                .headline {font-size: x-large;}
-                                .fact { padding-top: 10pt  }
-                                </style>'''),
-                           ]
-                          ],
-                          },
-
-            'newsweek' : {
-                          'initialize'          : newsweek_initialize,
-                          'finalize'            : newsweek_finalize,
-                          'preprocess_regexps'  :
-                         [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
-                          [
-                           # Make fonts larger
-                           (r'<style.*?\.copyright.*?</style>',
-                            lambda match : \
-                        '''<style type="text/css">'''
-                        '''updateTime{font:small Arial;color:#000000;}'''
-                        '''.credit{font:small Arial;color:#999999;}'''
-                        '''.head{font:bold 18pt x-large;color:#CC0000;}'''
-                        '''.abstract{font:14pt large Verdana;color:#000000;}'''
-                        '''.title{font:bold;color:#000000;}'''
-                        '''.source{font:bold small Verdana;color:#CC0000;}'''
-                        '''.footerLink{font:bold Verdana;color:#000000;}'''
-                        '''.caption{font: Verdana;color:#000000;}'''
-                        '''.textBodyBlack, .copyright{font: Verdana;color:#000000;}'''
-                        '''.copyright{font-style:italic;}'''
-                        '''</style>'''
-                            ),
-                           ]
-                          ],
-                          },
-
-            'economist' : {
-                           'initialize'          : economist_initialize,
-                           'finalize'            : economist_finalize,
-                           'preprocess_regexps' :
-                           [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
-                            [
-                             # Remove advert
-                             (r'<noscript.*?</noscript>', lambda match: ''),
-                             ]
-                            ],
-                           },
-            }
-
-for key in profiles.keys():
-    if key == 'default':
-        continue
-    newd = profiles['default'].copy()
-    newd.update(profiles[key])
-    profiles[key] = newd
-
-def profile_to_command_line_options(profile):
-    args = []
-    args.append('--max-recursions='+str(profile['max_recursions']))
-    args.append('--delay='+str(profile['delay']))
-    for i in profile['match_regexps']:
-        args.append('--match-regexp="'+i+'"')
-    for i in profile['filter_regexps']:
-        args.append('--filter-regexp="'+i+'"')
-    return args
src/libprs500/ebooks/lrf/web/profiles/__init__.py (new file, 227 lines)
@@ -0,0 +1,227 @@
+##    Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
+##    This program is free software; you can redistribute it and/or modify
+##    it under the terms of the GNU General Public License as published by
+##    the Free Software Foundation; either version 2 of the License, or
+##    (at your option) any later version.
+##
+##    This program is distributed in the hope that it will be useful,
+##    but WITHOUT ANY WARRANTY; without even the implied warranty of
+##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+##    GNU General Public License for more details.
+##
+##    You should have received a copy of the GNU General Public License along
+##    with this program; if not, write to the Free Software Foundation, Inc.,
+##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+'''
+'''
+
+import tempfile, time, calendar, re, operator
+from htmlentitydefs import name2codepoint
+
+from libprs500 import __appname__, iswindows, browser
+from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup
+
+
+class DefaultProfile(object):
+
+    url                   = ''    # The URL of the website
+    title                 = 'Default Profile'    # The title to use for the LRF file
+    max_articles_per_feed = 10    # Maximum number of articles to download from each feed
+    html_description      = False # If True process the <description> element of the feed as HTML
+    oldest_article        = 7     # How many days old should the oldest article downloaded from the feeds be?
+    max_recursions        = 1     # Number of levels of links to follow
+    max_files             = 3000  # Maximum number of files to download
+    delay                 = 0     # Delay between consecutive downloads
+    timeout               = 10    # Timeout for fetching files from server in seconds
+    timefmt               = ' [%a %d %b %Y]' # The format of the date shown on the first page
+    no_stylesheets        = False # Download stylesheets only if False
+    match_regexps         = []    # List of regular expressions that determines which links to follow
+    filter_regexps        = []    # List of regular expressions that determines which links to ignore
+    # Only one of match_regexps or filter_regexps should be defined
+
+    html2lrf_options   = []    # List of options to pass to html2lrf
+    # List of regexp substitution rules to run on the downloaded HTML. Each element of the
+    # list should be a two element tuple. The first element of the tuple should
+    # be a compiled regular expression and the second a callable that takes
+    # a single match object and returns a string to replace the match.
+    preprocess_regexps = []
+
+    # See the built-in profiles for examples of these settings.
+
+    def get_feeds(self):
+        '''
+        Return a list of RSS feeds to fetch for this profile. Each element of the list
+        must be a 2-element tuple of the form (title, url).
+        '''
+        raise NotImplementedError
+
+    @classmethod
+    def print_version(cls, url):
+        '''
+        Take a URL pointing to an article and return the URL pointing to the
+        print version of the article.
+        '''
+        return url
+
+    @classmethod
+    def get_browser(cls):
+        '''
+        Return a browser instance used to fetch documents from the web.
+
+        If your profile requires that you login first, override this method
+        in your subclass. See for example the nytimes profile.
+        '''
+        return browser()
+
+    ########################################################################
+    ###################### End of customizable portion ####################
+    ########################################################################
+
+
+    def __init__(self, username=None, password=None):
+        self.username = username
+        self.password = password
+        self.temp_dir = tempfile.mkdtemp(prefix=__appname__+'_')
+        self.browser = self.get_browser()
+        self.url = 'file:'+ ('' if iswindows else '//') + self.build_index()
+
+    def __del__(self):
+        import os, shutil
+        if os.path.isdir(self.temp_dir):
+            shutil.rmtree(self.temp_dir)
+
+    def build_index(self):
+        '''Build an RSS based index.html'''
+        import os
+        articles = self.parse_feeds()
+
+        def build_sub_index(title, items):
+            ilist = ''
+            li = u'<li><a href="%(url)s">%(title)s</a> <span style="font-size: x-small">[%(date)s]</span><br/>\n'+\
+                u'<div style="font-size:small; font-family:sans">%(description)s<br /></div></li>\n'
+            for item in items:
+                ilist += li%item
+            return u'''\
+            <html>
+            <body>
+            <h2>%(title)s</h2>
+            <ul>
+            %(items)s
+            </ul>
+            </body>
+            </html>
+            '''%dict(title=title, items=ilist.rstrip())
+
+        cnum = 0
+        clist = ''
+        categories = articles.keys()
+        categories.sort()
+        for category in categories:
+            cnum  += 1
+            cfile = os.path.join(self.temp_dir, 'category'+str(cnum)+'.html')
+            prefix = 'file:' if iswindows else ''
+            clist += u'<li><a href="%s">%s</a></li>\n'%(prefix+cfile, category)
+            src = build_sub_index(category, articles[category])
+            open(cfile, 'wb').write(src.encode('utf-8'))
+
+        src = '''\
+        <html>
+        <body>
+        <h1>%(title)s</h1>
+        <div style='text-align: right; font-weight: bold'>%(date)s</div>
+        <ul>
+        %(categories)s
+        </ul>
+        </body>
+        </html>
+        '''%dict(date=time.strftime('%a, %d %B, %Y', time.localtime()),
+                 categories=clist, title=self.title)
+        index = os.path.join(self.temp_dir, 'index.html')
+        open(index, 'wb').write(src.encode('utf-8'))
+        return index
+
+    def parse_feeds(self):
+        feeds = self.get_feeds()
+        articles = {}
+        for title, url in feeds:
+            try:
+                src = self.browser.open(url).read()
+            except Exception, err:
+                print 'Could not fetch feed: %s\nError: %s'%(url, err)
+                continue
+
+            articles[title] = []
+            soup = BeautifulStoneSoup(src)
+            for item in soup.findAll('item'):
+                try:
+                    pubdate = item.find('pubdate').string
+                    if not pubdate:
+                        continue
+                    pubdate = pubdate.replace('+0000', 'GMT')
+                    d = {
+                        'title'    : item.find('title').string,
+                        'url'      : self.print_version(item.find('guid').string),
+                        'timestamp': calendar.timegm(self.strptime(pubdate)),
+                        'date'     : pubdate
+                        }
+                    delta = time.time() - d['timestamp']
+                    if delta > self.oldest_article*3600*24:
+                        continue
+
+                except Exception, err:
+                    continue
+                try:
+                    desc = item.find('description')
+                    d['description'] = self.process_html_description(desc) if self.html_description else desc.string
+                except:
+                    d['description'] = ''
+                articles[title].append(d)
+            articles[title].sort(key=operator.itemgetter('timestamp'), reverse=True)
+            articles[title][self.max_articles_per_feed:] = []
+            for item in articles[title]:
+                item.pop('timestamp')
+            if not articles[title]:
+                articles.pop(title)
+        return articles
+
+    @classmethod
+    def process_html_description(cls, tag):
+        src = '\n'.join(tag.contents)
+        replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo' ]
+        for e in replaced_entities:
+            ent = '&'+e+';'
+            src = src.replace(ent, unichr(name2codepoint[e]))
+        return re.compile(r'<a.*?</a>', re.IGNORECASE|re.DOTALL).sub('', src)
+
+    DAY_MAP   = dict(Sun=0, Mon=1, Tue=2, Wed=3, Thu=4, Fri=5, Sat=6)
+    MONTH_MAP = dict(Jan=1, Feb=2, Mar=3, Apr=4, May=5, Jun=6, Jul=7, Aug=8, Sep=9, Oct=10, Nov=11, Dec=12)
+    FULL_MONTH_MAP = dict(January=1, February=2, March=3, April=4, May=5, June=6,
+                      July=7, August=8, September=9, October=10,
+                      November=11, December=12)
+
+    @classmethod
+    def strptime(cls, src):
+        src = src.strip().split()
+        src[0] = str(cls.DAY_MAP[src[0][:-1]])+','
+        try:
+            src[2] = str(cls.MONTH_MAP[src[2]])
+        except KeyError:
+            src[2] = str(cls.FULL_MONTH_MAP[src[2]])
+        return time.strptime(' '.join(src), '%w, %d %m %Y %H:%M:%S %Z')
+
+    def command_line_options(self):
+        args = []
+        args.append('--max-recursions='+str(self.max_recursions))
+        args.append('--delay='+str(self.delay))
+        args.append('--max-files='+str(self.max_files))
+        for i in self.match_regexps:
+            args.append('--match-regexp="'+i+'"')
+        for i in self.filter_regexps:
+            args.append('--filter-regexp="'+i+'"')
+        return args
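DefaultProfile.strptime exists because RSS pubDate values spell out English day and month names, which a fixed numeric time.strptime format cannot digest directly; the DAY_MAP/MONTH_MAP tables rewrite them to numbers first. A standalone sketch of the same transformation (the sample date string is invented):

import time, calendar

DAY_MAP   = dict(Sun=0, Mon=1, Tue=2, Wed=3, Thu=4, Fri=5, Sat=6)
MONTH_MAP = dict(Jan=1, Feb=2, Mar=3, Apr=4, May=5, Jun=6,
                 Jul=7, Aug=8, Sep=9, Oct=10, Nov=11, Dec=12)

def parse_pubdate(src):
    # 'Sun, 29 Jul 2007 10:30:00 GMT' -> time.struct_time (UTC)
    parts = src.strip().split()
    parts[0] = str(DAY_MAP[parts[0][:-1]]) + ','  # 'Sun,' -> '0,'
    parts[2] = str(MONTH_MAP[parts[2]])           # 'Jul'  -> '7'
    return time.strptime(' '.join(parts), '%w, %d %m %Y %H:%M:%S %Z')

# Epoch seconds, as used for the oldest_article cutoff in parse_feeds()
print calendar.timegm(parse_pubdate('Sun, 29 Jul 2007 10:30:00 GMT'))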
src/libprs500/ebooks/lrf/web/profiles/bbc.py (new file, 57 lines)
@@ -0,0 +1,57 @@
+##    Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
+##    This program is free software; you can redistribute it and/or modify
+##    it under the terms of the GNU General Public License as published by
+##    the Free Software Foundation; either version 2 of the License, or
+##    (at your option) any later version.
+##
+##    This program is distributed in the hope that it will be useful,
+##    but WITHOUT ANY WARRANTY; without even the implied warranty of
+##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+##    GNU General Public License for more details.
+##
+##    You should have received a copy of the GNU General Public License along
+##    with this program; if not, write to the Free Software Foundation, Inc.,
+##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+'''
+Fetch the BBC.
+'''
+import re
+
+from libprs500.ebooks.lrf.web.profiles import DefaultProfile
+from libprs500.ebooks.BeautifulSoup import BeautifulSoup
+
+class BBC(DefaultProfile):
+
+    title = 'The BBC'
+    max_recursions = 2
+    timefmt  = ' [%a, %d %b, %Y]'
+    no_stylesheets = True
+
+    preprocess_regexps = \
+        [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
+          [
+           # Remove footer from individual stories
+           (r'<div class=.footer.>.*?Published',
+            lambda match : '<p></p><div class="footer">Published'),
+           # Add some style info in place of disabled stylesheet
+           (r'<link.*?type=.text/css.*?>', lambda match :
+            '''<style type="text/css">
+                .headline {font-size: x-large;}
+                .fact { padding-top: 10pt  }
+                </style>'''),
+           ]
+          ]
+
+    def print_version(self, url):
+        return url.replace('http://', 'http://newsvote.bbc.co.uk/mpapps/pagetools/print/')
+
+    def get_feeds(self):
+        src = self.browser.open('http://news.bbc.co.uk/1/hi/help/3223484.stm').read()
+        soup = BeautifulSoup(src[src.index('<html'):])
+        feeds = []
+        ul =  soup.find('ul', attrs={'class':'rss'})
+        for link in ul.findAll('a'):
+            feeds.append((link.string, link['href']))
+        return feeds
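BBC.print_version rewrites an article URL by prefixing the BBC print gateway rather than appending a query string. For an invented article URL the rewrite behaves like this:

url = 'http://news.bbc.co.uk/2/hi/technology/1234567.stm'  # hypothetical article URL
print url.replace('http://', 'http://newsvote.bbc.co.uk/mpapps/pagetools/print/')
# -> http://newsvote.bbc.co.uk/mpapps/pagetools/print/news.bbc.co.uk/2/hi/technology/1234567.stm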
src/libprs500/ebooks/lrf/web/profiles/economist.py (new file, 81 lines)
@@ -0,0 +1,81 @@
|  | ##    Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net | ||||||
|  | ##    This program is free software; you can redistribute it and/or modify | ||||||
|  | ##    it under the terms of the GNU General Public License as published by | ||||||
|  | ##    the Free Software Foundation; either version 2 of the License, or | ||||||
|  | ##    (at your option) any later version. | ||||||
|  | ## | ||||||
|  | ##    This program is distributed in the hope that it will be useful, | ||||||
|  | ##    but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||||
|  | ##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||||
|  | ##    GNU General Public License for more details. | ||||||
|  | ## | ||||||
|  | ##    You should have received a copy of the GNU General Public License along | ||||||
|  | ##    with this program; if not, write to the Free Software Foundation, Inc., | ||||||
|  | ##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||||
|  | ''' | ||||||
|  | Fetch The Economist. | ||||||
|  | ''' | ||||||
|  | import re | ||||||
|  | 
 | ||||||
|  | from libprs500.ebooks.lrf.web.profiles import DefaultProfile | ||||||
|  | from libprs500.ebooks.BeautifulSoup import BeautifulSoup | ||||||
|  | 
 | ||||||
|  | class Economist(DefaultProfile): | ||||||
|  |      | ||||||
|  |     title = 'The Economist' | ||||||
|  |     timefmt = ' [%d %b %Y]' | ||||||
|  |     max_recursions = 3 | ||||||
|  |      | ||||||
|  |     TITLES = [ | ||||||
|  |           'The world this week', | ||||||
|  |           'Letters', | ||||||
|  |           'Briefings', | ||||||
|  |           'Special reports', | ||||||
|  |           'Britain', | ||||||
|  |           'Europe', | ||||||
|  |           'United States', | ||||||
|  |           'The Americas', | ||||||
|  |           'Middle East and Africa', | ||||||
|  |           'Asia', | ||||||
|  |           'International', | ||||||
|  |           'Business', | ||||||
|  |           'Finance and economics', | ||||||
|  |           'Science and technology', | ||||||
|  |           'Books and arts', | ||||||
|  |           'Indicators' | ||||||
|  |           ] | ||||||
|  |      | ||||||
|  |     preprocess_regexps = \ | ||||||
|  |         [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in  | ||||||
|  |             [ | ||||||
|  |              # Remove advert | ||||||
|  |              (r'<noscript.*?</noscript>', lambda match: ''), | ||||||
|  |              ] | ||||||
|  |             ] | ||||||
|  |      | ||||||
|  |     def __init__(self, username=None, password=None): | ||||||
|  |         DefaultProfile.__init__(self, username, password) | ||||||
|  |         self.browser = None # Needed, as otherwise there are timeouts while fetching the actual articles | ||||||
|  |      | ||||||
|  |     def print_version(self, url): | ||||||
|  |         return url.replace('displaystory', 'PrinterFriendly').replace('&fsrc=RSS', '') | ||||||
|  |      | ||||||
|  |     def get_feeds(self): | ||||||
|  |         src = self.browser.open('http://economist.com/rss/').read() | ||||||
|  |         soup = BeautifulSoup(src) | ||||||
|  |         feeds = [] | ||||||
|  |         for ul in soup.findAll('ul'): | ||||||
|  |             lis =  ul.findAll('li') | ||||||
|  |             try: | ||||||
|  |                 title, link = lis[0], lis[1] | ||||||
|  |             except IndexError: | ||||||
|  |                 continue | ||||||
|  |             title = title.string | ||||||
|  |             if title: | ||||||
|  |                 title = title.strip() | ||||||
|  |             if title not in self.__class__.TITLES: | ||||||
|  |                 continue | ||||||
|  |             a = link.find('a') | ||||||
|  |             feeds.append((title, a['href'].strip())) | ||||||
|  |              | ||||||
|  |         return feeds | ||||||
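For illustration, the rewrite performed by Economist.print_version, traced on a made-up article URL (the story id below is a placeholder, not a real article):

    url = 'http://economist.com/displaystory.cfm?story_id=123&fsrc=RSS'
    print url.replace('displaystory', 'PrinterFriendly').replace('&fsrc=RSS', '')
    # -> http://economist.com/PrinterFriendly.cfm?story_id=123
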
| @ -12,14 +12,27 @@ | |||||||
| ##    You should have received a copy of the GNU General Public License along | ##    You should have received a copy of the GNU General Public License along | ||||||
| ##    with this program; if not, write to the Free Software Foundation, Inc., | ##    with this program; if not, write to the Free Software Foundation, Inc., | ||||||
| ##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||||
| '''Logic to create a Newsweek HTML aggregator from RSS feeds''' | ''' | ||||||
|  | Profile to download Newsweek | ||||||
|  | ''' | ||||||
|  | from libprs500.ebooks.lrf.web.profiles import DefaultProfile | ||||||
| 
 | 
 | ||||||
| import tempfile, os, shutil | class Newsweek(DefaultProfile): | ||||||
|      |      | ||||||
| from libprs500.ebooks.lrf.web import build_index, parse_feeds |     title = 'Newsweek' | ||||||
| from libprs500 import __appname__, iswindows, browser |     max_recursions = 2 | ||||||
|  |     timefmt  = ' [%d %b %Y]' | ||||||
|  |     html_description = True | ||||||
|  |     oldest_article = 15 | ||||||
|      |      | ||||||
| RSS_FEEDS = [ |          | ||||||
|  |     def print_version(self, url): | ||||||
|  |         if not url.endswith('/'): | ||||||
|  |             url += '/' | ||||||
|  |         return url + 'output/print' | ||||||
|  |      | ||||||
|  |     def get_feeds(self): | ||||||
|  |         return [ | ||||||
|              ('Top News', 'http://feeds.newsweek.com/newsweek/TopNews',), |              ('Top News', 'http://feeds.newsweek.com/newsweek/TopNews',), | ||||||
|              ('Periscope', 'http://feeds.newsweek.com/newsweek/periscope'), |              ('Periscope', 'http://feeds.newsweek.com/newsweek/periscope'), | ||||||
|              ('Politics', 'http://feeds.newsweek.com/headlines/politics'), |              ('Politics', 'http://feeds.newsweek.com/headlines/politics'), | ||||||
| @ -34,26 +47,3 @@ RSS_FEEDS = [ | |||||||
|              ] |              ] | ||||||
|          |          | ||||||
|          |          | ||||||
| def print_version(url): |  | ||||||
|     if '?' in url: |  | ||||||
|         url = url[:url.index('?')] |  | ||||||
|     if not url.endswith('/'): |  | ||||||
|         url += '/' |  | ||||||
|     return url + 'output/print' |  | ||||||
| 
 |  | ||||||
| def initialize(profile): |  | ||||||
|     profile['temp dir'] = tempfile.mkdtemp(prefix=__appname__+'_') |  | ||||||
|     profile['browser'] = browser() |  | ||||||
|     articles = parse_feeds(RSS_FEEDS, profile['browser'], print_version,  |  | ||||||
|                            max_articles_per_feed=20, oldest_article=15,  |  | ||||||
|                            html_description=True) |  | ||||||
|     index = build_index('Newsweek', articles, profile['temp dir']) |  | ||||||
|     profile['url'] = 'file:'+ ('' if iswindows else '//') + index |  | ||||||
|     profile['timefmt'] = ' [%d %b %Y]' |  | ||||||
|     profile['max_recursions'] =  2 |  | ||||||
|     profile['title']          = 'Newsweek' |  | ||||||
|     profile['url'] = 'file:'+ ('' if iswindows else '//') +index |  | ||||||
| 
 |  | ||||||
| def finalize(profile): |  | ||||||
|     if os.path.isdir(profile['temp dir']): |  | ||||||
|         shutil.rmtree(profile['temp dir']) |  | ||||||
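The deleted initialize/finalize pair above wired everything by hand; under the new scheme those settings are class attributes and the driver does the wiring. A rough sketch of the equivalence, assuming the driver calls parse_feeds much as the deleted code did and that DefaultProfile exposes the configured browser as profile.browser (as the get_feeds implementations in this commit do):

    # Sketch only; the actual call site is not part of this diff.
    profile  = Newsweek()
    articles = parse_feeds(profile.get_feeds(), profile.browser,
                           profile.print_version,
                           max_articles_per_feed=20,
                           oldest_article=profile.oldest_article,
                           html_description=profile.html_description)
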
66 src/libprs500/ebooks/lrf/web/profiles/nytimes.py Normal file
							| @ -0,0 +1,66 @@ | |||||||
|  | ##    Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net | ||||||
|  | ##    This program is free software; you can redistribute it and/or modify | ||||||
|  | ##    it under the terms of the GNU General Public License as published by | ||||||
|  | ##    the Free Software Foundation; either version 2 of the License, or | ||||||
|  | ##    (at your option) any later version. | ||||||
|  | ## | ||||||
|  | ##    This program is distributed in the hope that it will be useful, | ||||||
|  | ##    but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||||
|  | ##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||||
|  | ##    GNU General Public License for more details. | ||||||
|  | ## | ||||||
|  | ##    You should have received a copy of the GNU General Public License along | ||||||
|  | ##    with this program; if not, write to the Free Software Foundation, Inc., | ||||||
|  | ##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||||
|  | ''' | ||||||
|  | Profile to download the New York Times | ||||||
|  | ''' | ||||||
|  | import re | ||||||
|  | 
 | ||||||
|  | from libprs500.ebooks.lrf.web.profiles import DefaultProfile | ||||||
|  | from libprs500.ebooks.BeautifulSoup import BeautifulSoup | ||||||
|  | 
 | ||||||
|  | class NYTimes(DefaultProfile): | ||||||
|  |      | ||||||
|  |     title   = 'The New York Times' | ||||||
|  |     timefmt = ' [%a, %d %b, %Y]' | ||||||
|  |     max_recursions = 2 | ||||||
|  |      | ||||||
|  |     preprocess_regexps = \ | ||||||
|  |             [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in  | ||||||
|  |               [ | ||||||
|  |                # Remove header bar | ||||||
|  |                (r'(<body.*?>).*?<h1', lambda match: match.group(1)+'<h1'), | ||||||
|  |                (r'<div class="articleTools">.*?</ul>', lambda match : ''), | ||||||
|  |                # Remove footer bar | ||||||
|  |                (r'<\!--  end \#article -->.*', lambda match : '</body></html>'), | ||||||
|  |                (r'<div id="footer">.*', lambda match : '</body></html>'), | ||||||
|  |                ] | ||||||
|  |               ] | ||||||
|  |                | ||||||
|  |     def browser(self): | ||||||
|  |         br = DefaultProfile.browser(self) | ||||||
|  |         if self.username is not None and self.password is not None: | ||||||
|  |             br.open('http://www.nytimes.com/auth/login') | ||||||
|  |             br.select_form(name='login') | ||||||
|  |             br['USERID']   = self.username | ||||||
|  |             br['PASSWORD'] = self.password | ||||||
|  |             br.submit() | ||||||
|  |         return br | ||||||
|  |      | ||||||
|  |     def get_feeds(self): | ||||||
|  |         src = self.browser.open('http://www.nytimes.com/services/xml/rss/index.html').read() | ||||||
|  |         soup = BeautifulSoup(src[src.index('<html'):]) | ||||||
|  |         feeds = [] | ||||||
|  |         for link in soup.findAll('link', attrs={'type':'application/rss+xml'}): | ||||||
|  |             if link['title'] not in ['NYTimes.com Homepage', 'Obituaries', 'Pogue\'s Posts',  | ||||||
|  |                                      'Dining & Wine', 'Home & Garden', 'Multimedia', | ||||||
|  |                                      'Most E-mailed Articles',  | ||||||
|  |                                      'Automobiles', 'Fashion & Style', 'Television News', | ||||||
|  |                                      'Education']: | ||||||
|  |                 feeds.append((link['title'], link['href']))             | ||||||
|  |          | ||||||
|  |         return feeds | ||||||
|  |      | ||||||
|  |     def print_version(self, url): | ||||||
|  |         return url + '?&pagewanted=print' | ||||||
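Finally, a hedged usage sketch for the NYTimes profile: with credentials supplied, browser() logs in through the site's login form before anything else is fetched. The account values below are placeholders, and the sketch assumes DefaultProfile hooks the browser() result up as self.browser, as get_feeds above expects:

    prof = NYTimes(username='reader@example.com', password='secret')
    for title, url in prof.get_feeds():
        print title, '->', prof.print_version(url)
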