New BBC profile.

2026-05-31 02:55:19 -04:00 · 2007-08-18 17:06:00 +00:00
parent ef0ec5bd71
commit 684d03da1f
5 changed files with 207 additions and 138 deletions
@@ -12,3 +12,88 @@
 ##    You should have received a copy of the GNU General Public License along
 ##    with this program; if not, write to the Free Software Foundation, Inc.,
 ##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+import os, time, calendar, operator
+
+from libprs500 import iswindows
+from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup
+
+def parse_feeds(feeds, browser, print_version, max_articles_per_feed=10):
+    '''
+    Fetch RSS feeds and return the most recent articles of each one.
+
+    feeds is an iterable of (title, url) pairs. Each url is fetched with
+    browser.open(url).read() and every <item> found in it becomes a dictionary
+    with the keys 'title', 'url', 'date' and 'description', where 'url' is
+    print_version applied to the guid of the item. Items whose pubdate, title
+    or guid cannot be read or parsed are skipped.
+
+    Returns a dictionary mapping each feed title to a list of at most
+    max_articles_per_feed article dictionaries, newest first.
+    '''
+    articles = {}
+    for title, url in feeds:
+        src = browser.open(url).read()
+        articles[title] = []
+        soup = BeautifulStoneSoup(src)
+        for item in soup.findAll('item'):
+            try:
+                pubdate = item.find('pubdate').string
+                if not pubdate:
+                    continue
+                # The format below uses %Z, so the numeric offset is replaced
+                # by the zone name before parsing
+                pubdate = pubdate.replace('+0000', 'GMT')
+                d = {
+                    'title'    : item.find('title').string,
+                    'url'      : print_version(item.find('guid').string),
+                    'timestamp': calendar.timegm(time.strptime(pubdate,
+                                                    '%a, %d %b %Y %H:%M:%S %Z')),
+                    'date'     : pubdate
+                    }
+            except Exception:
+                # A malformed item must not abort the whole feed. Catching
+                # Exception instead of using a bare except lets
+                # KeyboardInterrupt and SystemExit propagate.
+                continue
+            try:
+                # .string can be None; fall back to '' so that the value is
+                # always a string
+                d['description'] = item.find('description').string or ''
+            except Exception:
+                d['description'] = ''
+            articles[title].append(d)
+        articles[title].sort(key=operator.itemgetter('timestamp'), reverse=True)
+        del articles[title][max_articles_per_feed:]
+        for item in articles[title]:
+            # The timestamp was only needed for sorting
+            item.pop('timestamp')
+    return articles
+
+
+def build_index(title, articles, dir):
+    '''
+    Build an RSS based index.html
+
+    One categoryN.html page is written to the directory dir for every key of
+    articles (in sorted order), followed by an index.html that links to those
+    pages and carries title as its heading. Every value of articles is a list
+    of dictionaries with the keys 'url', 'title', 'date' and 'description'.
+    All pages are written UTF-8 encoded.
+
+    Returns the path to index.html.
+    '''
+
+    def write_page(path, src):
+        # Close the file deterministically instead of leaving it to the
+        # garbage collector
+        f = open(path, 'wb')
+        try:
+            f.write(src.encode('utf-8'))
+        finally:
+            f.close()
+
+    def build_sub_index(title, items):
+        ilist = ''
+        li = u'<li><a href="%(url)s">%(title)s</a> <span style="font-size: x-small">[%(date)s]</span><br/>\n'+\
+            u'<div style="font-size:small; font-family:sans">%(description)s<br /></div></li>\n'
+        for item in items:
+            ilist += li%item
+        return u'''\
+        <html>
+        <body>
+        <h2>%(title)s</h2>
+        <ul>
+        %(items)s
+        </ul>
+        </body>
+        </html>
+        '''%dict(title=title, items=ilist.rstrip())
+
+    cnum = 0
+    clist = ''
+    categories = articles.keys()
+    categories.sort()
+    for category in categories:
+        cnum  += 1
+        cfile = os.path.join(dir, 'category'+str(cnum)+'.html')
+        prefix = 'file:' if iswindows else ''
+        clist += u'<li><a href="%s">%s</a></li>\n'%(prefix+cfile, category)
+        write_page(cfile, build_sub_index(category, articles[category]))
+
+    # The heading is taken from the title argument so that every profile gets
+    # its own name on the index page
+    src = '''\
+    <html>
+    <body>
+    <h1>%(title)s</h1>
+    <div style='text-align: right; font-weight: bold'>%(date)s</div>
+    <ul>
+    %(categories)s
+    </ul>
+    </body>
+    </html>
+    '''%dict(title=title, date=time.strftime('%a, %d %B, %Y', time.localtime()), categories=clist)
+    index = os.path.join(dir, 'index.html')
+    write_page(index, src)
+    return index
@@ -0,0 +1,53 @@
+##    Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
+##    This program is free software; you can redistribute it and/or modify
+##    it under the terms of the GNU General Public License as published by
+##    the Free Software Foundation; either version 2 of the License, or
+##    (at your option) any later version.
+##
+##    This program is distributed in the hope that it will be useful,
+##    but WITHOUT ANY WARRANTY; without even the implied warranty of
+##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+##    GNU General Public License for more details.
+##
+##    You should have received a copy of the GNU General Public License along
+##    with this program; if not, write to the Free Software Foundation, Inc.,
+##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+
+import tempfile, shutil, os
+from libprs500.ebooks.lrf.web import build_index, parse_feeds
+
+RSS = 'http://news.bbc.co.uk/1/hi/help/3223484.stm'
+
+from libprs500 import __appname__, iswindows, browser
+from libprs500.ebooks.BeautifulSoup import BeautifulSoup
+
+
+def get_feeds(browser):
+    '''
+    Return a list of (link text, href) pairs, one for every link inside the
+    <ul class="rss"> element of the page at RSS.
+    '''
+    page = browser.open(RSS).read()
+    # Everything in front of the opening <html tag is dropped before parsing
+    start = page.index('<html')
+    rss_list = BeautifulSoup(page[start:]).find('ul', attrs={'class':'rss'})
+    return [(anchor.string, anchor['href']) for anchor in rss_list.findAll('a')]
+
+def initialize(profile):
+    '''
+    Prepare profile for a download of the BBC feeds.
+
+    Creates a temporary directory and a browser, builds an index of the
+    current articles inside that directory and stores the location of the
+    index together with the conversion settings in profile.
+    '''
+    def print_version(url):
+        # Map an article URL to the URL of its printer friendly version
+        return url.replace('http://', 'http://newsvote.bbc.co.uk/mpapps/pagetools/print/')
+
+    tdir = tempfile.mkdtemp(prefix=__appname__+'_')
+    # Recorded first so that finalize can remove the directory
+    profile['temp dir'] = tdir
+    br = browser()
+    profile['browser'] = br
+    articles = parse_feeds(get_feeds(br), br, print_version)
+    index = build_index('The BBC', articles, tdir)
+    if iswindows:
+        scheme = 'file:'
+    else:
+        scheme = 'file://'
+    profile['url'] = scheme + index
+    profile['timefmt'] = ' [%a, %d %b, %Y]'
+    profile['max_recursions'] = 2
+    profile['title'] = 'The BBC'
+    profile['no_stylesheets'] = True
+    
+def finalize(profile):
+    '''Delete the temporary directory recorded in profile, if it still exists.'''
+    tdir = profile['temp dir']
+    if os.path.isdir(tdir):
+        shutil.rmtree(tdir)
+
+
+
+    
@@ -79,46 +79,53 @@ def create_lrf(htmlfile, options, logger):
    process_file(htmlfile, options, logger)

 def process_profile(args, options, logger=None):
-    if logger is None:
-        level = logging.DEBUG if options.verbose else logging.INFO
-        logger = logging.getLogger('web2lrf')
-        setup_cli_handlers(logger, level)
-    if len(args) == 2:
-        if not profiles.has_key(args[1]):
-            raise CommandLineError('Unknown profile: %s\nValid profiles: %s'%(args[1], profiles.keys()))
-    profile = profiles[args[1]] if len(args) == 2 else profiles['default']
-    profile['username'] = options.username
-    profile['password'] = options.password
-    if profile.has_key('initialize'):
-        profile['initialize'](profile)
-    if profile.has_key('browser'):
-        options.browser = profile['browser']
-    
-    for opt in ('url', 'timeout', 'max_recursions', 'max_files', 'delay', 'no_stylesheets'):
-        val = getattr(options, opt)
-        if val is None:
-            setattr(options, opt, profile[opt])
+    '''
+    Fetch the website described by a profile and convert it to LRF.
+
+    args[1] names the profile when args has exactly two entries, otherwise
+    the 'default' profile is used. Options that are unset on options are
+    filled in from the profile. The 'finalize' hook of the profile and the
+    removal of the download directory run even when an error occurs.
+
+    Raises CommandLineError for an unknown profile name or when no URL is
+    available.
+    '''
+    tdir = None
+    # Bound before the try block so that the finally clause can always refer
+    # to it
+    profile = None
+    try:
+        if logger is None:
+            level = logging.DEBUG if options.verbose else logging.INFO
+            logger = logging.getLogger('web2lrf')
+            setup_cli_handlers(logger, level)
+        if len(args) == 2:
+            if not profiles.has_key(args[1]):
+                raise CommandLineError('Unknown profile: %s\nValid profiles: %s'%(args[1], profiles.keys()))
+        profile = profiles[args[1]] if len(args) == 2 else profiles['default']
+        profile['username'] = options.username
+        profile['password'] = options.password
+        if profile.has_key('initialize'):
+            profile['initialize'](profile)
+        if profile.has_key('browser'):
+            options.browser = profile['browser']
        
-    if not options.url:
-        raise CommandLineError('You must specify the --url option or a profile from one of: %s', available_profiles)
-    
-    if not options.title:
-        title = profile['title']
-        if not title:
-            title = urlsplit(options.url).netloc
-        options.title = title + time.strftime(profile['timefmt'], time.localtime())
-    
-    options.match_regexps += profile['match_regexps']
-    options.preprocess_regexps = profile['preprocess_regexps']
-    options.filter_regexps += profile['filter_regexps']
-    if len(args) == 2 and args[1] != 'default':
-        options.anchor_ids = False
-    
-    htmlfile, tdir = fetch_website(options, logger)
-    create_lrf(htmlfile, options, logger)
-    if profile.has_key('finalize'):
-        profile['finalize'](profile)
-    shutil.rmtree(tdir)
+        for opt in ('url', 'timeout', 'max_recursions', 'max_files', 'delay', 'no_stylesheets'):
+            val = getattr(options, opt)
+            if val is None:
+                setattr(options, opt, profile[opt])
+
+        if not options.url:
+            options.url = profile['url']
+
+        if not options.url:
+            raise CommandLineError('You must specify the --url option or a profile from one of: %s'%(available_profiles,))
+
+        if not options.title:
+            title = profile['title']
+            if not title:
+                title = urlsplit(options.url).netloc
+            options.title = title + time.strftime(profile['timefmt'], time.localtime())
+
+        options.match_regexps += profile['match_regexps']
+        options.preprocess_regexps = profile['preprocess_regexps']
+        options.filter_regexps += profile['filter_regexps']
+        if len(args) == 2 and args[1] != 'default':
+            options.anchor_ids = False
+
+        htmlfile, tdir = fetch_website(options, logger)
+        create_lrf(htmlfile, options, logger)
+    finally:
+        # profile is still None when the failure happened before a profile
+        # was selected (e.g. an unknown profile name). There is nothing to
+        # finalize then, and the original error must not be replaced by an
+        # UnboundLocalError raised from this clause.
+        if profile is not None and profile.has_key('finalize'):
+            profile['finalize'](profile)
+        if tdir and os.path.isdir(tdir):
+            shutil.rmtree(tdir)
    

 def main(args=sys.argv, logger=None):
@@ -13,10 +13,11 @@
 ##    with this program; if not, write to the Free Software Foundation, Inc.,
 ##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 '''New York Times from RSS feeds.'''
-import time, tempfile, os, shutil, calendar, operator
+import os, tempfile, shutil

 from libprs500 import __appname__, iswindows, browser
-from libprs500.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup
+from libprs500.ebooks.BeautifulSoup import BeautifulSoup
+from libprs500.ebooks.lrf.web import build_index, parse_feeds

 RSS = 'http://www.nytimes.com/services/xml/rss/index.html'
 LOGIN = 'http://www.nytimes.com/auth/login'
@@ -36,96 +37,21 @@ def get_feeds(browser):
    
    return feeds

-def parse_feeds(feeds, browser, max_articles_per_feed=10):
-    articles = {}
-    for title, url in feeds:
-        src = browser.open(url).read()
-        articles[title] = []
-        soup = BeautifulStoneSoup(src)
-        for item in soup.findAll('item'):
-            try:
-                pubdate = item.find('pubdate').string
-                if not pubdate:
-                    continue
-                pubdate = pubdate.replace('+0000', 'GMT')
-                d = { 
-                    'title'    : item.find('title').string,                 
-                    'url'      : item.find('guid').string+'?&pagewanted=print',
-                    'timestamp': calendar.timegm(time.strptime(pubdate, 
-                                                    '%a, %d %b %Y %H:%M:%S %Z')),
-                    'date'     : pubdate
-                    }
-            except:
-                continue
-            try:
-                d['description'] = item.find('description').string
-            except:
-                d['description'] = ''
-            articles[title].append(d)
-        articles[title].sort(key=operator.itemgetter('timestamp'), reverse=True)
-        articles[title][max_articles_per_feed:] = []
-        for item in articles[title]:
-            item.pop('timestamp')
-    return articles
-
-def build_index(articles, dir):
-    
-        def build_sub_index(title, items):
-            ilist = ''
-            li = u'<li><a href="%(url)s">%(title)s</a> <span style="font-size: x-small">[%(date)s]</span><br/>\n'+\
-                u'<div style="font-size:small; font-family:sans">%(description)s<br /></div></li>\n'
-            for item in items:
-                ilist += li%item
-            return u'''\
-            <html>
-            <body>
-            <h2>%(title)s</h2>
-            <ul>
-            %(items)s
-            </ul>
-            </body>
-            </html>
-            '''%dict(title=title, items=ilist.rstrip())        
-        
-        cnum = 0
-        clist = ''
-        categories = articles.keys()
-        categories.sort()
-        for category in categories:
-            cnum  += 1
-            cfile = os.path.join(dir, 'category'+str(cnum)+'.html')
-            prefix = 'file:' if iswindows else ''
-            clist += u'<li><a href="%s">%s</a></li>\n'%(prefix+cfile, category)
-            src = build_sub_index(category, articles[category])
-            open(cfile, 'wb').write(src.encode('utf-8'))        
-        
-        src = '''\
-        <html>
-        <body>
-        <h1>The New York Times</h1>
-        <div style='text-align: right; font-weight: bold'>%(date)s</div>
-        <ul>
-        %(categories)s
-        </ul>
-        </body>
-        </html>
-        '''%dict(date=time.strftime('%a, %d %B, %Y', time.localtime()), categories=clist)
-        index = os.path.join(dir, 'index.html')
-        open(index, 'wb').write(src.encode('utf-8'))
-        return index
-    
-             
 def initialize(profile):
+    '''
+    Prepare profile for a download of the New York Times feeds: log in,
+    build an index of the current articles inside a new temporary directory
+    and store the location of the index and the conversion settings in
+    profile.
+    '''
    profile['temp dir'] = tempfile.mkdtemp(prefix=__appname__+'_')
    profile['browser'] = login(profile)
    feeds = get_feeds(profile['browser'])
-    articles = parse_feeds(feeds, profile['browser'])
-    index = build_index(articles, profile['temp dir'])
+    # The third argument maps the guid of an article to the URL that is fetched
+    articles = parse_feeds(feeds, profile['browser'], lambda x: x + '?&pagewanted=print')
+    index = build_index('The New York Times', articles, profile['temp dir'])
    profile['url'] = 'file:'+ ('' if iswindows else '//') + index
+    profile['timefmt'] = ' [%a, %d %b, %Y]'
+    profile['max_recursions'] =  2                 
+    profile['title']          = 'The New York Times'
    
    
 def finalize(profile):
+    '''Delete the temporary directory recorded in profile, if it still exists.'''
-    shutil.rmtree(profile['temp dir'])
+    if os.path.isdir(profile['temp dir']):
+        shutil.rmtree(profile['temp dir'])
 

 def login(profile):
@@ -19,6 +19,8 @@ from libprs500.ebooks.lrf.web.newsweek import initialize as newsweek_initialize
 from libprs500.ebooks.lrf.web.newsweek import finalize as newsweek_finalize
 from libprs500.ebooks.lrf.web.nytimes import initialize as nytimes_initialize
 from libprs500.ebooks.lrf.web.nytimes import finalize as nytimes_finalize
+from libprs500.ebooks.lrf.web.bbc import initialize as bbc_initialize
+from libprs500.ebooks.lrf.web.bbc import finalize as bbc_finalize


 profiles = {
@@ -42,9 +44,7 @@ profiles = {
            'nytimes' : {
                         'initialize'          : nytimes_initialize,
                         'finalize'            : nytimes_finalize,
-                         'timefmt'             :  ' [%a, %d %b, %Y]',
-                         'max_recursions'      : 2,                         
-                         'title'             : 'The New York Times',
+                         
                         'preprocess_regexps' :
                         [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in 
                          [
@@ -59,26 +59,24 @@ profiles = {
                         },
                         
            'bbc'     : {
-                         'title'             : 'The BBC',
-                         'no_stylesheets'    : True,
-                         'preprocess_regexps' :
+                          'initialize'          : bbc_initialize,
+                          'finalize'            : bbc_finalize,
+                          'preprocess_regexps' :
                         [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in 
                          [
-                           # Remove help link and replace by title
-                           (r'<a .*?alt=.Click here for information about this service.*?</a>', 
-                            lambda match: '<h1>The BBC</h1>\n<p align="right"><b>%s</b></p>'%(time.strftime('%a %d %b %Y', time.localtime()),)),
-                           # Blank line before categories
-                           (r'<b>\s*BBC', lambda match: '<p></p><b>BBC'),
                           # Remove footer from individual stories
                           (r'<div class=.footer.>.*?Published', 
                            lambda match : '<p></p><div class="footer">Published'),
                           # Add some style info in place of disabled stylesheet
-                           (r'<link.*?type=.text/css.*?>',
-                            '<style type="text/css">.headline {font-size: x-large;}</style>'),
+                           (r'<link.*?type=.text/css.*?>', lambda match :
+                            '''<style type="text/css">
+                                .headline {font-size: x-large;}
+                                .ibox { padding: 10pt 10pt 10pt 10pt } 
+                                </style>'''),
                           ]
                          ],
-                         },
-                         
+                          },
+            
            'newsweek' : {
                          'initialize'          : newsweek_initialize,
                          'finalize'            : newsweek_finalize,