Implemented newsweek profile.

Kovid Goyal 2007-07-14 18:27:38 +00:00
parent 5e8fdf7b2b
commit e07305bcac
4 changed files with 185 additions and 6 deletions

View File: libprs500/__init__.py

@@ -13,7 +13,7 @@
 ## with this program; if not, write to the Free Software Foundation, Inc.,
 ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 ''' E-book management software'''
-__version__ = "0.3.70"
+__version__ = "0.3.71"
 __docformat__ = "epytext"
 __author__ = "Kovid Goyal <kovid@kovidgoyal.net>"
 __appname__ = 'libprs500'

View File

@@ -61,8 +61,7 @@ def option_parser():
 def fetch_website(options):
     tdir = tempfile.mkdtemp(prefix=__appname__+'_' )
-    options.dir = tdir
-    web2disk_setup_logger(options)
+    options.dir = tdir
     fetcher = create_fetcher(options)
     fetcher.preprocess_regexps = options.preprocess_regexps
     return fetcher.start_fetch(options.url), tdir
@@ -77,6 +76,7 @@ def create_lrf(htmlfile, options):
 def main(args=sys.argv):
     parser = option_parser()
     options, args = parser.parse_args(args)
+    web2disk_setup_logger(options)
     if len(args) > 2:
         parser.print_help()
         return 1
@@ -87,6 +87,9 @@ def main(args=sys.argv):
         return 1
     profile = profiles[args[1]] if len(args) == 2 else profiles['default']
+    if profile.has_key('initialize'):
+        profile['initialize'](profile)
     for opt in ('url', 'timeout', 'max_recursions', 'max_files', 'delay', 'no_stylesheets'):
         val = getattr(options, opt)
         if val is None:
@@ -103,7 +106,7 @@
     title = profile['title']
     if not title:
         title = urlsplit(options.url).netloc
-    options.title = title + time.strftime(' [%a %d %b %Y]', time.localtime())
+    options.title = title + time.strftime(profile['timefmt'], time.localtime())
     options.match_regexps += profile['match_regexps']
     options.preprocess_regexps = profile['preprocess_regexps']
@@ -111,6 +114,8 @@
     htmlfile, tdir = fetch_website(options)
     create_lrf(htmlfile, options)
+    if profile.has_key('finalize'):
+        profile['finalize'](profile)
     shutil.rmtree(tdir)
     return 0
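
The hunks above give a profile control over the conversion before and after the fetch. A minimal sketch of the hook flow that main() now implements, using a stand-in profile dict; the demo_* functions are hypothetical names, not part of this commit:

# Sketch of the initialize/finalize flow, Python 2 style as in the codebase.
def demo_initialize(profile):
    # Runs before fetching; may point the fetcher at generated content,
    # as the newsweek profile does by rewriting profile['url'].
    profile['url'] = 'file:///tmp/demo/index.html'

def demo_finalize(profile):
    # Runs after the LRF has been created; the place to delete temp files.
    pass

profile = {'initialize': demo_initialize, 'finalize': demo_finalize}

if profile.has_key('initialize'):   # same has_key idiom as in main()
    profile['initialize'](profile)
# ... fetch_website(options) and create_lrf(htmlfile, options) run here ...
if profile.has_key('finalize'):
    profile['finalize'](profile)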

View File: libprs500/ebooks/lrf/web/newsweek.py

@@ -0,0 +1,141 @@
## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''Logic to create a Newsweek HTML aggregator from RSS feeds'''

import sys, urllib2, time, re, tempfile, os, shutil

from libprs500 import __appname__
from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup
from htmlentitydefs import name2codepoint

RSS_FEEDS = [
    ('Cover Story', 'http://feeds.newsweek.com/CoverStory'),
    ('Periscope', 'http://feeds.newsweek.com/newsweek/periscope'),
    ('National News', 'http://feeds.newsweek.com/newsweek/NationalNews'),
    ('World News', 'http://feeds.newsweek.com/newsweek/WorldNews'),
    ('Iraq', 'http://feeds.newsweek.com/newsweek/iraq'),
    ('Health', 'http://feeds.newsweek.com/sections/health'),
    ('Society', 'http://feeds.newsweek.com/newsweek/society'),
    ('Business', 'http://feeds.newsweek.com/newsweek/business'),
    ('Science and Technology', 'http://feeds.newsweek.com/newsweek/TechnologyScience'),
    ('Entertainment', 'http://feeds.newsweek.com/newsweek/entertainment'),
    ('Tip Sheet', 'http://feeds.newsweek.com/newsweek/TipSheet/Highlights'),
]

BASE_TEMPLATE=\
u'''
<html>
<body>
<h1>Newsweek</h1>
<b align="right">%(date)s</b>
<p></p>
<h2>Table of Contents</h2>
<ul>
%(toc)s
</ul>
<br />
<hr />
</body>
</html>
'''

SECTION_TEMPLATE=\
u'''
<html>
<body>
<h2>%(title)s</h2>
<p></p>
<h2>Table of Contents</h2>
<ul>
%(toc)s
</ul>
<br />
<hr />
</body>
</html>
'''

_tdir = None

def create_aggregator(sections):
    '''Return aggregator HTML encoded in utf8'''
    toc, sec = u'', 0
    global _tdir
    _tdir = tempfile.mkdtemp(prefix=__appname__)
    for section in sections:
        sec += 1
        secfile = os.path.join(_tdir, 'sec%d.html'%(sec,))
        title, contents = section
        toc += '<li><a href="%s">%s</a></li>\n'%(secfile, title,)
        stoc = u''
        for item in contents:
            desc = item['description'].strip()
            stoc += '<li><a href="%(link)s">%(title)s</a><br />'%dict(link=item['link'], title=item['title'])
            if desc:
                stoc += '<div style="font-size:small; font-family:sans">%s</div>\n'%(desc,)
            stoc += '</li>\n'
        section = SECTION_TEMPLATE%dict(title=title, toc=stoc)
        open(secfile, 'w').write(section.encode('utf8'))
    index = os.path.join(_tdir, 'index.html')
    src = BASE_TEMPLATE % dict(toc=toc, date=time.strftime('%d %B, %Y', time.localtime()))
    open(index, 'w').write(src.encode('utf8'))
    return index

def get_contents():
    ''' Parse Newsweek RSS feeds to get links to all articles'''

    def nstounicode(ns):
        return unicode(str(ns), 'utf8')

    def fix_link(link):
        if '?' in link:
            link = link[:link.index('?')]
        return link + 'print/1/displaymode/1098/'

    def process_description(tag):
        src = '\n'.join(tag.contents)
        replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo' ]
        for e in replaced_entities:
            ent = '&'+e+';'
            src = src.replace(ent, unichr(name2codepoint[e]))
        return re.compile(r'<a.*?</a>', re.IGNORECASE|re.DOTALL).sub('', src)

    pages = []
    for title, url in RSS_FEEDS:
        soup = BeautifulStoneSoup(urllib2.urlopen(url))
        contents = []
        for item in soup.findAll('item'):
            d = {
                'title' : nstounicode(item.title.contents[0]),
                'description': process_description(item.description),
                'link': fix_link(nstounicode(item.guid.contents[0]))
                }
            if '&lt;' in d['description']:
                d['description'] = d['description'][:d['description'].index('&lt;')]
            contents.append(d)
        pages.append((title, contents))
    return pages

def initialize(profile):
    print 'Fetching feeds...',
    sys.stdout.flush()
    contents = get_contents()
    print 'done'
    index = create_aggregator(contents)
    profile['url'] = 'file://'+index

def finalize(profile):
    global _tdir
    shutil.rmtree(_tdir)
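
Taken together, initialize() writes a local two-level HTML tree (index.html linking to section pages linking to articles) and points the fetcher at it, which is why the profile below sets 'max_recursions' to 2. A quick illustration of the rewrite fix_link performs, with a made-up article URL; the function is reproduced here because it is nested inside get_contents() and not importable on its own:

# fix_link drops the tracking query string and requests the
# printer-friendly single-page variant of the article.
def fix_link(link):
    if '?' in link:
        link = link[:link.index('?')]
    return link + 'print/1/displaymode/1098/'

print fix_link('http://www.newsweek.com/id/32405/?from=rss')
# -> http://www.newsweek.com/id/32405/print/1/displaymode/1098/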

View File

@@ -13,9 +13,11 @@
 ## with this program; if not, write to the Free Software Foundation, Inc.,
 ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 '''Profiles for known websites.'''
 import time, re
+from libprs500.ebooks.lrf.web.newsweek import initialize as newsweek_initialize
+from libprs500.ebooks.lrf.web.newsweek import finalize as newsweek_finalize

 profiles = {
     'default' : {
         'url' : '', # The URL of the website
@@ -24,6 +26,7 @@ profiles = {
         'max_files' : 1000, # Maximum number of files to download
         'delay' : 0, # Delay between consecutive downloads
         'timeout' : 10, # Timeout for fetching files from server
+        'timefmt' : ' [%a %d %b %Y]',
         'no_stylesheets' : False, # Download stylesheets
         'match_regexps' : [], # List of regular expressions that determines which links to follow
         'filter_regexps' : [], # List of regular expressions that determines which links to ignore
@@ -78,7 +81,37 @@ profiles = {
             '<style type="text/css">.headline {font-size: x-large;}</style>'),
             ]
         ],
     },
+    'newsweek' : {
+        'initialize' : newsweek_initialize,
+        'finalize' : newsweek_finalize,
+        'title' : 'Newsweek',
+        'timefmt' : ' [%d %b %Y]',
+        'no_stylesheets' : True,
+        'max_recursions' : 2,
+        'preprocess_regexps' :
+            [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
+              [
+                # Make fonts larger
+                (r'<style.*?\.copyright.*?</style>',
+                 lambda match : \
+                 '''<style type="text/css">'''
+                 '''updateTime{font:small Arial;color:#000000;}'''
+                 '''.credit{font:small Arial;color:#999999;}'''
+                 '''.head{font:bold 18pt x-large;color:#CC0000;}'''
+                 '''.abstract{font:14pt large Verdana;color:#000000;}'''
+                 '''.title{font:bold;color:#000000;}'''
+                 '''.source{font:bold small Verdana;color:#CC0000;}'''
+                 '''.footerLink{font:bold Verdana;color:#000000;}'''
+                 '''.caption{font: Verdana;color:#000000;}'''
+                 '''.textBodyBlack, .copyright{font: Verdana;color:#000000;}'''
+                 '''.copyright{font-style:italic;}'''
+                 '''</style>'''
+                ),
+              ]
+            ],
+    },
 }
for key in profiles.keys():
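
The diff is cut off at this final loop; its body is not shown above. A plausible sketch of what such a loop would do, assuming it back-fills every profile with the 'default' values so that lookups like profile['timefmt'] and profile['match_regexps'] in convert_from.py always succeed; this reconstruction is an assumption, not part of the visible diff:

# Assumed body of the truncated loop: merge each profile over the defaults
# so every key consumed by main() is guaranteed to be present.
for key in profiles.keys():
    if key == 'default':
        continue
    merged = profiles['default'].copy()  # start from the default settings
    merged.update(profiles[key])         # profile-specific keys win
    profiles[key] = merged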