From e07305bcacdc227ec36c16648e997ec438b823ff Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 14 Jul 2007 18:27:38 +0000
Subject: [PATCH] Implemented newsweek profile.

---
 src/libprs500/__init__.py                    |   2 +-
 src/libprs500/ebooks/lrf/web/convert_from.py |  11 +-
 src/libprs500/ebooks/lrf/web/newsweek.py     | 141 +++++++++++++++++++
 src/libprs500/ebooks/lrf/web/profiles.py     |  37 ++++-
 4 files changed, 185 insertions(+), 6 deletions(-)
 create mode 100644 src/libprs500/ebooks/lrf/web/newsweek.py

diff --git a/src/libprs500/__init__.py b/src/libprs500/__init__.py
index 1b2963a5fa..66127b3d38 100644
--- a/src/libprs500/__init__.py
+++ b/src/libprs500/__init__.py
@@ -13,7 +13,7 @@
 ## with this program; if not, write to the Free Software Foundation, Inc.,
 ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 ''' E-book management software'''
-__version__ = "0.3.70"
+__version__ = "0.3.71"
 __docformat__ = "epytext"
 __author__ = "Kovid Goyal <kovid@kovidgoyal.net>"
 __appname__ = 'libprs500'
diff --git a/src/libprs500/ebooks/lrf/web/convert_from.py b/src/libprs500/ebooks/lrf/web/convert_from.py
index 23a767599e..33696b5ccf 100644
--- a/src/libprs500/ebooks/lrf/web/convert_from.py
+++ b/src/libprs500/ebooks/lrf/web/convert_from.py
@@ -61,8 +61,7 @@ def option_parser():
 
 def fetch_website(options):
     tdir = tempfile.mkdtemp(prefix=__appname__+'_' )
-    options.dir = tdir
-    web2disk_setup_logger(options)
+    options.dir = tdir
     fetcher = create_fetcher(options)
     fetcher.preprocess_regexps = options.preprocess_regexps
     return fetcher.start_fetch(options.url), tdir
@@ -77,6 +76,7 @@ def create_lrf(htmlfile, options):
 def main(args=sys.argv):
     parser = option_parser()
     options, args = parser.parse_args(args)
+    web2disk_setup_logger(options)
     if len(args) > 2:
         parser.print_help()
         return 1
@@ -87,6 +87,9 @@ def main(args=sys.argv):
         return 1
     profile = profiles[args[1]] if len(args) == 2 else profiles['default']
 
+    if profile.has_key('initialize'):
+        profile['initialize'](profile)
+
     for opt in ('url', 'timeout', 'max_recursions', 'max_files', 'delay', 'no_stylesheets'):
         val = getattr(options, opt)
         if val is None:
@@ -103,7 +106,7 @@ def main(args=sys.argv):
     title = profile['title']
     if not title:
         title = urlsplit(options.url).netloc
-    options.title = title + time.strftime(' [%a %d %b %Y]', time.localtime())
+    options.title = title + time.strftime(profile['timefmt'], time.localtime())
 
     options.match_regexps += profile['match_regexps']
     options.preprocess_regexps = profile['preprocess_regexps']
@@ -111,6 +114,8 @@ def main(args=sys.argv):
 
     htmlfile, tdir = fetch_website(options)
     create_lrf(htmlfile, options)
+    if profile.has_key('finalize'):
+        profile['finalize'](profile)
     shutil.rmtree(tdir)
     return 0
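Note on the convert_from.py hunks above: profiles gain an optional lifecycle.
main() now calls profile['initialize'](profile) before fetching and
profile['finalize'](profile) after the LRF has been created, and the title
timestamp now comes from the per-profile 'timefmt' key instead of a hard-coded
format. A minimal sketch of that hook contract follows; the 'example' profile
and its URL are invented here purely for illustration:

    # Profiles are plain dicts; both hooks receive the dict itself, so
    # initialize() may rewrite keys such as 'url' before fetching starts.
    def example_initialize(profile):
        # Point the fetcher at locally generated content, as the Newsweek
        # profile below does with its aggregator page.
        profile['url'] = 'file:///tmp/example/index.html'

    def example_finalize(profile):
        # Clean up whatever initialize() created; runs after create_lrf().
        pass

    example_profile = {
        'title'      : 'Example',
        'timefmt'    : ' [%d %b %Y]',
        'initialize' : example_initialize,
        'finalize'   : example_finalize,
        }

Both keys are optional: main() tests for them with has_key() before calling.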
diff --git a/src/libprs500/ebooks/lrf/web/newsweek.py b/src/libprs500/ebooks/lrf/web/newsweek.py
new file mode 100644
index 0000000000..daf3a9641b
--- /dev/null
+++ b/src/libprs500/ebooks/lrf/web/newsweek.py
@@ -0,0 +1,141 @@
+## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2 of the License, or
+## (at your option) any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License along
+## with this program; if not, write to the Free Software Foundation, Inc.,
+## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+'''Logic to create a Newsweek HTML aggregator from RSS feeds'''
+
+import sys, urllib2, time, re, tempfile, os, shutil
+
+from libprs500 import __appname__
+from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup
+from htmlentitydefs import name2codepoint
+
+RSS_FEEDS = [
+    ('Cover Story', 'http://feeds.newsweek.com/CoverStory'),
+    ('Periscope', 'http://feeds.newsweek.com/newsweek/periscope'),
+    ('National News', 'http://feeds.newsweek.com/newsweek/NationalNews'),
+    ('World News', 'http://feeds.newsweek.com/newsweek/WorldNews'),
+    ('Iraq', 'http://feeds.newsweek.com/newsweek/iraq'),
+    ('Health', 'http://feeds.newsweek.com/sections/health'),
+    ('Society', 'http://feeds.newsweek.com/newsweek/society'),
+    ('Business', 'http://feeds.newsweek.com/newsweek/business'),
+    ('Science and Technology', 'http://feeds.newsweek.com/newsweek/TechnologyScience'),
+    ('Entertainment', 'http://feeds.newsweek.com/newsweek/entertainment'),
+    ('Tip Sheet', 'http://feeds.newsweek.com/newsweek/TipSheet/Highlights'),
+    ]
+
+BASE_TEMPLATE=\
+u'''
+<html>
+<body>
+<h1>Newsweek</h1>
+<b>%(date)s</b>
+<br/>
+<h2>Table of Contents</h2>
+<ul>
+%(toc)s
+</ul>
+<hr/>
+</body>
+</html>
+'''
+
+SECTION_TEMPLATE=\
+u'''
+<html>
+<body>
+<h1>%(title)s</h1>
+<br/>
+<h2>Table of Contents</h2>
+<ul>
+%(toc)s
+</ul>
+<hr/>
+</body>
+</html>
+'''
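The two templates above are filled with ordinary Python %-dict substitution.
A short usage sketch of how the index page is produced (the toc value here is
illustrative; create_aggregator() below builds the real one):

    import time
    toc = u'<li><a href="sec1.html">Cover Story</a></li>\n'
    src = BASE_TEMPLATE % dict(
        toc=toc,
        date=time.strftime('%d %B, %Y', time.localtime()))
    open('index.html', 'w').write(src.encode('utf8'))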
+
+_tdir = None
+def create_aggregator(sections):
+    '''Return aggregator HTML encoded in utf8'''
+    toc, sec = u'', 0
+    global _tdir
+    _tdir = tempfile.mkdtemp(prefix=__appname__)
+    for section in sections:
+        sec += 1
+        secfile = os.path.join(_tdir, 'sec%d.html'%(sec,))
+        title, contents = section
+        toc += '<li><a href="%s">%s</a></li>\n'%(secfile, title,)
+        stoc = u''
+        for item in contents:
+            desc = item['description'].strip()
+            stoc += '<li><a href="%(link)s">%(title)s</a>'%dict(link=item['link'], title=item['title'])
+            if desc:
+                stoc += '<div>%s</div>\n'%(desc,)
+            stoc += '</li>\n'
+        section = SECTION_TEMPLATE%dict(title=title, toc=stoc)
+        open(secfile, 'w').write(section.encode('utf8'))
+    index = os.path.join(_tdir, 'index.html')
+    src = BASE_TEMPLATE % dict(toc=toc, date=time.strftime('%d %B, %Y', time.localtime()))
+    open(index, 'w').write(src.encode('utf8'))
+    return index
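create_aggregator() expects a list of (section_title, items) pairs, where each
item is a dict with 'title', 'description' and 'link' keys, the exact shape
that get_contents() below returns. A hypothetical call with invented values:

    sections = [
        ('Cover Story', [
            {'title'      : 'A sample article',
             'description': 'One-sentence summary taken from the RSS item.',
             'link'       : 'http://www.newsweek.com/id/12345/print/1/displaymode/1098/'},
            ]),
        ]
    index = create_aggregator(sections)  # absolute path to index.html
    # The temporary directory holding index.html and sec*.html is kept in
    # _tdir and removed later by finalize().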
+
+def get_contents():
+    ''' Parse Newsweek RSS feeds to get links to all articles'''
+
+    def nstounicode(ns):
+        return unicode(str(ns), 'utf8')
+
+    def fix_link(link):
+        if '?' in link:
+            link = link[:link.index('?')]
+        return link + 'print/1/displaymode/1098/'
+
+    def process_description(tag):
+        src = '\n'.join(tag.contents)
+        replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo' ]
+        for e in replaced_entities:
+            ent = '&'+e+';'
+            src = src.replace(ent, unichr(name2codepoint[e]))
+        return re.compile(r'<a.*?</a>', re.IGNORECASE|re.DOTALL).sub('', src)
+
+    pages = []
+    for title, url in RSS_FEEDS:
+        soup = BeautifulStoneSoup(urllib2.urlopen(url))
+        contents = []
+        for item in soup.findAll('item'):
+            d = {
+                'title' : nstounicode(item.title.contents[0]),
+                'description': process_description(item.description),
+                'link': fix_link(nstounicode(item.guid.contents[0]))
+                }
+            if '<' in d['description']:
+                d['description'] = d['description'][:d['description'].index('<')]
+            contents.append(d)
+        pages.append((title, contents))
+    return pages
+
+
+def initialize(profile):
+    print 'Fetching feeds...',
+    sys.stdout.flush()
+    contents = get_contents()
+    print 'done'
+    index = create_aggregator(contents)
+    profile['url'] = 'file://'+index
+
+def finalize(profile):
+    global _tdir
+    shutil.rmtree(_tdir)
diff --git a/src/libprs500/ebooks/lrf/web/profiles.py b/src/libprs500/ebooks/lrf/web/profiles.py
index 4b363857cc..8ca0a228d6 100644
--- a/src/libprs500/ebooks/lrf/web/profiles.py
+++ b/src/libprs500/ebooks/lrf/web/profiles.py
@@ -13,9 +13,11 @@
 ## with this program; if not, write to the Free Software Foundation, Inc.,
 ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 '''Profiles for known websites.'''
-
 import time, re
 
+from libprs500.ebooks.lrf.web.newsweek import initialize as newsweek_initialize
+from libprs500.ebooks.lrf.web.newsweek import finalize as newsweek_finalize
+
 profiles = {
     'default' : {
         'url' : '', # The URL of the website
@@ -24,6 +26,7 @@ profiles = {
         'max_files' : 1000, # Maximum number of files to download
         'delay' : 0, # Delay between consecutive downloads
         'timeout' : 10, # Timeout for fetching files from server
+        'timefmt' : ' [%a %d %b %Y]',
         'no_stylesheets' : False, # Download stylesheets
         'match_regexps' : [], # List of regular expressions that determines which links to follow
         'filter_regexps' : [], # List of regular expressions that determines which links to ignore
@@ -78,7 +81,37 @@ profiles = {
                       ''),
                   ]
               ],
-        },
+        },
+
+    'newsweek' : {
+        'initialize' : newsweek_initialize,
+        'finalize' : newsweek_finalize,
+        'title' : 'Newsweek',
+        'timefmt' : ' [%d %b %Y]',
+        'no_stylesheets' : True,
+        'max_recursions' : 2,
+        'preprocess_regexps' :
+            [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
+              [
+                # Make fonts larger
+                (r'<body.*?>',
+                 lambda match : \
+                 '''<body style="font-size: x-large">'''
+                 ),
+              ]
+            ],
+        },
     }
 
 for key in profiles.keys():
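With the profile registered above, the pipeline is driven end to end by
convert_from.main(): the profile name is the positional argument, initialize()
swaps in the local aggregator page as the start URL, the web2disk fetcher
downloads it and the linked print-mode articles, and finalize() removes the
temporary directory. Assuming the web2lrf entry point that wraps
convert_from.main() (the script name comes from the surrounding codebase, not
this patch):

    web2lrf newsweek

or equivalently from Python:

    from libprs500.ebooks.lrf.web.convert_from import main
    main(['web2lrf', 'newsweek'])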