Kovid Goyal 2007-10-07 20:12:25 +00:00
parent 4477a78a5b
commit b1dea424fe
4 changed files with 45 additions and 112 deletions

View File

@@ -13,12 +13,32 @@
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
import os, time, calendar, operator
import os, time, calendar, operator, re
from libprs500 import iswindows
from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup
from htmlentitydefs import name2codepoint
def parse_feeds(feeds, browser, print_version, max_articles_per_feed=10):
def process_html_description(tag):
src = '\n'.join(tag.contents)
replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo' ]
for e in replaced_entities:
ent = '&'+e+';'
src = src.replace(ent, unichr(name2codepoint[e]))
return re.compile(r'<a.*?</a>', re.IGNORECASE|re.DOTALL).sub('', src)
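# For illustration (hypothetical input): if the description tag's contents were
#   'Read &ldquo;more&rdquo; <a href="http://example.com">here</a>'
# the entity replacement above turns &ldquo;/&rdquo; into curly quotes and the final
# regexp strips the whole <a>...</a> element, leaving u'Read \u201cmore\u201d '.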
def parse_feeds(feeds, browser, print_version,
max_articles_per_feed=10,
html_description=False,
oldest_article=7):
'''
@param print_version: Callable that takes a url string and returns the url of
the printable version of the article pointed to by the original url.
@param max_articles_per_feed: Maximum number of articles to download from each feed
@param html_description: If True the articles' descriptions are processed as HTML
@param oldest_article: A number in days. No articles older than now - oldest_article
will be downloaded.
'''
articles = {}
for title, url in feeds:
src = browser.open(url).read()
@@ -37,10 +57,14 @@ def parse_feeds(feeds, browser, print_version, max_articles_per_feed=10):
'%a, %d %b %Y %H:%M:%S %Z')),
'date' : pubdate
}
delta = time.time() - d['timestamp']
if delta > oldest_article*3600*24:
continue
except:
continue
try:
d['description'] = item.find('description').string
desc = item.find('description')
d['description'] = process_html_description(desc) if html_description else desc.string
except:
d['description'] = ''
articles[title].append(d)
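A minimal, hedged sketch of how the extended parse_feeds signature might be called, reusing the Newsweek feed list and the libprs500 browser helper from the files below; example_print_version is a made-up placeholder, not part of this commit:

from libprs500 import browser
from libprs500.ebooks.lrf.web import parse_feeds

feeds = [('Cover Story', 'http://feeds.newsweek.com/CoverStory')]

def example_print_version(url):
    return url + '?print=1' # hypothetical rule mapping an article url to its printable version

articles = parse_feeds(feeds, browser(), example_print_version,
                       max_articles_per_feed=20,
                       html_description=True, # descriptions go through process_html_description
                       oldest_article=7)      # items older than 7*24 hours are skipped
for d in articles['Cover Story']:
    print d['date'], d['description'][:60]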

View File

@@ -76,7 +76,6 @@ def initialize(profile):
profile.pop('browser') # Needed because, for some reason, reusing the same browser instance causes timeouts
def finalize(profile):
return
if os.path.isdir(profile['temp dir']):
shutil.rmtree(profile['temp dir'])

View File

@@ -16,9 +16,8 @@
import sys, urllib2, time, re, tempfile, os, shutil
from libprs500 import __appname__, iswindows
from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup
from htmlentitydefs import name2codepoint
from libprs500.ebooks.lrf.web import build_index, parse_feeds
from libprs500 import __appname__, iswindows, browser
RSS_FEEDS = [
('Cover Story', 'http://feeds.newsweek.com/CoverStory'),
@@ -34,110 +33,24 @@ RSS_FEEDS = [
('Tip Sheet', 'http://feeds.newsweek.com/newsweek/TipSheet/Highlights'),
]
BASE_TEMPLATE=\
u'''
<html>
<body>
<h1>Newsweek</h1>
<b align="right">%(date)s</b>
<p></p>
<h2>Table of Contents</h2>
<ul>
%(toc)s
</ul>
<br />
<hr />
</body>
</html>
'''
SECTION_TEMPLATE=\
u'''
<html>
<body>
<h2>%(title)s</h2>
<p></p>
<h2>Table of Contents</h2>
<ul>
%(toc)s
</ul>
<br />
<hr />
</body>
</html>
'''
_tdir = None
def create_aggregator(sections):
'''Return aggregator HTML encoded in utf8'''
toc, sec = u'', 0
global _tdir
_tdir = tempfile.mkdtemp(prefix=__appname__)
for section in sections:
sec += 1
secfile = os.path.join(_tdir, 'sec%d.html'%(sec,))
title, contents = section
fix = 'file:' if iswindows else ''
toc += '<li><a href="%s">%s</a></li>\n'%(fix+secfile, title,)
stoc = u''
for item in contents:
desc = item['description'].strip()
stoc += '<li><a href="%(link)s">%(title)s</a><br />'%dict(link=item['link'], title=item['title'])
if desc:
stoc += '<div style="font-size:small; font-family:sans">%s</div>\n'%(desc,)
stoc += '</li>\n'
section = SECTION_TEMPLATE%dict(title=title, toc=stoc)
open(secfile, 'w').write(section.encode('utf8'))
index = os.path.join(_tdir, 'index.html')
src = BASE_TEMPLATE % dict(toc=toc, date=time.strftime('%d %B, %Y', time.localtime()))
open(index, 'w').write(src.encode('utf8'))
return index
def get_contents():
''' Parse Newsweek RSS feeds to get links to all articles'''
def nstounicode(ns):
return unicode(str(ns), 'utf8')
def fix_link(link):
if '?' in link:
link = link[:link.index('?')]
return link + 'print/1/displaymode/1098/'
def process_description(tag):
src = '\n'.join(tag.contents)
replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo' ]
for e in replaced_entities:
ent = '&'+e+';'
src = src.replace(ent, unichr(name2codepoint[e]))
return re.compile(r'<a.*?</a>', re.IGNORECASE|re.DOTALL).sub('', src)
pages = []
for title, url in RSS_FEEDS:
soup = BeautifulStoneSoup(urllib2.urlopen(url))
contents = []
for item in soup.findAll('item'):
d = {
'title' : nstounicode(item.title.contents[0]),
'description': process_description(item.description),
'link': fix_link(nstounicode(item.guid.contents[0]))
}
if '&lt;' in d['description']:
d['description'] = d['description'][:d['description'].index('&lt;')]
contents.append(d)
pages.append((title, contents))
return pages
def print_version(url):
if '?' in url:
url = url[:url.index('?')]
return url + 'print/1/displaymode/1098/'
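# For example (hypothetical article url): 'http://www.newsweek.com/id/12345/?from=rss'
# is truncated at the '?' and becomes
# 'http://www.newsweek.com/id/12345/print/1/displaymode/1098/'.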
def initialize(profile):
print 'Fetching feeds...',
sys.stdout.flush()
contents = get_contents()
print 'done'
index = create_aggregator(contents)
profile['temp dir'] = tempfile.mkdtemp(prefix=__appname__+'_')
profile['browser'] = browser()
articles = parse_feeds(RSS_FEEDS, profile['browser'], print_version,
max_articles_per_feed=20, html_description=True)
index = build_index('Newsweek', articles, profile['temp dir'])
profile['url'] = 'file:'+ ('' if iswindows else '//') + index
profile['timefmt'] = ' [%d %b %Y]'
profile['max_recursions'] = 2
profile['title'] = 'Newsweek'
profile['url'] = 'file:'+ ('' if iswindows else '//') +index
def finalize(profile):
global _tdir
shutil.rmtree(_tdir)
if os.path.isdir(profile['temp dir']):
shutil.rmtree(profile['temp dir'])
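As a small, hedged illustration of the platform-dependent 'file:' URL that initialize builds above (the index path is hypothetical):

index = '/tmp/libprs500_x/index.html' # hypothetical path returned by build_index
url = 'file:' + ('' if iswindows else '//') + index
# On Unix this yields 'file:///tmp/libprs500_x/index.html'; on Windows, where iswindows is True
# and index would be a native path such as r'C:\temp\index.html', it yields 'file:C:\temp\index.html'.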

View File

@@ -32,7 +32,7 @@ profiles = {
'max_recursions' : 1, # Number of levels of links to follow
'max_files' : 1000, # Maximum number of files to download
'delay' : 0, # Delay between consecutive downloads
'timeout' : 10, # Timeout for fetching files from server
'timeout' : 10, # Timeout for fetching files from server in seconds
'timefmt' : ' [%a %d %b %Y]',
'no_stylesheets' : False, # Download stylesheets
'match_regexps' : [], # List of regular expressions that determine which links to follow
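A hedged sketch of how a profile entry can override these defaults; the 'example_site' entry and its hooks are hypothetical, not part of this commit:

'example_site' : {
    'initialize'     : example_initialize, # hypothetical hook functions
    'finalize'       : example_finalize,
    'max_recursions' : 2,    # follow links two levels deep
    'delay'          : 1,    # wait one second between downloads
    'timeout'        : 20,   # per-fetch timeout in seconds
    'no_stylesheets' : True, # do not download stylesheets
    'match_regexps'  : [r'http://example\.com/story/.+'], # only follow story links
},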
@@ -82,10 +82,7 @@ profiles = {
'newsweek' : {
'initialize' : newsweek_initialize,
'finalize' : newsweek_finalize,
'title' : 'Newsweek',
'timefmt' : ' [%d %b %Y]',
'no_stylesheets' : True,
'max_recursions' : 2,
'preprocess_regexps' :
[ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
[