Kovid Goyal 2007-10-07 20:12:25 +00:00
parent 4477a78a5b
commit b1dea424fe
4 changed files with 45 additions and 112 deletions

View File

@@ -13,12 +13,32 @@
 ## with this program; if not, write to the Free Software Foundation, Inc.,
 ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

-import os, time, calendar, operator
+import os, time, calendar, operator, re
 from libprs500 import iswindows
 from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup
+from htmlentitydefs import name2codepoint

-def parse_feeds(feeds, browser, print_version, max_articles_per_feed=10):
+def process_html_description(tag):
+    src = '\n'.join(tag.contents)
+    replaced_entities = [ 'amp', 'lt', 'gt', 'ldquo', 'rdquo', 'lsquo', 'rsquo' ]
+    for e in replaced_entities:
+        ent = '&'+e+';'
+        src = src.replace(ent, unichr(name2codepoint[e]))
+    return re.compile(r'<a.*?</a>', re.IGNORECASE|re.DOTALL).sub('', src)
+
+def parse_feeds(feeds, browser, print_version,
+                max_articles_per_feed=10,
+                html_description=False,
+                oldest_article=7):
+    '''
+    @param print_version: Callable that takes a url string and returns the url to
+                          the printable version of the article pointed to by the original url.
+    @param max_articles_per_feed: Maximum number of articles to download from each feed
+    @param html_description: If True, article descriptions are processed as HTML
+    @param oldest_article: A number in days. No articles older than now - oldest_article
+                           will be downloaded.
+    '''
     articles = {}
     for title, url in feeds:
         src = browser.open(url).read()
@@ -37,10 +57,14 @@ def parse_feeds(feeds, browser, print_version, max_articles_per_feed=10):
                                                  '%a, %d %b %Y %H:%M:%S %Z')),
                     'date'      : pubdate
                     }
+                delta = time.time() - d['timestamp']
+                if delta > oldest_article*3600*24:
+                    continue
             except:
                 continue
             try:
-                d['description'] = item.find('description').string
+                desc = item.find('description')
+                d['description'] = process_html_description(desc) if html_description else desc.string
             except:
                 d['description'] = ''
             articles[title].append(d)
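
As a quick illustration of the new helper (not part of the commit), the Python 2 sketch below mirrors process_html_description to show what it does to a feed item's description: the listed named entities are expanded to characters and any embedded <a>...</a> markup is dropped. The FakeTag class and the sample strings are invented stand-ins for the BeautifulStoneSoup tag the real code passes in.

# Illustration only (Python 2, matching the codebase); FakeTag stands in
# for the BeautifulStoneSoup <description> tag the real helper receives.
import re
from htmlentitydefs import name2codepoint

class FakeTag(object):
    def __init__(self, contents):
        self.contents = contents

def process_html_description(tag):
    # Same logic as the helper added above: expand a few named entities,
    # then strip embedded links entirely.
    src = '\n'.join(tag.contents)
    for e in ['amp', 'lt', 'gt', 'ldquo', 'rdquo', 'lsquo', 'rsquo']:
        src = src.replace('&'+e+';', unichr(name2codepoint[e]))
    return re.compile(r'<a.*?</a>', re.IGNORECASE|re.DOTALL).sub('', src)

tag = FakeTag([u'Breaking &ldquo;news&rdquo; &amp; analysis',
               u'<a href="http://example.com/x">Read more</a>'])
print process_html_description(tag).encode('utf-8')
# prints: Breaking "news" & analysis   (curly quotes; the link is gone)

The age filter added in the second hunk works in seconds: time.time() - d['timestamp'] is compared against oldest_article*3600*24, so the default oldest_article=7 corresponds to a cutoff of 604800 seconds.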

View File

@ -76,7 +76,6 @@ def initialize(profile):
profile.pop('browser') # Needed as for some reason using the same browser instance causes timeouts profile.pop('browser') # Needed as for some reason using the same browser instance causes timeouts
def finalize(profile): def finalize(profile):
return
if os.path.isdir(profile['temp dir']): if os.path.isdir(profile['temp dir']):
shutil.rmtree(profile['temp dir']) shutil.rmtree(profile['temp dir'])
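
The deleted return statement had made the two lines after it unreachable, so the scratch directory referenced here was never cleaned up. A minimal sketch (not part of the commit) of the lifecycle these hooks assume, using only the 'temp dir' key seen above; the prefix string is illustrative:

import os, shutil, tempfile

def initialize(profile):
    # Each run works in its own scratch directory.
    profile['temp dir'] = tempfile.mkdtemp(prefix='libprs500_')

def finalize(profile):
    # With the stray return gone, cleanup actually executes.
    if os.path.isdir(profile['temp dir']):
        shutil.rmtree(profile['temp dir'])

profile = {}
initialize(profile)
finalize(profile)   # the scratch directory is removed again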

View File

@@ -16,9 +16,8 @@
 import sys, urllib2, time, re, tempfile, os, shutil
-from libprs500 import __appname__, iswindows
-from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup
-from htmlentitydefs import name2codepoint
+from libprs500.ebooks.lrf.web import build_index, parse_feeds
+from libprs500 import __appname__, iswindows, browser

 RSS_FEEDS = [
     ('Cover Story', 'http://feeds.newsweek.com/CoverStory'),
@@ -34,110 +33,24 @@ RSS_FEEDS = [
     ('Tip Sheet', 'http://feeds.newsweek.com/newsweek/TipSheet/Highlights'),
     ]
-
-BASE_TEMPLATE=\
-u'''
-<html>
-<body>
-<h1>Newsweek</h1>
-<b align="right">%(date)s</b>
-<p></p>
-<h2>Table of Contents</h2>
-<ul>
-%(toc)s
-</ul>
-<br />
-<hr />
-</body>
-</html>
-'''
-
-SECTION_TEMPLATE=\
-u'''
-<html>
-<body>
-<h2>%(title)s</h2>
-<p></p>
-<h2>Table of Contents</h2>
-<ul>
-%(toc)s
-</ul>
-<br />
-<hr />
-</body>
-</html>
-'''
-
-_tdir = None
-def create_aggregator(sections):
-    '''Return aggregator HTML encoded in utf8'''
-    toc, sec = u'', 0
-    global _tdir
-    _tdir = tempfile.mkdtemp(prefix=__appname__)
-    for section in sections:
-        sec += 1
-        secfile = os.path.join(_tdir, 'sec%d.html'%(sec,))
-        title, contents = section
-        fix = 'file:' if iswindows else ''
-        toc += '<li><a href="%s">%s</a></li>\n'%(fix+secfile, title,)
-        stoc = u''
-        for item in contents:
-            desc = item['description'].strip()
-            stoc += '<li><a href="%(link)s">%(title)s</a><br />'%dict(link=item['link'], title=item['title'])
-            if desc:
-                stoc += '<div style="font-size:small; font-family:sans">%s</div>\n'%(desc,)
-            stoc += '</li>\n'
-        section = SECTION_TEMPLATE%dict(title=title, toc=stoc)
-        open(secfile, 'w').write(section.encode('utf8'))
-    index = os.path.join(_tdir, 'index.html')
-    src = BASE_TEMPLATE % dict(toc=toc, date=time.strftime('%d %B, %Y', time.localtime()))
-    open(index, 'w').write(src.encode('utf8'))
-    return index
-
-def get_contents():
-    ''' Parse Newsweek RSS feeds to get links to all articles'''
-
-    def nstounicode(ns):
-        return unicode(str(ns), 'utf8')
-
-    def fix_link(link):
-        if '?' in link:
-            link = link[:link.index('?')]
-        return link + 'print/1/displaymode/1098/'
-
-    def process_description(tag):
-        src = '\n'.join(tag.contents)
-        replaced_entities = [ 'amp', 'lt', 'gt', 'ldquo', 'rdquo', 'lsquo', 'rsquo' ]
-        for e in replaced_entities:
-            ent = '&'+e+';'
-            src = src.replace(ent, unichr(name2codepoint[e]))
-        return re.compile(r'<a.*?</a>', re.IGNORECASE|re.DOTALL).sub('', src)
-
-    pages = []
-    for title, url in RSS_FEEDS:
-        soup = BeautifulStoneSoup(urllib2.urlopen(url))
-        contents = []
-        for item in soup.findAll('item'):
-            d = {
-                'title'      : nstounicode(item.title.contents[0]),
-                'description': process_description(item.description),
-                'link'       : fix_link(nstounicode(item.guid.contents[0]))
-                }
-            if '&lt;' in d['description']:
-                d['description'] = d['description'][:d['description'].index('&lt;')]
-            contents.append(d)
-        pages.append((title, contents))
-    return pages
-
+def print_version(url):
+    if '?' in url:
+        url = url[:url.index('?')]
+    return url + 'print/1/displaymode/1098/'
+
 def initialize(profile):
-    print 'Fetching feeds...',
-    sys.stdout.flush()
-    contents = get_contents()
-    print 'done'
-    index = create_aggregator(contents)
+    profile['temp dir'] = tempfile.mkdtemp(prefix=__appname__+'_')
+    profile['browser'] = browser()
+    articles = parse_feeds(RSS_FEEDS, profile['browser'], print_version,
+                           max_articles_per_feed=20, html_description=True)
+    index = build_index('Newsweek', articles, profile['temp dir'])
+    profile['url'] = 'file:'+ ('' if iswindows else '//') + index
+    profile['timefmt'] = ' [%d %b %Y]'
+    profile['max_recursions'] = 2
+    profile['title'] = 'Newsweek'
     profile['url'] = 'file:'+ ('' if iswindows else '//') +index

 def finalize(profile):
-    global _tdir
-    shutil.rmtree(_tdir)
+    if os.path.isdir(profile['temp dir']):
+        shutil.rmtree(profile['temp dir'])
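
With this change the Newsweek profile delegates feed parsing and index building to the shared web module instead of carrying its own HTML templates and aggregator. The only site-specific logic left is print_version, reproduced below in a small standalone sketch to show how it rewrites an article link into Newsweek's printer-friendly form; the example URL is made up for illustration.

def print_version(url):
    # Same transformation the old fix_link helper performed: drop any
    # query string, then append the printer-friendly path segment.
    if '?' in url:
        url = url[:url.index('?')]
    return url + 'print/1/displaymode/1098/'

# Hypothetical article link, for illustration only:
print print_version('http://www.newsweek.com/id/42463/?from=rss')
# prints: http://www.newsweek.com/id/42463/print/1/displaymode/1098/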

View File

@ -32,7 +32,7 @@ profiles = {
'max_recursions' : 1, # Number of levels of links to follow 'max_recursions' : 1, # Number of levels of links to follow
'max_files' : 1000, # Maximum number of files to download 'max_files' : 1000, # Maximum number of files to download
'delay' : 0, # Delay between consecutive downloads 'delay' : 0, # Delay between consecutive downloads
'timeout' : 10, # Timeout for fetching files from server 'timeout' : 10, # Timeout for fetching files from server in seconds
'timefmt' : ' [%a %d %b %Y]', 'timefmt' : ' [%a %d %b %Y]',
'no_stylesheets' : False, # Download stylesheets 'no_stylesheets' : False, # Download stylesheets
'match_regexps' : [], # List of regular expressions that determines which links to follow 'match_regexps' : [], # List of regular expressions that determines which links to follow
@@ -82,10 +82,7 @@ profiles = {
    'newsweek' : {
        'initialize' : newsweek_initialize,
        'finalize' : newsweek_finalize,
-       'title' : 'Newsweek',
-       'timefmt' : ' [%d %b %Y]',
        'no_stylesheets' : True,
-       'max_recursions' : 2,
        'preprocess_regexps' :
            [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
                [