diff --git a/src/libprs500/ebooks/lrf/web/__init__.py b/src/libprs500/ebooks/lrf/web/__init__.py
index 5523ac8b1a..aa1fa2d0a5 100644
--- a/src/libprs500/ebooks/lrf/web/__init__.py
+++ b/src/libprs500/ebooks/lrf/web/__init__.py
@@ -13,12 +13,32 @@
 ## with this program; if not, write to the Free Software Foundation, Inc.,
 ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 
-import os, time, calendar, operator
+import os, time, calendar, operator, re
 
 from libprs500 import iswindows
 from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup
+from htmlentitydefs import name2codepoint
 
-def parse_feeds(feeds, browser, print_version, max_articles_per_feed=10):
+def process_html_description(tag):
+    src = '\n'.join(tag.contents)
+    replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo' ]
+    for e in replaced_entities:
+        ent = '&'+e+';'
+        src = src.replace(ent, unichr(name2codepoint[e]))
+    return re.compile(r'<a.*?</a>', re.IGNORECASE|re.DOTALL).sub('', src)
+
+def parse_feeds(feeds, browser, print_version,
+                max_articles_per_feed=10,
+                html_description=False,
+                oldest_article=7):
+    '''
+    @param print_version: Callable that takes a url string and returns the url to
+                          the printable version of the article pointed to by the original url.
+    @param max_articles_per_feed: Maximum number of articles to download from each feed
+    @param html_description: If True the articles' descriptions are processed as HTML
+    @param oldest_article: A number in days. No articles older than now - oldest_article
+                           will be downloaded.
+    '''
     articles = {}
     for title, url in feeds:
         src = browser.open(url).read()
@@ -37,10 +57,14 @@ def parse_feeds(feeds, browser, print_version, max_articles_per_feed=10):
                                                  '%a, %d %b %Y %H:%M:%S %Z')),
                     'date'        : pubdate
                     }
+                delta = time.time() - d['timestamp']
+                if delta > oldest_article*3600*24:
+                    continue
            except:
                continue
            try:
-                d['description'] = item.find('description').string
+                desc = item.find('description')
+                d['description'] = process_html_description(desc) if html_description else desc.string
            except:
                d['description'] = ''
            articles[title].append(d)
diff --git a/src/libprs500/ebooks/lrf/web/economist.py b/src/libprs500/ebooks/lrf/web/economist.py
index 05940397d1..dfb3f2fbfd 100644
--- a/src/libprs500/ebooks/lrf/web/economist.py
+++ b/src/libprs500/ebooks/lrf/web/economist.py
@@ -76,7 +76,6 @@ def initialize(profile):
 
     profile.pop('browser') # Needed as for some reason using the same browser instance causes timeouts
 
 def finalize(profile):
-    return
     if os.path.isdir(profile['temp dir']):
         shutil.rmtree(profile['temp dir'])
\ No newline at end of file
diff --git a/src/libprs500/ebooks/lrf/web/newsweek.py b/src/libprs500/ebooks/lrf/web/newsweek.py
index ab77c9be0a..615c9b7e4e 100644
--- a/src/libprs500/ebooks/lrf/web/newsweek.py
+++ b/src/libprs500/ebooks/lrf/web/newsweek.py
@@ -16,9 +16,8 @@
 
 import sys, urllib2, time, re, tempfile, os, shutil
 
-from libprs500 import __appname__, iswindows
-from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup
-from htmlentitydefs import name2codepoint
+from libprs500.ebooks.lrf.web import build_index, parse_feeds
+from libprs500 import __appname__, iswindows, browser
 
 RSS_FEEDS = [
     ('Cover Story', 'http://feeds.newsweek.com/CoverStory'),
@@ -34,110 +33,24 @@ RSS_FEEDS = [
     ('Tip Sheet', 'http://feeds.newsweek.com/newsweek/TipSheet/Highlights'),
 ]
 
-BASE_TEMPLATE=\
-u'''
-<html>
-<body>
-<h1>Newsweek</h1>
-<p style="text-align:right">%(date)s</p>
-<h2>Table of Contents</h2>
-<ul>
-%(toc)s
-</ul>
-</body>
-</html>
-'''
-
-SECTION_TEMPLATE=\
-u'''
-<html>
-<body>
-<h2>%(title)s</h2>
-<p><a href="index.html">Table of Contents</a></p>
-<ul>
-%(toc)s
-</ul>
-</body>
-</html>
-'''
-
-_tdir = None
-def create_aggregator(sections):
-    '''Return aggregator HTML encoded in utf8'''
-    toc, sec = u'', 0
-    global _tdir
-    _tdir = tempfile.mkdtemp(prefix=__appname__)
-    for section in sections:
-        sec += 1
-        secfile = os.path.join(_tdir, 'sec%d.html'%(sec,))
-        title, contents = section
-        fix = 'file:' if iswindows else ''
-        toc += '<li><a href="%s">%s</a></li>\n'%(fix+secfile, title,)
-        stoc = u''
-        for item in contents:
-            desc = item['description'].strip()
-            stoc += '<li><a href="%(link)s">%(title)s</a>'%dict(link=item['link'], title=item['title'])
-            if desc:
-                stoc += '<br/>%s\n'%(desc,)
-            stoc += '</li>\n'
-        section = SECTION_TEMPLATE%dict(title=title, toc=stoc)
-        open(secfile, 'w').write(section.encode('utf8'))
-    index = os.path.join(_tdir, 'index.html')
-    src = BASE_TEMPLATE % dict(toc=toc, date=time.strftime('%d %B, %Y', time.localtime()))
-    open(index, 'w').write(src.encode('utf8'))
-    return index
-
-def get_contents():
-    ''' Parse Newsweek RSS feeds to get links to all articles'''
-
-    def nstounicode(ns):
-        return unicode(str(ns), 'utf8')
-
-    def fix_link(link):
-        if '?' in link:
-            link = link[:link.index('?')]
-        return link + 'print/1/displaymode/1098/'
-
-    def process_description(tag):
-        src = '\n'.join(tag.contents)
-        replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo' ]
-        for e in replaced_entities:
-            ent = '&'+e+';'
-            src = src.replace(ent, unichr(name2codepoint[e]))
-        return re.compile(r'<a.*?</a>', re.IGNORECASE|re.DOTALL).sub('', src)
-
-    pages = []
-    for title, url in RSS_FEEDS:
-        soup = BeautifulStoneSoup(urllib2.urlopen(url))
-        contents = []
-        for item in soup.findAll('item'):
-            d = {
-                 'title' : nstounicode(item.title.contents[0]),
-                 'description': process_description(item.description),
-                 'link': fix_link(nstounicode(item.guid.contents[0]))
-                 }
-            if '<' in d['description']:
-                d['description'] = d['description'][:d['description'].index('<')]
-            contents.append(d)
-        pages.append((title, contents))
-    return pages
+def print_version(url):
+    if '?' in url:
+        url = url[:url.index('?')]
+    return url + 'print/1/displaymode/1098/'
 
 def initialize(profile):
-    print 'Fetching feeds...',
-    sys.stdout.flush()
-    contents = get_contents()
-    print 'done'
-    index = create_aggregator(contents)
-
+    profile['temp dir'] = tempfile.mkdtemp(prefix=__appname__+'_')
+    profile['browser'] = browser()
+    articles = parse_feeds(RSS_FEEDS, profile['browser'], print_version,
+                           max_articles_per_feed=20, html_description=True)
+    index = build_index('Newsweek', articles, profile['temp dir'])
+    profile['url'] = 'file:'+ ('' if iswindows else '//') + index
+    profile['timefmt'] = ' [%d %b %Y]'
+    profile['max_recursions'] = 2
+    profile['title'] = 'Newsweek'
     profile['url'] = 'file:'+ ('' if iswindows else '//') +index
 
 def finalize(profile):
-    global _tdir
-    shutil.rmtree(_tdir)
+    if os.path.isdir(profile['temp dir']):
+        shutil.rmtree(profile['temp dir'])
diff --git a/src/libprs500/ebooks/lrf/web/profiles.py b/src/libprs500/ebooks/lrf/web/profiles.py
index 760a957312..d337de686f 100644
--- a/src/libprs500/ebooks/lrf/web/profiles.py
+++ b/src/libprs500/ebooks/lrf/web/profiles.py
@@ -32,7 +32,7 @@ profiles = {
                   'max_recursions'   : 1,    # Number of levels of links to follow
                   'max_files'        : 1000, # Maximum number of files to download
                   'delay'            : 0,    # Delay between consecutive downloads
-                  'timeout'          : 10,   # Timeout for fetching files from server
+                  'timeout'          : 10,   # Timeout for fetching files from server in seconds
                   'timefmt'          : ' [%a %d %b %Y]',
                   'no_stylesheets'   : False, # Download stylesheets
                   'match_regexps'    : [],   # List of regular expressions that determines which links to follow
@@ -82,10 +82,7 @@ profiles = {
             'newsweek' : {
                   'initialize'         : newsweek_initialize,
                   'finalize'           : newsweek_finalize,
-                  'title'              : 'Newsweek',
-                  'timefmt'            : ' [%d %b %Y]',
                   'no_stylesheets'     : True,
-                  'max_recursions'     : 2,
                   'preprocess_regexps' :
                       [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
                       [
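For reference, the Newsweek profile above is the first consumer of the new parse_feeds() keyword arguments added in __init__.py. Below is a minimal sketch of how another profile module might use the same machinery. It is modelled directly on newsweek.py and assumes build_index() takes (title, articles, dir) exactly as it is called there; the feed list and the print_version() mapping are made-up placeholders, and the code is Python 2 like the rest of libprs500.

import tempfile, os, shutil

from libprs500 import __appname__, iswindows, browser
from libprs500.ebooks.lrf.web import build_index, parse_feeds

# Hypothetical feed list, for illustration only
FEEDS = [('Top Stories', 'http://example.com/rss.xml')]

def print_version(url):
    # Hypothetical mapping from an article URL to its printer-friendly page
    return url + '?printable=true'

def initialize(profile):
    profile['temp dir'] = tempfile.mkdtemp(prefix=__appname__+'_')
    profile['browser'] = browser()
    # html_description=True runs each <description> through process_html_description();
    # oldest_article=2 drops items more than two days old
    articles = parse_feeds(FEEDS, profile['browser'], print_version,
                           max_articles_per_feed=20,
                           html_description=True,
                           oldest_article=2)
    index = build_index('Example News', articles, profile['temp dir'])
    profile['url'] = 'file:' + ('' if iswindows else '//') + index

def finalize(profile):
    if os.path.isdir(profile['temp dir']):
        shutil.rmtree(profile['temp dir'])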
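And a small illustration of what process_html_description() does to a typical escaped <description> payload. The stand-in class below is hypothetical; it only needs a .contents list of strings, which is all the function reads from a BeautifulStoneSoup tag, and the expected output assumes the link-stripping pattern r'<a.*?</a>' as reconstructed above.

from libprs500.ebooks.lrf.web import process_html_description

class FakeDescription:
    # Hypothetical stand-in for item.find('description'); RSS feeds usually
    # ship embedded HTML with &lt; &gt; &amp; escaping, as here
    contents = [u'&ldquo;Markets&rdquo; rallied &amp; more: '
                u'&lt;a href="http://example.com/x"&gt;read on&lt;/a&gt;']

result = process_html_description(FakeDescription())
# Entities are decoded and the <a>...</a> span is stripped, leaving
# u'\u201cMarkets\u201d rallied & more: '
print result.encode('utf-8')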