From 65b565cb59469d7f5fa2f483f8795682f28f0f8e Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Wed, 24 Oct 2007 20:51:27 +0000
Subject: [PATCH] Fix Newsweek profile

---
 src/libprs500/ebooks/lrf/web/__init__.py | 32 ++++++++++++++++++++----
 src/libprs500/ebooks/lrf/web/newsweek.py | 17 +++++++------
 src/libprs500/ebooks/lrf/web/profiles.py |  1 -
 3 files changed, 37 insertions(+), 13 deletions(-)

diff --git a/src/libprs500/ebooks/lrf/web/__init__.py b/src/libprs500/ebooks/lrf/web/__init__.py
index aa1fa2d0a5..1f3ab776df 100644
--- a/src/libprs500/ebooks/lrf/web/__init__.py
+++ b/src/libprs500/ebooks/lrf/web/__init__.py
@@ -19,6 +19,21 @@
 from libprs500 import iswindows
 from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup
 from htmlentitydefs import name2codepoint
+DAY_MAP = dict(Sun=0, Mon=1, Tue=2, Wed=3, Thu=4, Fri=5, Sat=6)
+MONTH_MAP = dict(Jan=1, Feb=2, Mar=3, Apr=4, May=5, Jun=6, Jul=7, Aug=8, Sep=9, Oct=10, Nov=11, Dec=12)
+FULL_MONTH_MAP = dict(January=1, February=2, March=3, April=4, May=5, June=6,
+                      July=7, August=8, September=9, October=10,
+                      November=11, December=12)
+
+def strptime(src):
+    src = src.strip().split()
+    src[0] = str(DAY_MAP[src[0][:-1]])+','
+    try:
+        src[2] = str(MONTH_MAP[src[2]])
+    except KeyError:
+        src[2] = str(FULL_MONTH_MAP[src[2]])
+    return time.strptime(' '.join(src), '%w, %d %m %Y %H:%M:%S %Z')
+
 def process_html_description(tag):
     src = '\n'.join(tag.contents)
     replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo' ]
@@ -41,7 +56,12 @@
     '''
     articles = {}
     for title, url in feeds:
-        src = browser.open(url).read()
+        try:
+            src = browser.open(url).read()
+        except Exception, err:
+            print 'Could not fetch feed: %s\nError: %s'%(url, err)
+            continue
+
         articles[title] = []
         soup = BeautifulStoneSoup(src)
         for item in soup.findAll('item'):
@@ -53,14 +73,14 @@
                 d = {
                     'title'    : item.find('title').string,
                     'url'      : print_version(item.find('guid').string),
-                    'timestamp': calendar.timegm(time.strptime(pubdate,
-                                 '%a, %d %b %Y %H:%M:%S %Z')),
+                    'timestamp': calendar.timegm(strptime(pubdate)),
                     'date'     : pubdate
                     }
                 delta = time.time() - d['timestamp']
                 if delta > oldest_article*3600*24:
-                    continue
-            except:
+                    continue
+
+            except Exception, err:
                 continue
             try:
                 desc = item.find('description')
@@ -72,6 +92,8 @@
         articles[title][max_articles_per_feed:] = []
         for item in articles[title]:
             item.pop('timestamp')
+        if not articles[title]:
+            articles.pop(title)
     return articles
 
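The hand-rolled strptime above replaces the stock time.strptime call because the %a/%b directives are locale-dependent, and FULL_MONTH_MAP additionally covers feeds that spell the month out in full, which %b cannot parse. Below is a minimal, self-contained sketch of the same technique in Python 3 syntax (the patch itself is Python 2); the parse_pubdate name and the sample date string are illustrative, not taken from the patch. The standard library's email.utils.parsedate handles the same RFC 822 format and is shown for comparison.

```python
import calendar
import time
from email.utils import parsedate

DAY_MAP   = dict(Sun=0, Mon=1, Tue=2, Wed=3, Thu=4, Fri=5, Sat=6)
MONTH_MAP = dict(Jan=1, Feb=2, Mar=3, Apr=4, May=5, Jun=6,
                 Jul=7, Aug=8, Sep=9, Oct=10, Nov=11, Dec=12)

def parse_pubdate(src):
    """Parse an RSS pubDate like 'Wed, 24 Oct 2007 20:51:27 GMT'
    without relying on the locale-dependent %a/%b directives."""
    parts = src.strip().split()
    parts[0] = str(DAY_MAP[parts[0][:-1]]) + ','   # 'Wed,' -> '3,'
    parts[2] = str(MONTH_MAP[parts[2]])            # 'Oct'  -> '10'
    # %w/%d/%m are pure-numeric directives, so no locale lookup happens
    return time.strptime(' '.join(parts), '%w, %d %m %Y %H:%M:%S %Z')

# email.utils.parsedate is equally locale-independent; both routes
# yield the same struct_time and hence the same UTC timestamp here.
sample = 'Wed, 24 Oct 2007 20:51:27 GMT'
assert calendar.timegm(parse_pubdate(sample)) == calendar.timegm(parsedate(sample))
```

The UTC timestamp produced this way feeds the delta check in parse_feeds, which is what makes the new oldest_article cutoff work.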
diff --git a/src/libprs500/ebooks/lrf/web/newsweek.py b/src/libprs500/ebooks/lrf/web/newsweek.py
index 572ed24a15..3638193c41 100644
--- a/src/libprs500/ebooks/lrf/web/newsweek.py
+++ b/src/libprs500/ebooks/lrf/web/newsweek.py
@@ -20,30 +20,33 @@
 from libprs500.ebooks.lrf.web import build_index, parse_feeds
 from libprs500 import __appname__, iswindows, browser
 
 RSS_FEEDS = [
-             ('Cover Story', 'http://feeds.newsweek.com/CoverStory'),
+             ('Top News', 'http://feeds.newsweek.com/newsweek/TopNews',),
              ('Periscope', 'http://feeds.newsweek.com/newsweek/periscope'),
+             ('Politics', 'http://feeds.newsweek.com/headlines/politics'),
+             ('Health', 'http://feeds.newsweek.com/headlines/health'),
+             ('Business', 'http://feeds.newsweek.com/headlines/business'),
+             ('Science and Technology', 'http://feeds.newsweek.com/headlines/technology/science'),
              ('National News', 'http://feeds.newsweek.com/newsweek/NationalNews'),
              ('World News', 'http://feeds.newsweek.com/newsweek/WorldNews'),
              ('Iraq', 'http://feeds.newsweek.com/newsweek/iraq'),
-             ('Health', 'http://feeds.newsweek.com/sections/health'),
              ('Society', 'http://feeds.newsweek.com/newsweek/society'),
-             ('Business', 'http://feeds.newsweek.com/newsweek/business'),
-             ('Science and Technology', 'http://feeds.newsweek.com/newsweek/TechnologyScience'),
              ('Entertainment', 'http://feeds.newsweek.com/newsweek/entertainment'),
-             ('Tip Sheet', 'http://feeds.newsweek.com/newsweek/TipSheet/Highlights'),
             ]
 
 def print_version(url):
     if '?' in url:
         url = url[:url.index('?')]
-    return url + 'print/1/displaymode/1098/'
+    if not url.endswith('/'):
+        url += '/'
+    return url + 'output/print'
 
 def initialize(profile):
     profile['temp dir'] = tempfile.mkdtemp(prefix=__appname__+'_')
     profile['browser'] = browser()
     articles = parse_feeds(RSS_FEEDS, profile['browser'], print_version,
-                           max_articles_per_feed=20, html_description=True)
+                           max_articles_per_feed=20, oldest_article=15,
+                           html_description=True)
     index = build_index('Newsweek', articles, profile['temp dir'])
     profile['url'] = 'file:'+ ('' if iswindows else '//') + index
     profile['timefmt'] = ' [%d %b %Y]'

diff --git a/src/libprs500/ebooks/lrf/web/profiles.py b/src/libprs500/ebooks/lrf/web/profiles.py
index d337de686f..26a2daf373 100644
--- a/src/libprs500/ebooks/lrf/web/profiles.py
+++ b/src/libprs500/ebooks/lrf/web/profiles.py
@@ -82,7 +82,6 @@
 profiles = {
     'newsweek' : {
                   'initialize'         : newsweek_initialize,
                   'finalize'           : newsweek_finalize,
-                  'no_stylesheets'     : True,
                   'preprocess_regexps' :
                      [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in [
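For reference, print_version is the hook that maps an article URL from the feed to its printer-friendly page; the patch swaps Newsweek's old print/1/displaymode/1098/ suffix for output/print. A quick sketch of the rewritten transformation, copied from the patch; the input URL below is a hypothetical example, not taken from a real feed:

```python
def print_version(url):
    # Drop any query string (e.g. tracking parameters added by the feed).
    if '?' in url:
        url = url[:url.index('?')]
    # Normalise to a trailing slash before appending the print suffix.
    if not url.endswith('/'):
        url += '/'
    return url + 'output/print'

# Hypothetical example URL, for illustration only:
print(print_version('http://www.newsweek.com/id/12345?from=rss'))
# -> http://www.newsweek.com/id/12345/output/print
```

Together with the new oldest_article=15 argument, the profile now skips items older than 15 days (the delta > oldest_article*3600*24 check in parse_feeds) and drops any feed that ends up with no articles at all.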