Fix Newsweek profile

This commit is contained in:
Kovid Goyal 2007-10-24 20:51:27 +00:00
parent 2d2d59a1de
commit 65b565cb59
3 changed files with 37 additions and 13 deletions

View File

@ -19,6 +19,21 @@ from libprs500 import iswindows
from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup
from htmlentitydefs import name2codepoint from htmlentitydefs import name2codepoint
# Lookup tables translating the English day/month names found in RSS
# pubDate strings into the numeric forms that time.strptime can parse
# locale-independently (%w weekday number, %m month number).
_DAYS   = ('Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat')
_MONTHS = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
           'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')
_FULL_MONTHS = ('January', 'February', 'March', 'April', 'May', 'June',
                'July', 'August', 'September', 'October', 'November',
                'December')
DAY_MAP        = dict(zip(_DAYS, range(7)))
MONTH_MAP      = dict(zip(_MONTHS, range(1, 13)))
FULL_MONTH_MAP = dict(zip(_FULL_MONTHS, range(1, 13)))

def strptime(src):
    """
    Parse an RFC-822 style RSS date such as
    ``'Wed, 24 Oct 2007 20:51:27 GMT'`` (abbreviated or full month name)
    into a ``time.struct_time``, independent of the current locale.

    Raises KeyError if the day or month name is not recognised.
    """
    parts = src.strip().split()
    # 'Wed,' -> '3,' : weekday name becomes the number %w expects,
    # keeping the trailing comma the format string requires.
    parts[0] = str(DAY_MAP[parts[0][:-1]]) + ','
    month = parts[2]
    if month in MONTH_MAP:
        parts[2] = str(MONTH_MAP[month])
    else:
        # Some feeds spell the month out in full; fall back to that map
        # (and let its KeyError propagate for genuinely unknown names).
        parts[2] = str(FULL_MONTH_MAP[month])
    return time.strptime(' '.join(parts), '%w, %d %m %Y %H:%M:%S %Z')
def process_html_description(tag): def process_html_description(tag):
src = '\n'.join(tag.contents) src = '\n'.join(tag.contents)
replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo' ] replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo' ]
@ -41,7 +56,12 @@ def parse_feeds(feeds, browser, print_version,
''' '''
articles = {} articles = {}
for title, url in feeds: for title, url in feeds:
src = browser.open(url).read() try:
src = browser.open(url).read()
except Exception, err:
print 'Could not fetch feed: %s\nError: %s'%(url, err)
continue
articles[title] = [] articles[title] = []
soup = BeautifulStoneSoup(src) soup = BeautifulStoneSoup(src)
for item in soup.findAll('item'): for item in soup.findAll('item'):
@ -53,14 +73,14 @@ def parse_feeds(feeds, browser, print_version,
d = { d = {
'title' : item.find('title').string, 'title' : item.find('title').string,
'url' : print_version(item.find('guid').string), 'url' : print_version(item.find('guid').string),
'timestamp': calendar.timegm(time.strptime(pubdate, 'timestamp': calendar.timegm(strptime(pubdate)),
'%a, %d %b %Y %H:%M:%S %Z')),
'date' : pubdate 'date' : pubdate
} }
delta = time.time() - d['timestamp'] delta = time.time() - d['timestamp']
if delta > oldest_article*3600*24: if delta > oldest_article*3600*24:
continue continue
except:
except Exception, err:
continue continue
try: try:
desc = item.find('description') desc = item.find('description')
@ -72,6 +92,8 @@ def parse_feeds(feeds, browser, print_version,
articles[title][max_articles_per_feed:] = [] articles[title][max_articles_per_feed:] = []
for item in articles[title]: for item in articles[title]:
item.pop('timestamp') item.pop('timestamp')
if not articles[title]:
articles.pop(title)
return articles return articles

View File

@ -20,30 +20,33 @@ from libprs500.ebooks.lrf.web import build_index, parse_feeds
from libprs500 import __appname__, iswindows, browser from libprs500 import __appname__, iswindows, browser
RSS_FEEDS = [ RSS_FEEDS = [
('Cover Story', 'http://feeds.newsweek.com/CoverStory'), ('Top News', 'http://feeds.newsweek.com/newsweek/TopNews',),
('Periscope', 'http://feeds.newsweek.com/newsweek/periscope'), ('Periscope', 'http://feeds.newsweek.com/newsweek/periscope'),
('Politics', 'http://feeds.newsweek.com/headlines/politics'),
('Health', 'http://feeds.newsweek.com/headlines/health'),
('Business', 'http://feeds.newsweek.com/headlines/business'),
('Science and Technology', 'http://feeds.newsweek.com/headlines/technology/science'),
('National News', 'http://feeds.newsweek.com/newsweek/NationalNews'), ('National News', 'http://feeds.newsweek.com/newsweek/NationalNews'),
('World News', 'http://feeds.newsweek.com/newsweek/WorldNews'), ('World News', 'http://feeds.newsweek.com/newsweek/WorldNews'),
('Iraq', 'http://feeds.newsweek.com/newsweek/iraq'), ('Iraq', 'http://feeds.newsweek.com/newsweek/iraq'),
('Health', 'http://feeds.newsweek.com/sections/health'),
('Society', 'http://feeds.newsweek.com/newsweek/society'), ('Society', 'http://feeds.newsweek.com/newsweek/society'),
('Business', 'http://feeds.newsweek.com/newsweek/business'),
('Science and Technology', 'http://feeds.newsweek.com/newsweek/TechnologyScience'),
('Entertainment', 'http://feeds.newsweek.com/newsweek/entertainment'), ('Entertainment', 'http://feeds.newsweek.com/newsweek/entertainment'),
('Tip Sheet', 'http://feeds.newsweek.com/newsweek/TipSheet/Highlights'),
] ]
def print_version(url): def print_version(url):
if '?' in url: if '?' in url:
url = url[:url.index('?')] url = url[:url.index('?')]
return url + 'print/1/displaymode/1098/' if not url.endswith('/'):
url += '/'
return url + 'output/print'
def initialize(profile): def initialize(profile):
profile['temp dir'] = tempfile.mkdtemp(prefix=__appname__+'_') profile['temp dir'] = tempfile.mkdtemp(prefix=__appname__+'_')
profile['browser'] = browser() profile['browser'] = browser()
articles = parse_feeds(RSS_FEEDS, profile['browser'], print_version, articles = parse_feeds(RSS_FEEDS, profile['browser'], print_version,
max_articles_per_feed=20, html_description=True) max_articles_per_feed=20, oldest_article=15,
html_description=True)
index = build_index('Newsweek', articles, profile['temp dir']) index = build_index('Newsweek', articles, profile['temp dir'])
profile['url'] = 'file:'+ ('' if iswindows else '//') + index profile['url'] = 'file:'+ ('' if iswindows else '//') + index
profile['timefmt'] = ' [%d %b %Y]' profile['timefmt'] = ' [%d %b %Y]'

View File

@ -82,7 +82,6 @@ profiles = {
'newsweek' : { 'newsweek' : {
'initialize' : newsweek_initialize, 'initialize' : newsweek_initialize,
'finalize' : newsweek_finalize, 'finalize' : newsweek_finalize,
'no_stylesheets' : True,
'preprocess_regexps' : 'preprocess_regexps' :
[ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
[ [