Improved NY Times profile that corresponds to the daily paper

Kovid Goyal 2008-02-28 04:21:20 +00:00
parent 1ddcfb5844
commit ec79457d45


@@ -15,7 +15,7 @@
 '''
 Profile to download the New York Times
 '''
-import re
+import re, time
 
 from libprs500.ebooks.lrf.web.profiles import DefaultProfile
 from libprs500.ebooks.BeautifulSoup import BeautifulSoup
@@ -26,6 +26,10 @@ class NYTimes(DefaultProfile):
     timefmt = ' [%a, %d %b, %Y]'
     needs_subscription = True
     max_recursions = 2
+    recommended_frequency = 1
+    encoding = 'cp1252'
+    
+    html2lrf_options = ['--base-font-size=0']
     
     preprocess_regexps = \
             [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
@@ -49,19 +53,43 @@ class NYTimes(DefaultProfile):
             br.submit()
         return br
     
-    def get_feeds(self):
-        src = self.browser.open('http://www.nytimes.com/services/xml/rss/index.html').read()
-        soup = BeautifulSoup(src[src.index('<html'):])
-        
-        feeds = []
-        for link in soup.findAll('link', attrs={'type':'application/rss+xml'}):
-            if link['title'] not in ['NYTimes.com Homepage', 'Obituaries', 'Pogue\'s Posts',
-                                     'Dining & Wine', 'Home & Garden', 'Multimedia',
-                                     'Most E-mailed Articles',
-                                     'Automobiles', 'Fashion & Style', 'Television News',
-                                     'Education']:
-                feeds.append((link['title'], link['href'].replace('graphics8', 'www')))
-        
-        return feeds
+    def parse_feeds(self):
+        src = self.browser.open('http://www.nytimes.com/pages/todayspaper/index.html').read().decode('cp1252')
+        soup = BeautifulSoup(src)
+        
+        def feed_title(div):
+            return ''.join(div.findAll(text=True, recursive=False)).strip()
+        
+        articles = {}
+        key = None
+        for div in soup.findAll(True,
+                attrs={'class':['section-headline', 'story', 'story headline']}):
+            
+            if div['class'] == 'section-headline':
+                key = feed_title(div)
+                articles[key] = []
+            
+            elif div['class'] in ['story', 'story headline']:
+                a = div.find('a', href=True)
+                if not a:
+                    continue
+                url = self.print_version(a['href'])
+                title = self.tag_to_string(a, use_alt=True).strip()
+                description = ''
+                pubdate = time.strftime('%a, %d %b', time.localtime())
+                summary = div.find(True, attrs={'class':'summary'})
+                if summary:
+                    description = self.tag_to_string(summary, use_alt=False)
+                
+                feed = key if key is not None else 'Uncategorized'
+                if not articles.has_key(feed):
+                    articles[feed] = []
+                articles[feed].append(
+                    dict(title=title, url=url, date=pubdate, description=description,
+                         content=''))
+        
+        return articles
     
     def print_version(self, url):
         return url + '?&pagewanted=print'
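
For context, the sketch below (not part of the commit) illustrates the class-matching pattern the new parse_feeds() relies on: 'section-headline' divs become section keys and 'story'/'story headline' divs become articles under the current key, with the print-view query string appended as in print_version(). It assumes the BeautifulSoup 3 API the profile already uses; the standalone import, the inline HTML snippet and the variable names are illustrative assumptions, not code from the repository.

# Minimal sketch (Python 2, BeautifulSoup 3) of the scraping pattern in parse_feeds().
# The HTML is a made-up stand-in for the "Today's Paper" markup.
from BeautifulSoup import BeautifulSoup

html = '''<div class="section-headline">World</div>
<div class="story"><a href="/2008/02/28/world/example.html">Example headline</a>
<div class="summary">One-line summary of the story.</div></div>'''

soup = BeautifulSoup(html)
sections = {}
current = None
for div in soup.findAll(True, attrs={'class': ['section-headline', 'story', 'story headline']}):
    if div['class'] == 'section-headline':
        # Section headers carry their title as direct text nodes
        current = ''.join(div.findAll(text=True, recursive=False)).strip()
        sections[current] = []
    else:
        a = div.find('a', href=True)
        if a is not None:
            # Mirror print_version(): append the print-view query string
            sections[current].append(a['href'] + '?&pagewanted=print')

print sections   # prints the mapping {section title: [print-version URLs]}

On the real page a story div could in principle appear before any section headline, which is why the committed code falls back to an 'Uncategorized' bucket; the sketch omits that guard for brevity.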