Improved the NY Times profile so that its contents correspond to the daily paper

This commit is contained in:
Kovid Goyal 2008-02-28 04:21:20 +00:00
parent 1ddcfb5844
commit ec79457d45

View File

@ -15,7 +15,7 @@
''' '''
Profile to download the New York Times Profile to download the New York Times
''' '''
import re import re, time
from libprs500.ebooks.lrf.web.profiles import DefaultProfile from libprs500.ebooks.lrf.web.profiles import DefaultProfile
from libprs500.ebooks.BeautifulSoup import BeautifulSoup from libprs500.ebooks.BeautifulSoup import BeautifulSoup
@ -26,6 +26,10 @@ class NYTimes(DefaultProfile):
timefmt = ' [%a, %d %b, %Y]' timefmt = ' [%a, %d %b, %Y]'
needs_subscription = True needs_subscription = True
max_recursions = 2 max_recursions = 2
recommended_frequency = 1
encoding = 'cp1252'
html2lrf_options = ['--base-font-size=0']
preprocess_regexps = \ preprocess_regexps = \
[ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
@ -49,19 +53,43 @@ class NYTimes(DefaultProfile):
br.submit() br.submit()
return br return br
def parse_feeds(self):
    """
    Build the article index from the NY Times "Today's Paper" page.

    The page is fetched with ``self.browser``, decoded as cp1252 and parsed
    with BeautifulSoup.  Elements whose class is ``section-headline`` start
    a new section; elements whose class is ``story`` or ``story headline``
    are recorded as articles of the most recently seen section.

    Returns a dict mapping section title -> list of article dicts, each
    with the keys ``title``, ``url``, ``date``, ``description`` and
    ``content`` (always the empty string here).  Stories that appear
    before any section headline are filed under ``'Uncategorized'``.
    """
    raw = self.browser.open(
        'http://www.nytimes.com/pages/todayspaper/index.html').read()
    # 'replace' keeps the download going when the page contains one of the
    # few byte values that cp1252 leaves undefined.
    soup = BeautifulSoup(raw.decode('cp1252', 'replace'))

    def feed_title(div):
        # Join only the element's own text nodes (recursive=False), so text
        # inside nested tags does not end up in the section title.
        return ''.join(div.findAll(text=True, recursive=False)).strip()

    articles = {}
    key = None
    # The date stamp is the same for every article, so compute it once.
    pubdate = time.strftime('%a, %d %b', time.localtime())

    for div in soup.findAll(True,
            attrs={'class': ['section-headline', 'story', 'story headline']}):
        css_class = div['class']

        if css_class == 'section-headline':
            key = feed_title(div)
            # setdefault instead of assignment: a headline whose text
            # repeats must not throw away articles already collected.
            articles.setdefault(key, [])
            continue

        if css_class not in ('story', 'story headline'):
            continue

        a = div.find('a', href=True)
        if not a:
            # A story block without a link cannot be downloaded.
            continue

        url = self.print_version(a['href'])
        title = self.tag_to_string(a, use_alt=True).strip()

        description = ''
        summary = div.find(True, attrs={'class': 'summary'})
        if summary:
            description = self.tag_to_string(summary, use_alt=False)

        feed = key if key is not None else 'Uncategorized'
        articles.setdefault(feed, []).append(
            dict(title=title, url=url, date=pubdate,
                 description=description, content=''))

    return articles
def print_version(self, url):
    """
    Return the URL of the printer-friendly version of the article at `url`.

    For a URL without a query string the result is `url` followed by
    '?&pagewanted=print'.  When `url` already contains a query string the
    parameter is appended with '&' instead, because adding a second '?'
    would corrupt the last existing parameter.
    """
    if '?' in url:
        return url + '&pagewanted=print'
    return url + '?&pagewanted=print'