mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
Improved NY Times profile that corresponds to the daily paper
This commit is contained in:
parent
1ddcfb5844
commit
ec79457d45
@ -15,7 +15,7 @@
|
||||
'''
|
||||
Profile to download the New York Times
|
||||
'''
|
||||
import re
|
||||
import re, time
|
||||
|
||||
from libprs500.ebooks.lrf.web.profiles import DefaultProfile
|
||||
from libprs500.ebooks.BeautifulSoup import BeautifulSoup
|
||||
@ -26,6 +26,10 @@ class NYTimes(DefaultProfile):
|
||||
timefmt = ' [%a, %d %b, %Y]'
|
||||
needs_subscription = True
|
||||
max_recursions = 2
|
||||
recommended_frequency = 1
|
||||
encoding = 'cp1252'
|
||||
html2lrf_options = ['--base-font-size=0']
|
||||
|
||||
|
||||
preprocess_regexps = \
|
||||
[ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
|
||||
@ -49,19 +53,43 @@ class NYTimes(DefaultProfile):
|
||||
br.submit()
|
||||
return br
|
||||
|
||||
def get_feeds(self):
|
||||
src = self.browser.open('http://www.nytimes.com/services/xml/rss/index.html').read()
|
||||
soup = BeautifulSoup(src[src.index('<html'):])
|
||||
feeds = []
|
||||
for link in soup.findAll('link', attrs={'type':'application/rss+xml'}):
|
||||
if link['title'] not in ['NYTimes.com Homepage', 'Obituaries', 'Pogue\'s Posts',
|
||||
'Dining & Wine', 'Home & Garden', 'Multimedia',
|
||||
'Most E-mailed Articles',
|
||||
'Automobiles', 'Fashion & Style', 'Television News',
|
||||
'Education']:
|
||||
feeds.append((link['title'], link['href'].replace('graphics8', 'www')))
|
||||
def parse_feeds(self):
|
||||
src = self.browser.open('http://www.nytimes.com/pages/todayspaper/index.html').read().decode('cp1252')
|
||||
soup = BeautifulSoup(src)
|
||||
|
||||
return feeds
|
||||
def feed_title(div):
|
||||
return ''.join(div.findAll(text=True, recursive=False)).strip()
|
||||
|
||||
articles = {}
|
||||
key = None
|
||||
for div in soup.findAll(True,
|
||||
attrs={'class':['section-headline', 'story', 'story headline']}):
|
||||
|
||||
if div['class'] == 'section-headline':
|
||||
key = feed_title(div)
|
||||
articles[key] = []
|
||||
|
||||
elif div['class'] in ['story', 'story headline']:
|
||||
a = div.find('a', href=True)
|
||||
if not a:
|
||||
continue
|
||||
url = self.print_version(a['href'])
|
||||
title = self.tag_to_string(a, use_alt=True).strip()
|
||||
description = ''
|
||||
pubdate = time.strftime('%a, %d %b', time.localtime())
|
||||
summary = div.find(True, attrs={'class':'summary'})
|
||||
if summary:
|
||||
description = self.tag_to_string(summary, use_alt=False)
|
||||
|
||||
feed = key if key is not None else 'Uncategorized'
|
||||
if not articles.has_key(feed):
|
||||
articles[feed] = []
|
||||
articles[feed].append(
|
||||
dict(title=title, url=url, date=pubdate, description=description,
|
||||
content=''))
|
||||
|
||||
|
||||
return articles
|
||||
|
||||
def print_version(self, url):
|
||||
return url + '?&pagewanted=print'
|
||||
|
Loading…
x
Reference in New Issue
Block a user