mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-06-23 15:30:45 -04:00)
Improved NY Times profile so that it corresponds to the daily paper
commit ec79457d45
parent 1ddcfb5844
@@ -15,7 +15,7 @@
 '''
 Profile to download the New York Times
 '''
-import re
+import re, time
 
 from libprs500.ebooks.lrf.web.profiles import DefaultProfile
 from libprs500.ebooks.BeautifulSoup import BeautifulSoup
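Note: the added time import supports the pubdate timestamp generated in the new parse_feeds method further down in this diff.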
@@ -26,6 +26,10 @@ class NYTimes(DefaultProfile):
     timefmt = ' [%a, %d %b, %Y]'
     needs_subscription = True
     max_recursions = 2
+    recommended_frequency = 1
+    encoding = 'cp1252'
+    html2lrf_options = ['--base-font-size=0']
+
 
     preprocess_regexps = \
         [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
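Note on the new class attributes: recommended_frequency = 1 appears to mark this as a daily profile, encoding = 'cp1252' matches the explicit .decode('cp1252') applied to the fetched page in parse_feeds below, and html2lrf_options passes a base font size of 0 through to the LRF converter.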
@@ -49,19 +53,43 @@ class NYTimes(DefaultProfile):
         br.submit()
         return br
 
-    def get_feeds(self):
-        src = self.browser.open('http://www.nytimes.com/services/xml/rss/index.html').read()
-        soup = BeautifulSoup(src[src.index('<html'):])
-        feeds = []
-        for link in soup.findAll('link', attrs={'type':'application/rss+xml'}):
-            if link['title'] not in ['NYTimes.com Homepage', 'Obituaries', 'Pogue\'s Posts',
-                                     'Dining & Wine', 'Home & Garden', 'Multimedia',
-                                     'Most E-mailed Articles',
-                                     'Automobiles', 'Fashion & Style', 'Television News',
-                                     'Education']:
-                feeds.append((link['title'], link['href'].replace('graphics8', 'www')))
-
-        return feeds
+    def parse_feeds(self):
+        src = self.browser.open('http://www.nytimes.com/pages/todayspaper/index.html').read().decode('cp1252')
+        soup = BeautifulSoup(src)
+
+        def feed_title(div):
+            return ''.join(div.findAll(text=True, recursive=False)).strip()
+
+        articles = {}
+        key = None
+        for div in soup.findAll(True,
+                attrs={'class':['section-headline', 'story', 'story headline']}):
+
+            if div['class'] == 'section-headline':
+                key = feed_title(div)
+                articles[key] = []
+
+            elif div['class'] in ['story', 'story headline']:
+                a = div.find('a', href=True)
+                if not a:
+                    continue
+                url = self.print_version(a['href'])
+                title = self.tag_to_string(a, use_alt=True).strip()
+                description = ''
+                pubdate = time.strftime('%a, %d %b', time.localtime())
+                summary = div.find(True, attrs={'class':'summary'})
+                if summary:
+                    description = self.tag_to_string(summary, use_alt=False)
+
+                feed = key if key is not None else 'Uncategorized'
+                if not articles.has_key(feed):
+                    articles[feed] = []
+                articles[feed].append(
+                    dict(title=title, url=url, date=pubdate, description=description,
+                         content=''))
+
+
+        return articles
 
     def print_version(self, url):
         return url + '?&pagewanted=print'