mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
Use today's paper to generate WSJ recipe
This commit is contained in:
parent
2213a4648a
commit
f79a63d19c
@ -3,15 +3,16 @@ __license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
# http://online.wsj.com/page/us_in_todays_paper.html
|
||||
|
||||
class WallStreetJournal(BasicNewsRecipe):
|
||||
|
||||
title = 'The Wall Street Journal'
|
||||
title = 'The Wall Street Journal (US)'
|
||||
__author__ = 'Kovid Goyal and Sujata Raman'
|
||||
description = 'News and current affairs.'
|
||||
description = 'News and current affairs'
|
||||
needs_subscription = True
|
||||
language = 'en'
|
||||
|
||||
@ -60,60 +61,51 @@ class WallStreetJournal(BasicNewsRecipe):
|
||||
|
||||
return soup
|
||||
|
||||
def get_article_url(self, article):
    '''
    Return the URL for ``article`` with any query string removed.

    The ``feedburner_origlink`` attribute is used when the article has
    one; if accessing or splitting it raises ``AttributeError`` the
    article's plain ``link`` attribute is used instead.
    '''
    try:
        cleaned = article.feedburner_origlink.split('?')[0]
    except AttributeError:
        # No usable feedburner link on this entry: fall back to ``link``
        cleaned = article.link.split('?')[0]
    return cleaned
|
||||
def wsj_get_index(self):
    '''
    Fetch the US "In Today's Paper" index page and return whatever
    ``self.index_to_soup`` produces for it.
    '''
    index_url = 'http://online.wsj.com/page/us_in_todays_paper.html'
    return self.index_to_soup(index_url)
|
||||
|
||||
def parse_index(self):
    '''
    Build the list of sections and articles from the index page
    returned by :meth:`wsj_get_index`.

    The page is scanned starting at the first ``<table>`` that follows
    the text node containing ``begin ITP Left Column``. A ``<td>`` with
    class ``b13`` starts a new section (its title is the text of the
    cell's first ``<a>``); an ``<a>`` with class ``bold80`` is an article
    headline belonging to the current section.

    :return: A list of ``(section_title, articles)`` tuples, where
             ``articles`` is a list of dicts with the keys ``title``,
             ``url``, ``description`` and ``date``. Sections without any
             articles are omitted.
    '''
    soup = self.wsj_get_index()

    left_column = soup.find(
            text=lambda t: 'begin ITP Left Column' in str(t))

    table = left_column.findNext('table')

    current_section = None
    current_articles = []
    feeds = []
    for x in table.findAllNext(True):
        if x.name == 'td' and x.get('class', None) == 'b13':
            # New section header: store the section collected so far
            if current_articles and current_section:
                feeds.append((current_section, current_articles))
            current_section = self.tag_to_string(x.a).strip()
            current_articles = []
            self.log('\tProcessing section:', current_section)

        if current_section is not None and x.name == 'a' and \
                x.get('class', None) == 'bold80':
            title = self.tag_to_string(x)
            url = x.get('href', False)
            if not url or not title:
                continue
            # Drop any in-page anchor from the link
            url = url.partition('#')[0]
            desc = ''
            d = x.findNextSibling(True)
            # A headline may have no following sibling tag at all, in
            # which case there is simply no description to extract.
            if d is not None and d.get('class', None) == 'arialResize':
                desc = self.tag_to_string(d)
                # Keep only the text before the first bullet character
                desc = desc.partition(u'\u2022')[0]
            self.log('\t\tFound article:', title)
            self.log('\t\t\t', url)
            if url.startswith('/'):
                # Site-relative link: make it absolute
                url = 'http://online.wsj.com'+url
            if desc:
                self.log('\t\t\t', desc)
            current_articles.append({'title': title, 'url':url,
                'description':desc, 'date':''})

    # The loop only stores a section when the *next* header is seen, so
    # the final section has to be stored explicitly or it would be lost.
    if current_articles and current_section:
        feeds.append((current_section, current_articles))

    return feeds
|
||||
|
||||
def cleanup(self):
    '''
    Open the WSJ logout URL with ``self.browser``.
    '''
    logout_url = 'http://online.wsj.com/logout?url=http://online.wsj.com'
    self.browser.open(logout_url)
|
||||
|
||||
# (title, RSS URL) pairs. Entries that are commented out are disabled.
# The leading space in the two "Most Viewed" titles is part of the title.
feeds = [
    # ("Most Emailed - Day", "http://online.wsj.com/xml/rss/3_7030.xml"),
    # ("Most Emailed - Week", "http://online.wsj.com/xml/rss/3_7253.xml"),
    # ("Most Emailed - Month", "http://online.wsj.com/xml/rss/3_7254.xml"),
    (" Most Viewed - Day", "http://online.wsj.com/xml/rss/3_7198.xml"),
    (" Most Viewed - Week", "http://online.wsj.com/xml/rss/3_7251.xml"),
    # ("Most Viewed - Month", "http://online.wsj.com/xml/rss/3_7252.xml"),

    # Today's newspaper
    ("Today's Newspaper - Page One", "http://online.wsj.com/xml/rss/3_7205.xml"),
    ("Today's Newspaper - Marketplace", "http://online.wsj.com/xml/rss/3_7206.xml"),
    ("Today's Newspaper - Money & Investing", "http://online.wsj.com/xml/rss/3_7207.xml"),
    ("Today's Newspaper - Personal Journal", "http://online.wsj.com/xml/rss/3_7208.xml"),
    ("Today's Newspaper - Weekend Journal", "http://online.wsj.com/xml/rss/3_7209.xml"),

    ("Opinion", "http://online.wsj.com/xml/rss/3_7041.xml"),

    # News
    ("News - U.S.: What's News", "http://online.wsj.com/xml/rss/3_7011.xml"),
    ("News - U.S. Business", "http://online.wsj.com/xml/rss/3_7014.xml"),
    ("News - Europe: What's News", "http://online.wsj.com/xml/rss/3_7012.xml"),
    ("News - Asia: What's News", "http://online.wsj.com/xml/rss/3_7013.xml"),
    ("News - World News", "http://online.wsj.com/xml/rss/3_7085.xml"),
    ("News - Economy", "http://online.wsj.com/xml/rss/3_7086.xml"),
    ("News - Earnings", "http://online.wsj.com/xml/rss/3_7088.xml"),
    ("News - Health", "http://online.wsj.com/xml/rss/3_7089.xml"),
    ("News - Law", "http://online.wsj.com/xml/rss/3_7091.xml"),
    ("News - Media & Marketing", "http://online.wsj.com/xml/rss/3_7020.xml"),

    # Technology
    ("Technology - What's News", "http://online.wsj.com/xml/rss/3_7015.xml"),
    ("Technology - Gadgets", "http://online.wsj.com/xml/rss/3_7094.xml"),
    ("Technology - Telecommunications", "http://online.wsj.com/xml/rss/3_7095.xml"),
    ("Technology - E-commerce/Media", "http://online.wsj.com/xml/rss/3_7096.xml"),
    ("Technology - Asia", "http://online.wsj.com/xml/rss/3_7097.xml"),
    ("Technology - Europe", "http://online.wsj.com/xml/rss/3_7098.xml"),

    # Markets
    ("Markets - News", "http://online.wsj.com/xml/rss/3_7031.xml"),
    ("Markets - Europe News", "http://online.wsj.com/xml/rss/3_7101.xml"),
    ("Markets - Asia News", "http://online.wsj.com/xml/rss/3_7102.xml"),
    ("Markets - Deals & Deal Makers", "http://online.wsj.com/xml/rss/3_7099.xml"),
    ("Markets - Hedge Funds", "http://online.wsj.com/xml/rss/3_7199.xml"),

    # Personal Journal
    ("Personal Journal", "http://online.wsj.com/xml/rss/3_7200.xml"),
    ("Personal Journal - Money", "http://online.wsj.com/xml/rss/3_7104.xml"),
    ("Personal Journal - Health", "http://online.wsj.com/xml/rss/3_7089.xml"),
    ("Personal Journal - Autos", "http://online.wsj.com/xml/rss/3_7092.xml"),
    ("Personal Journal - Homes", "http://online.wsj.com/xml/rss/3_7105.xml"),
    ("Personal Journal - Travel", "http://online.wsj.com/xml/rss/3_7106.xml"),
    ("Personal Journal - Careers", "http://online.wsj.com/xml/rss/3_7107.xml"),

    # Weekend & Leisure
    ("Weekend & Leisure", "http://online.wsj.com/xml/rss/3_7201.xml"),
    ("Weekend & Leisure - Weekend Journal", "http://online.wsj.com/xml/rss/3_7202.xml"),
    ("Weekend & Leisure - Arts & Entertainment", "http://online.wsj.com/xml/rss/3_7177.xml"),
    ("Weekend & Leisure - Books", "http://online.wsj.com/xml/rss/3_7203.xml"),
    ("Weekend & Leisure - Sports", "http://online.wsj.com/xml/rss/3_7204.xml"),
]
|
||||
|
||||
|
@ -1063,7 +1063,7 @@ class BasicNewsRecipe(Recipe):
|
||||
return parsed_feeds
|
||||
|
||||
@classmethod
|
||||
def tag_to_string(self, tag, use_alt=True):
|
||||
def tag_to_string(self, tag, use_alt=True, normalize_whitespace=True):
|
||||
'''
|
||||
Convenience method to take a
|
||||
`BeautifulSoup <http://www.crummy.com/software/BeautifulSoup/documentation.html>`_
|
||||
@ -1090,7 +1090,10 @@ class BasicNewsRecipe(Recipe):
|
||||
strings.append(res)
|
||||
elif use_alt and item.has_key('alt'):
|
||||
strings.append(item['alt'])
|
||||
return u''.join(strings)
|
||||
ans = u''.join(strings)
|
||||
if normalize_whitespace:
|
||||
ans = re.sub(r'\s+', ' ', ans)
|
||||
return ans
|
||||
|
||||
@classmethod
|
||||
def soup(cls, raw):
|
||||
|
Loading…
x
Reference in New Issue
Block a user