Use today's paper to generate WSJ recipe

This commit is contained in:
Kovid Goyal 2009-12-22 14:07:50 -07:00
parent 2213a4648a
commit f79a63d19c
2 changed files with 51 additions and 56 deletions

View File

@ -3,15 +3,16 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
# http://online.wsj.com/page/us_in_todays_paper.html # http://online.wsj.com/page/us_in_todays_paper.html
class WallStreetJournal(BasicNewsRecipe): class WallStreetJournal(BasicNewsRecipe):
title = 'The Wall Street Journal' title = 'The Wall Street Journal (US)'
__author__ = 'Kovid Goyal and Sujata Raman' __author__ = 'Kovid Goyal and Sujata Raman'
description = 'News and current affairs.' description = 'News and current affairs'
needs_subscription = True needs_subscription = True
language = 'en' language = 'en'
@ -60,60 +61,51 @@ class WallStreetJournal(BasicNewsRecipe):
return soup return soup
def get_article_url(self, article): def wsj_get_index(self):
try: return self.index_to_soup('http://online.wsj.com/page/us_in_todays_paper.html')
return article.feedburner_origlink.split('?')[0]
except AttributeError: def parse_index(self):
return article.link.split('?')[0] soup = self.wsj_get_index()
left_column = soup.find(
text=lambda t: 'begin ITP Left Column' in str(t))
table = left_column.findNext('table')
current_section = None
current_articles = []
feeds = []
for x in table.findAllNext(True):
if x.name == 'td' and x.get('class', None) == 'b13':
if current_articles and current_section:
feeds.append((current_section, current_articles))
current_section = self.tag_to_string(x.a).strip()
current_articles = []
self.log('\tProcessing section:', current_section)
if current_section is not None and x.name == 'a' and \
x.get('class', None) == 'bold80':
title = self.tag_to_string(x)
url = x.get('href', False)
if not url or not title:
continue
url = url.partition('#')[0]
desc = ''
d = x.findNextSibling(True)
if d.get('class', None) == 'arialResize':
desc = self.tag_to_string(d)
desc = desc.partition(u'\u2022')[0]
self.log('\t\tFound article:', title)
self.log('\t\t\t', url)
if url.startswith('/'):
url = 'http://online.wsj.com'+url
if desc:
self.log('\t\t\t', desc)
current_articles.append({'title': title, 'url':url,
'description':desc, 'date':''})
return feeds
def cleanup(self): def cleanup(self):
self.browser.open('http://online.wsj.com/logout?url=http://online.wsj.com') self.browser.open('http://online.wsj.com/logout?url=http://online.wsj.com')
feeds = [
#('Most Emailed - Day', 'http://online.wsj.com/xml/rss/3_7030.xml'),
#('Most Emailed - Week', 'http://online.wsj.com/xml/rss/3_7253.xml'),
#('Most Emailed - Month', 'http://online.wsj.com/xml/rss/3_7254.xml'),
(' Most Viewed - Day', 'http://online.wsj.com/xml/rss/3_7198.xml'),
(' Most Viewed - Week', 'http://online.wsj.com/xml/rss/3_7251.xml'),
#('Most Viewed - Month', 'http://online.wsj.com/xml/rss/3_7252.xml'),
('Today\'s Newspaper - Page One', 'http://online.wsj.com/xml/rss/3_7205.xml'),
('Today\'s Newspaper - Marketplace', 'http://online.wsj.com/xml/rss/3_7206.xml'),
('Today\'s Newspaper - Money & Investing', 'http://online.wsj.com/xml/rss/3_7207.xml'),
('Today\'s Newspaper - Personal Journal', 'http://online.wsj.com/xml/rss/3_7208.xml'),
('Today\'s Newspaper - Weekend Journal', 'http://online.wsj.com/xml/rss/3_7209.xml'),
('Opinion', 'http://online.wsj.com/xml/rss/3_7041.xml'),
('News - U.S.: What\'s News', 'http://online.wsj.com/xml/rss/3_7011.xml'),
('News - U.S. Business', 'http://online.wsj.com/xml/rss/3_7014.xml'),
('News - Europe: What\'s News', 'http://online.wsj.com/xml/rss/3_7012.xml'),
('News - Asia: What\'s News', 'http://online.wsj.com/xml/rss/3_7013.xml'),
('News - World News', 'http://online.wsj.com/xml/rss/3_7085.xml'),
('News - Economy', 'http://online.wsj.com/xml/rss/3_7086.xml'),
('News - Earnings', 'http://online.wsj.com/xml/rss/3_7088.xml'),
('News - Health', 'http://online.wsj.com/xml/rss/3_7089.xml'),
('News - Law', 'http://online.wsj.com/xml/rss/3_7091.xml'),
('News - Media & Marketing', 'http://online.wsj.com/xml/rss/3_7020.xml'),
('Technology - What\'s News', 'http://online.wsj.com/xml/rss/3_7015.xml'),
('Technology - Gadgets', 'http://online.wsj.com/xml/rss/3_7094.xml'),
('Technology - Telecommunications', 'http://online.wsj.com/xml/rss/3_7095.xml'),
('Technology - E-commerce/Media', 'http://online.wsj.com/xml/rss/3_7096.xml'),
('Technology - Asia', 'http://online.wsj.com/xml/rss/3_7097.xml'),
('Technology - Europe', 'http://online.wsj.com/xml/rss/3_7098.xml'),
('Markets - News', 'http://online.wsj.com/xml/rss/3_7031.xml'),
('Markets - Europe News', 'http://online.wsj.com/xml/rss/3_7101.xml'),
('Markets - Asia News', 'http://online.wsj.com/xml/rss/3_7102.xml'),
('Markets - Deals & Deal Makers', 'http://online.wsj.com/xml/rss/3_7099.xml'),
('Markets - Hedge Funds', 'http://online.wsj.com/xml/rss/3_7199.xml'),
('Personal Journal', 'http://online.wsj.com/xml/rss/3_7200.xml'),
('Personal Journal - Money', 'http://online.wsj.com/xml/rss/3_7104.xml'),
('Personal Journal - Health', 'http://online.wsj.com/xml/rss/3_7089.xml'),
('Personal Journal - Autos', 'http://online.wsj.com/xml/rss/3_7092.xml'),
('Personal Journal - Homes', 'http://online.wsj.com/xml/rss/3_7105.xml'),
('Personal Journal - Travel', 'http://online.wsj.com/xml/rss/3_7106.xml'),
('Personal Journal - Careers', 'http://online.wsj.com/xml/rss/3_7107.xml'),
('Weekend & Leisure', 'http://online.wsj.com/xml/rss/3_7201.xml'),
('Weekend & Leisure - Weekend Journal', 'http://online.wsj.com/xml/rss/3_7202.xml'),
('Weekend & Leisure - Arts & Entertainment', 'http://online.wsj.com/xml/rss/3_7177.xml'),
('Weekend & Leisure - Books', 'http://online.wsj.com/xml/rss/3_7203.xml'),
('Weekend & Leisure - Sports', 'http://online.wsj.com/xml/rss/3_7204.xml'),
]

View File

@ -1063,7 +1063,7 @@ class BasicNewsRecipe(Recipe):
return parsed_feeds return parsed_feeds
@classmethod @classmethod
def tag_to_string(self, tag, use_alt=True): def tag_to_string(self, tag, use_alt=True, normalize_whitespace=True):
''' '''
Convenience method to take a Convenience method to take a
`BeautifulSoup <http://www.crummy.com/software/BeautifulSoup/documentation.html>`_ `BeautifulSoup <http://www.crummy.com/software/BeautifulSoup/documentation.html>`_
@ -1090,7 +1090,10 @@ class BasicNewsRecipe(Recipe):
strings.append(res) strings.append(res)
elif use_alt and item.has_key('alt'): elif use_alt and item.has_key('alt'):
strings.append(item['alt']) strings.append(item['alt'])
return u''.join(strings) ans = u''.join(strings)
if normalize_whitespace:
ans = re.sub(r'\s+', ' ', ans)
return ans
@classmethod @classmethod
def soup(cls, raw): def soup(cls, raw):