diff --git a/resources/recipes/wsj.recipe b/resources/recipes/wsj.recipe
index 513d0a7024..e27b34c08d 100644
--- a/resources/recipes/wsj.recipe
+++ b/resources/recipes/wsj.recipe
@@ -3,15 +3,16 @@
 __license__ = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
 __docformat__ = 'restructuredtext en'
+
 
 from calibre.web.feeds.news import BasicNewsRecipe
 
 # http://online.wsj.com/page/us_in_todays_paper.html
 
 class WallStreetJournal(BasicNewsRecipe):
 
-    title = 'The Wall Street Journal'
+    title = 'The Wall Street Journal (US)'
     __author__ = 'Kovid Goyal and Sujata Raman'
-    description = 'News and current affairs.'
+    description = 'News and current affairs'
     needs_subscription = True
     language = 'en'
@@ -61,60 +61,67 @@ class WallStreetJournal(BasicNewsRecipe):
 
         return soup
 
-    def get_article_url(self, article):
-        try:
-            return article.feedburner_origlink.split('?')[0]
-        except AttributeError:
-            return article.link.split('?')[0]
+    def wsj_get_index(self):
+        '''Return the parsed "Today's Paper" index page of the US edition.'''
+        return self.index_to_soup('http://online.wsj.com/page/us_in_todays_paper.html')
+
+    def parse_index(self):
+        '''
+        Build the list of feeds from the "Today's Paper" index page.
+
+        Returns a list of ``(section_title, articles)`` tuples, where each
+        article is a dict with the keys ``title``, ``url``, ``description``
+        and ``date``.
+        '''
+        soup = self.wsj_get_index()
+
+        left_column = soup.find(
+                text=lambda t: 'begin ITP Left Column' in str(t))
+
+        table = left_column.findNext('table')
+
+        current_section = None
+        current_articles = []
+        feeds = []
+        for x in table.findAllNext(True):
+            if x.name == 'td' and x.get('class', None) == 'b13':
+                # A new section header: store the section collected so far.
+                if current_articles and current_section:
+                    feeds.append((current_section, current_articles))
+                current_section = self.tag_to_string(x.a).strip()
+                current_articles = []
+                self.log('\tProcessing section:', current_section)
+            if current_section is not None and x.name == 'a' and \
+                    x.get('class', None) == 'bold80':
+                title = self.tag_to_string(x)
+                url = x.get('href', False)
+                if not url or not title:
+                    continue
+                url = url.partition('#')[0]
+                desc = ''
+                d = x.findNextSibling(True)
+                # findNextSibling returns None when no sibling tag follows.
+                if d is not None and d.get('class', None) == 'arialResize':
+                    desc = self.tag_to_string(d)
+                    # Keep only the text before the first bullet character.
+                    desc = desc.partition(u'\u2022')[0]
+                self.log('\t\tFound article:', title)
+                self.log('\t\t\t', url)
+                if url.startswith('/'):
+                    url = 'http://online.wsj.com'+url
+                if desc:
+                    self.log('\t\t\t', desc)
+                current_articles.append({'title': title, 'url':url,
+                    'description':desc, 'date':''})
+
+        # The final section is not followed by another header, so it has to
+        # be stored explicitly or it would be dropped.
+        if current_articles and current_section:
+            feeds.append((current_section, current_articles))
+
+        return feeds
 
     def cleanup(self):
         self.browser.open('http://online.wsj.com/logout?url=http://online.wsj.com')
 
 
-    feeds = [
-        #('Most Emailed - Day', 'http://online.wsj.com/xml/rss/3_7030.xml'),
-        #('Most Emailed - Week', 'http://online.wsj.com/xml/rss/3_7253.xml'),
-        #('Most Emailed - Month', 'http://online.wsj.com/xml/rss/3_7254.xml'),
-        (' Most Viewed - Day', 'http://online.wsj.com/xml/rss/3_7198.xml'),
-        (' Most Viewed - Week', 'http://online.wsj.com/xml/rss/3_7251.xml'),
-        #('Most Viewed - Month', 'http://online.wsj.com/xml/rss/3_7252.xml'),
-        ('Today\'s Newspaper - Page One', 'http://online.wsj.com/xml/rss/3_7205.xml'),
-        ('Today\'s Newspaper - Marketplace', 'http://online.wsj.com/xml/rss/3_7206.xml'),
-        ('Today\'s Newspaper - Money & Investing', 'http://online.wsj.com/xml/rss/3_7207.xml'),
-        ('Today\'s Newspaper - Personal Journal', 'http://online.wsj.com/xml/rss/3_7208.xml'),
-        ('Today\'s Newspaper - Weekend Journal', 'http://online.wsj.com/xml/rss/3_7209.xml'),
-        ('Opinion', 'http://online.wsj.com/xml/rss/3_7041.xml'),
-        ('News - U.S.: What\'s News', 'http://online.wsj.com/xml/rss/3_7011.xml'),
-        ('News - U.S. Business', 'http://online.wsj.com/xml/rss/3_7014.xml'),
-        ('News - Europe: What\'s News', 'http://online.wsj.com/xml/rss/3_7012.xml'),
-        ('News - Asia: What\'s News', 'http://online.wsj.com/xml/rss/3_7013.xml'),
-        ('News - World News', 'http://online.wsj.com/xml/rss/3_7085.xml'),
-        ('News - Economy', 'http://online.wsj.com/xml/rss/3_7086.xml'),
-        ('News - Earnings', 'http://online.wsj.com/xml/rss/3_7088.xml'),
-        ('News - Health', 'http://online.wsj.com/xml/rss/3_7089.xml'),
-        ('News - Law', 'http://online.wsj.com/xml/rss/3_7091.xml'),
-        ('News - Media & Marketing', 'http://online.wsj.com/xml/rss/3_7020.xml'),
-        ('Technology - What\'s News', 'http://online.wsj.com/xml/rss/3_7015.xml'),
-        ('Technology - Gadgets', 'http://online.wsj.com/xml/rss/3_7094.xml'),
-        ('Technology - Telecommunications', 'http://online.wsj.com/xml/rss/3_7095.xml'),
-        ('Technology - E-commerce/Media', 'http://online.wsj.com/xml/rss/3_7096.xml'),
-        ('Technology - Asia', 'http://online.wsj.com/xml/rss/3_7097.xml'),
-        ('Technology - Europe', 'http://online.wsj.com/xml/rss/3_7098.xml'),
-        ('Markets - News', 'http://online.wsj.com/xml/rss/3_7031.xml'),
-        ('Markets - Europe News', 'http://online.wsj.com/xml/rss/3_7101.xml'),
-        ('Markets - Asia News', 'http://online.wsj.com/xml/rss/3_7102.xml'),
-        ('Markets - Deals & Deal Makers', 'http://online.wsj.com/xml/rss/3_7099.xml'),
-        ('Markets - Hedge Funds', 'http://online.wsj.com/xml/rss/3_7199.xml'),
-        ('Personal Journal', 'http://online.wsj.com/xml/rss/3_7200.xml'),
-        ('Personal Journal - Money', 'http://online.wsj.com/xml/rss/3_7104.xml'),
-        ('Personal Journal - Health', 'http://online.wsj.com/xml/rss/3_7089.xml'),
-        ('Personal Journal - Autos', 'http://online.wsj.com/xml/rss/3_7092.xml'),
-        ('Personal Journal - Homes', 'http://online.wsj.com/xml/rss/3_7105.xml'),
-        ('Personal Journal - Travel', 'http://online.wsj.com/xml/rss/3_7106.xml'),
-        ('Personal Journal - Careers', 'http://online.wsj.com/xml/rss/3_7107.xml'),
-        ('Weekend & Leisure', 'http://online.wsj.com/xml/rss/3_7201.xml'),
-        ('Weekend & Leisure - Weekend Journal', 'http://online.wsj.com/xml/rss/3_7202.xml'),
-        ('Weekend & Leisure - Arts & Entertainment', 'http://online.wsj.com/xml/rss/3_7177.xml'),
-        ('Weekend & Leisure - Books', 'http://online.wsj.com/xml/rss/3_7203.xml'),
-        ('Weekend & Leisure - Sports', 'http://online.wsj.com/xml/rss/3_7204.xml'),
-    ]
diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py
index b2d0d4d2ce..bffe0b7f34 100644
--- a/src/calibre/web/feeds/news.py
+++ b/src/calibre/web/feeds/news.py
@@ -1063,7 +1063,7 @@ class BasicNewsRecipe(Recipe):
         return parsed_feeds
 
     @classmethod
-    def tag_to_string(self, tag, use_alt=True):
+    def tag_to_string(self, tag, use_alt=True, normalize_whitespace=True):
         '''
         Convenience method to take a
         `BeautifulSoup `_
@@ -1090,7 +1090,10 @@ class BasicNewsRecipe(Recipe):
                     strings.append(res)
             elif use_alt and item.has_key('alt'):
                 strings.append(item['alt'])
-        return u''.join(strings)
+        ans = u''.join(strings)
+        if normalize_whitespace:
+            ans = re.sub(r'\s+', ' ', ans)
+        return ans
 
     @classmethod
     def soup(cls, raw):