#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'

import copy

from calibre.web.feeds.news import BasicNewsRecipe


class WallStreetJournal(BasicNewsRecipe):
    """Recipe that collects the freely readable WSJ articles.

    The section list is scraped from http://online.wsj.com/itp; every
    section tab found there becomes one feed.
    """

    title = 'Wall Street Journal (free)'
    __author__ = 'Kovid Goyal, Sujata Raman, Joshua Oster-Morris, Starson17'
    description = (
        'News and current affairs. This recipe only fetches complete '
        'versions of the articles that are available free on the wsj.com '
        'website. To get the rest of the articles, subscribe to the WSJ and '
        'use the other WSJ recipe.'
    )
    language = 'en'
    cover_url = 'http://dealbreaker.com/images/thumbs/Wall%20Street%20Journal%20A1.JPG'
    max_articles_per_feed = 1000
    # Default date suffix; parse_index() overwrites it with the date printed
    # on the index page when that element is present.
    timefmt = ' [%a, %b %d, %Y]'
    no_stylesheets = True

    # One CSS rule per literal; the pieces are joined by single spaces.
    extra_css = (
        'h1{color:#093D72 ; font-size:large ; font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; } '
        'h2{color:#474537; font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small; font-style:italic;} '
        '.subhead{color:gray; font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small; font-style:italic;} '
        '.insettipUnit {color:#666666; font-family:Arial,Sans-serif;font-size:xx-small } '
        '.targetCaption{ font-size:x-small; color:#333333; font-family:Arial,Helvetica,sans-serif} '
        '.article{font-family :Arial,Helvetica,sans-serif; font-size:x-small} '
        '.tagline {color:#333333; font-size:xx-small} '
        '.dateStamp {color:#666666; font-family:Arial,Helvetica,sans-serif} '
        'h3{color:blue ;font-family:Arial,Helvetica,sans-serif; font-size:xx-small} '
        '.byline{color:blue;font-family:Arial,Helvetica,sans-serif; font-size:xx-small} '
        'h6{color:#333333; font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small;font-style:italic; } '
        '.paperLocation{color:#666666; font-size:xx-small}'
    )

    remove_tags_before = dict(name='h1')
    remove_tags = [
        dict(id=[
            "articleTabs_tab_article",
            "articleTabs_tab_comments",
            "articleTabs_tab_interactive",
            "articleTabs_tab_video",
            "articleTabs_tab_map",
            "articleTabs_tab_slideshow",
        ]),
        {'class': [
            'footer_columns', 'network', 'insetCol3wide', 'interactive',
            'video', 'slideshow', 'map', 'insettip', 'insetClose', 'more_in',
            "insetContent", 'articleTools_bottom', 'aTools', "tooltip",
            "adSummary", "nav-inline",
        ]},
        dict(name='div', attrs={'data-flash-settings': True}),
        {'class': [
            'insetContent embedType-interactive insetCol3wide',
            'insetCol6wide',
            'insettipUnit',
        ]},
        dict(rel='shortcut icon'),
    ]
    remove_tags_after = [
        dict(id="article_story_body"),
        {'class': "article story"},
    ]

    def postprocess_html(self, soup, first):
        """Flatten table markup and drop the article thumbnail blocks.

        :param soup: parsed article page; it is modified in place.
        :param first: unused by this recipe.
        :return: the same ``soup`` object.
        """
        # Tables are turned into plain divs so the content flows as text.
        for tag in soup.findAll(name=['table', 'tr', 'td']):
            tag.name = 'div'
        # Ids articleThumbnail_1 .. articleThumbnail_7.
        thumbnail_ids = ['articleThumbnail_%d' % n for n in range(1, 8)]
        for tag in soup.findAll('div', dict(id=thumbnail_ids)):
            tag.extract()
        return soup

    def wsj_get_index(self):
        """Fetch and parse the section index page."""
        return self.index_to_soup('http://online.wsj.com/itp')

    def wsj_add_feed(self, feeds, title, url):
        """Collect the articles of one section and append them to ``feeds``.

        Fetching is best effort: if the section page cannot be fetched or
        parsed, the failure is logged and the section is skipped.

        :param feeds: list of ``(title, articles)`` tuples, extended in place.
        :param title: feed title for this section.
        :param url: section page URL; a URL ending in ``whatsnews`` is parsed
            with :meth:`wsj_find_wn_articles`, anything else with
            :meth:`wsj_find_articles`.
        :return: the ``feeds`` list that was passed in.
        """
        self.log('Found section:', title)
        try:
            if url.endswith('whatsnews'):
                articles = self.wsj_find_wn_articles(url)
            else:
                articles = self.wsj_find_articles(url)
        except Exception as err:
            # ``Exception`` rather than a bare except so that
            # KeyboardInterrupt/SystemExit still abort the download.
            self.log('\tFailed to fetch section:', title, repr(err))
            articles = []
        if articles:
            feeds.append((title, articles))
        return feeds

    def parse_index(self):
        """Build the feed list from the tabs on the index page.

        The ``pageone`` tab yields two feeds: the front section itself and
        the matching ``whatsnews`` page. Every other tab yields one feed
        named after the tab text.

        :return: list of ``(section title, list of article dicts)`` tuples.
        """
        soup = self.wsj_get_index()

        # Prefer the date shown on the index page over the default timefmt.
        date = soup.find('span', attrs={'class': 'date-date'})
        if date is not None:
            self.timefmt = ' [%s]' % self.tag_to_string(date)

        feeds = []
        header = soup.find('div', attrs={'class': 'itpHeader'})
        tabs = header.find('ul', attrs={'class': 'tab'})
        for a in tabs.findAll('a', href=lambda x: x and '/itp/' in x):
            url = 'http://online.wsj.com' + a['href']
            if a['href'].endswith('pageone'):
                feeds = self.wsj_add_feed(feeds, 'Front Section', url)
                # The previous literal 'What''s News' was two adjacent
                # strings and evaluated to "Whats News".
                feeds = self.wsj_add_feed(
                    feeds, "What's News", url.replace('pageone', 'whatsnews'))
            else:
                feeds = self.wsj_add_feed(feeds, self.tag_to_string(a), url)
        return feeds

    def wsj_find_wn_articles(self, url):
        """Return the article entries listed on a ``whatsnews`` page.

        :param url: URL of the page to scan.
        :return: list of dicts with the keys ``title``, ``url``,
            ``description`` and ``date`` (``date`` is always empty).
        """
        soup = self.index_to_soup(url)
        articles = []

        whats_news = soup.find(
            'div', attrs={'class': lambda x: x and 'whatsNews-simple' in x})
        if whats_news is None:
            return articles

        for a in whats_news.findAll('a', href=lambda x: x and '/article/' in x):
            container = a.findParent(['p'])
            # Remove the section label so it does not end up in the title.
            meta = a.find(attrs={'class': 'meta_sectionName'})
            if meta is not None:
                meta.extract()
            title = self.tag_to_string(a).strip()
            # NOTE(review): the href is used as-is here, without the
            # 'http://online.wsj.com' prefix that wsj_find_articles adds;
            # presumably these links are already absolute. Confirm.
            href = a['href']
            desc = ''
            if container is not None:
                desc = self.tag_to_string(container)

            articles.append({'title': title, 'url': href,
                             'description': desc, 'date': ''})
            self.log('\tFound WN article:', title)

        return articles

    def wsj_find_articles(self, url):
        """Return the article entries listed on a regular section page.

        :param url: URL of the section page to scan.
        :return: list of dicts with the keys ``title``, ``url``,
            ``description`` and ``date`` (``date`` is always empty).
        """
        soup = self.index_to_soup(url)

        # The "what's news" box is handled by wsj_find_wn_articles; drop it
        # here so its links are not picked up by the scan below.
        whats_news = soup.find(
            'div', attrs={'class': lambda x: x and 'whatsNews-simple' in x})
        if whats_news is not None:
            whats_news.extract()

        articles = []

        # Tag the lead story link as 'mjLinkItem' so the loop below picks
        # it up together with the regular links.
        flavorarea = soup.find(
            'div', attrs={'class': lambda x: x and 'ahed' in x})
        if flavorarea is not None:
            flavorstory = flavorarea.find(
                'a', href=lambda x: x and x.startswith('/article'))
            if flavorstory is not None:
                flavorstory['class'] = 'mjLinkItem'
                metapage = soup.find(
                    'span',
                    attrs={'class': lambda x: x and 'meta_sectionName' in x})
                if metapage is not None:
                    # metapage should always be A1 because that should be
                    # first on the page
                    flavorstory.append(copy.copy(metapage))

        for a in soup.findAll('a', attrs={'class': 'mjLinkItem'}, href=True):
            container = a.findParent(['li', 'div'])

            # Pull the section label out of the link before reading the
            # title, then re-attach it in brackets.
            section = ''
            meta = a.find(attrs={'class': 'meta_sectionName'})
            if meta is not None:
                meta.extract()
                section = self.tag_to_string(meta).strip()
            title = self.tag_to_string(a).strip()
            if section:
                title += ' [%s]' % section

            link = 'http://online.wsj.com' + a['href']

            # Use the first paragraph that is not the subscriber notice; if
            # every paragraph carries the notice the last one is kept.
            desc = ''
            if container is not None:
                for p in container.findAll('p'):
                    desc = self.tag_to_string(p)
                    if 'Subscriber Content' not in desc:
                        break

            articles.append({'title': title, 'url': link,
                             'description': desc, 'date': ''})
            self.log('\tFound article:', title)

        return articles