from calibre.web.feeds.jsnews import JavascriptRecipe
from calibre.web.jsbrowser.browser import NotAFile


def CSSSelect(expr):
    expr = {
        'div.whatsNews-simple': '''descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' whatsNews-simple ')]''',
        'a.mjLinkItem[href]': '''descendant-or-self::a[@class and contains(concat(' ', normalize-space(@class), ' '), ' mjLinkItem ') and (@href)]''',
        '.meta_sectionName': '''descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), ' meta_sectionName ')]''',
        'p': 'descendant-or-self::p',
        'div.whatsNews-simple.whatsNews-itp': '''descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' whatsNews-simple ') and (@class and contains(concat(' ', normalize-space(@class), ' '), ' whatsNews-itp '))]''',
        'a[href]': 'descendant-or-self::a[@href]',
        'span.date-date': "descendant-or-self::span[@class and contains(concat(' ', normalize-space(@class), ' '), ' date-date ')]",
        'div.itpSectionHeaderPdf a[href]': "descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' itpSectionHeaderPdf ')]/descendant-or-self::*/a[@href]",
        'div.itpHeader ul.tab a[href]': "descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' itpHeader ')]/descendant-or-self::*/ul[@class and contains(concat(' ', normalize-space(@class), ' '), ' tab ')]/descendant-or-self::*/a[@href]",
    }[expr]
    from lxml.etree import XPath
    return XPath(expr)


class WSJ(JavascriptRecipe):

    title = 'Wall Street Journal (free)'
    __author__ = 'Kovid Goyal'
    description = '''News and current affairs. This recipe only fetches
    complete versions of the articles that are available free on the
    wsj.com website. To get the rest of the articles, subscribe to the
    WSJ and use the other WSJ recipe.'''
    language = 'en'

    compress_news_images = True
    compress_news_images_auto_size = 5
    max_articles_per_feed = 1000
    timefmt = ' [%a, %b %d, %Y]'
    no_stylesheets = True
    ignore_duplicate_articles = {'url'}
    remove_attributes = ['style', 'data-scrim']

    keep_only_tags = (
        'h1',  # 'h2.subhead', 'h2.subHed.deck',
        'span[itemprop=author][rel=author]',
        'article#article-contents', 'article#articleBody',
        'div#article_story_body', 'header.article_header',
        # Parallax formatting
        'div#ncTitleArea', 'section.nc-exp-artbody',
        # Error conditions, login required and page not found
        'div#snippet-ad-login', 'div.wsj-snippet-body', 'div.wsj-snippet-login',
        'div.errorNotFound',
    )

    remove_tags = (
        '.insetButton', '.insettipBox',
        '.author-info', '.media-object-video',
        '.article_tools', 'span[data-country-code][data-ticker-code]',
        'div.nc-exp-artmeta',
    )

    def preprocess_stage2(self, article, browser, url, recursion_level):
        # Slideshow and expandable images need to be processed here to
        # set the src attribute correctly
        found = 0
        for img in browser.css_select('img[data-in-base-data-lazy]', all=True):
            img.setAttribute('src', img.attribute('data-in-base-data-lazy'))
            found += 1
        for img in browser.css_select('img[data-enlarge]', all=True):
            img.setAttribute('src', img.attribute('data-enlarge'))
            found += 1
        if found:
            self.log.debug('Found %d dynamic images in:' % found, url)

    def get_publication_data(self, browser):
        return self.get_wsj_index(browser)

    def abs_wsj_url(self, href):
        if not href.startswith('http'):
            href = 'http://online.wsj.com' + href
        return href

    def wsj_find_articles(self, url):
        root = self.index_to_soup(url)

        for x in CSSSelect('div.whatsNews-simple')(root):
            x.getparent().remove(x)

        articles = []

        for a in CSSSelect('a.mjLinkItem[href]')(root):
            container = a.xpath('ancestor::li')
            meta = CSSSelect('.meta_sectionName')(a)
            if meta:
                meta = meta[0]
                meta.getparent().remove(meta)
                meta = self.tag_to_string(meta)
            title = self.tag_to_string(a)
            if meta:
                title += ' [%s]' % meta
            url = self.abs_wsj_url(a.get('href'))
            desc = ''
            if container:
                for p in CSSSelect('p')(container[0]):
                    desc = self.tag_to_string(p)
                    if 'Subscriber Content' not in desc:
                        break

            articles.append({'title': title, 'url': url,
                             'description': desc, 'date': ''})

            self.log('\tFound article:', title)
            self.log('\t\t', desc)

        return articles

    def wsj_find_wn_articles(self, url):
        root = self.index_to_soup(url)
        articles = []

        whats_news = CSSSelect('div.whatsNews-simple.whatsNews-itp')(root)
        if whats_news:
            for a in CSSSelect('a[href]')(whats_news[-1]):
                if '/articles/' not in a.get('href', ''):
                    continue
                container = a.xpath('ancestor::p')
                for meta in CSSSelect('.meta_sectionName')(a):
                    meta.getparent().remove(meta)
                title = self.tag_to_string(a).strip()
                url = self.abs_wsj_url(a.get('href'))
                desc = ''
                if container:
                    desc = self.tag_to_string(container[0])

                articles.append({'title': title, 'url': url,
                                 'description': desc, 'date': ''})

                self.log('\tFound WN article:', title)
                self.log('\t\t', desc)

        return articles

    def wsj_add_feed(self, feeds, title, url):
        self.log('Found section:', title)
        try:
            if url.endswith('whatsnews'):
                articles = self.wsj_find_wn_articles(url)
            else:
                articles = self.wsj_find_articles(url)
        except:
            articles = []
        if articles:
            feeds.append((title, articles))

    def get_wsj_index(self, browser):
        # return self.test_wsj_index()
        ans = {}

        root = self.index_to_soup('http://online.wsj.com/itp')
        for span in CSSSelect('span.date-date')(root):
            if span.text:
                self.timefmt = span.text
                break

        for a in CSSSelect('div.itpSectionHeaderPdf a[href]')(root):
            href = a.get('href')
            try:
                ans['cover'] = browser.download_file(href)
            except NotAFile:
                break
            break

        feeds = ans['index'] = []
        for a in CSSSelect('div.itpHeader ul.tab a[href]')(root):
            if '/itp/' not in a.get('href', ''):
                continue
            pageone = a.get('href').endswith('pageone')
            if pageone:
                title = 'Front Section'
                url = self.abs_wsj_url(a.get('href'))
                self.wsj_add_feed(feeds, title, url)
                title = "What's News"
                url = url.replace('pageone', 'whatsnews')
                self.wsj_add_feed(feeds, title, url)
            else:
                title = self.tag_to_string(a)
                url = self.abs_wsj_url(a.get('href'))
                self.wsj_add_feed(feeds, title, url)
        return ans

    def test_wsj_index(self):
        return {'index': [
            ('Testing', [
                {'title': 'Article One', 'url': 'http://online.wsj.com/articles/the-end-of-the-impulse-shopper-1416872108'},  # noqa
                {'title': 'Article Two', 'url': 'http://online.wsj.com/articles/ferguson-police-officer-not-charged-in-black-teens-shooting-1416882438'},  # noqa
                {'title': 'Article Three', 'url': 'http://online.wsj.com/article/SB10634695869867284248804580297251334393676.html'},  # noqa
            ]),
        ]}
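
# Development note (a minimal sketch, not part of the recipe itself): calibre
# can run a recipe file directly through ebook-convert, and the --test flag
# restricts the download to a couple of feeds/articles, which is the usual way
# to iterate on selectors like the ones above. The filename below is
# illustrative; save the recipe under whatever name you prefer.
#
#   ebook-convert wsj_free.recipe output.epub --test
#
# To skip index scraping entirely and exercise only article processing against
# the canned URLs in test_wsj_index(), uncomment the
# `return self.test_wsj_index()` line at the top of get_wsj_index().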