From 568726db1fb3b351a97b843105d97ed00ce2d5bc Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 11 Oct 2015 00:09:43 +0530 Subject: [PATCH] Update Wall Street Journal --- recipes/wsj.recipe | 18 ++++++++++++++++-- recipes/wsj_free.recipe | 17 +++++++++++++++-- 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/recipes/wsj.recipe b/recipes/wsj.recipe index de928489a9..96fa96b246 100644 --- a/recipes/wsj.recipe +++ b/recipes/wsj.recipe @@ -80,7 +80,7 @@ class WSJ(JavascriptRecipe): href = 'http://online.wsj.com' + href return href - def wsj_find_articles(self, url): + def wsj_find_articles(self, url, ahed=False): root = self.index_to_soup(url) for x in CSSSelect('div.whatsNews-simple')(root): @@ -111,6 +111,20 @@ class WSJ(JavascriptRecipe): self.log('\tFound article:', title) self.log('\t\t', desc) + + if ahed: + for h2 in root.xpath('//li[@class="ahed_listitem"]/h2'): + a = h2.xpath('descendant::a')[0] + title = self.tag_to_string(a) + url = self.abs_wsj_url(a.get('href')) + desc = '' + p = h2.xpath('following-sibling::p') + if p: + desc = self.tag_to_string(p[0]) + articles.append({'title':title, 'url':url, 'description':desc, 'date':''}) + self.log('Found article:', title) + self.log('\t\t', desc) + return articles def wsj_find_wn_articles(self, url): @@ -145,7 +159,7 @@ class WSJ(JavascriptRecipe): if url.endswith('whatsnews'): articles = self.wsj_find_wn_articles(url) else: - articles = self.wsj_find_articles(url) + articles = self.wsj_find_articles(url, ahed=title == 'Front Section') except: articles = [] if articles: diff --git a/recipes/wsj_free.recipe b/recipes/wsj_free.recipe index 09c13aa1b4..b59942ddd5 100644 --- a/recipes/wsj_free.recipe +++ b/recipes/wsj_free.recipe @@ -75,7 +75,7 @@ class WSJ(JavascriptRecipe): href = 'http://online.wsj.com' + href return href - def wsj_find_articles(self, url): + def wsj_find_articles(self, url, ahed=False): root = self.index_to_soup(url) for x in CSSSelect('div.whatsNews-simple')(root): @@ -106,6 +106,19 @@ class WSJ(JavascriptRecipe): self.log('\tFound article:', title) self.log('\t\t', desc) + if ahed: + for h2 in root.xpath('//li[@class="ahed_listitem"]/h2'): + a = h2.xpath('descendant::a')[0] + title = self.tag_to_string(a) + url = self.abs_wsj_url(a.get('href')) + desc = '' + p = h2.xpath('following-sibling::p') + if p: + desc = self.tag_to_string(p[0]) + articles.append({'title':title, 'url':url, 'description':desc, 'date':''}) + self.log('Found article:', title) + self.log('\t\t', desc) + return articles def wsj_find_wn_articles(self, url): @@ -140,7 +153,7 @@ class WSJ(JavascriptRecipe): if url.endswith('whatsnews'): articles = self.wsj_find_wn_articles(url) else: - articles = self.wsj_find_articles(url) + articles = self.wsj_find_articles(url, ahed=title == 'Front Section') except: articles = [] if articles: