Update Wall Street Journal

Kovid Goyal 2017-02-05 21:21:40 +05:30
parent 0e5279537f
commit f030b414ea
2 changed files with 35 additions and 26 deletions

File 1

@@ -36,6 +36,7 @@ def classes(classes):
     return dict(attrs={
         'class': lambda x: x and frozenset(x.split()).intersection(q)})
 
 USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0'
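
For context, the classes() helper excerpted above is the stock calibre utility for matching tags by CSS class; the q it closes over is presumably bound just above the excerpt, as something like q = frozenset(classes.split(' ')). A minimal sketch of the helper together with a hypothetical use in a recipe:

# A minimal sketch (not part of the diff) of a classes()-style helper as
# used in calibre recipes; the q binding is assumed to sit just above the
# excerpted lines.
def classes(class_names):
    q = frozenset(class_names.split(' '))
    return dict(attrs={
        'class': lambda x: x and frozenset(x.split()).intersection(q)})

# Hypothetical usage inside a BasicNewsRecipe subclass: keep only tags
# whose class attribute contains 'articleBody' or 'headline'.
keep_only_tags = [classes('articleBody headline')]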
@@ -150,23 +151,26 @@ class WSJ(BasicNewsRecipe):
         articles = []
 
-        for a in CSSSelect('a.mjLinkItem[href]')(root):
-            container = a.xpath('ancestor::li')
-            meta = CSSSelect('.meta_sectionName')(a)
-            if meta:
-                meta = meta[0]
-                meta.getparent().remove(meta)
-                meta = self.tag_to_string(meta)
+        for container in root.xpath('//li[contains(@class, "mjItemMain")]'):
+            meta = container.xpath('descendant::span[@class="meta_sectionName"]')
+            if not meta:
+                continue
+            meta = meta[0]
+            a = meta.xpath('ancestor::a')[0]
+            meta.getparent().remove(meta)
+            meta = self.tag_to_string(meta)
             title = self.tag_to_string(a)
             if meta:
                 title += ' [%s]' % meta
             url = self.abs_wsj_url(a.get('href'))
             desc = ''
             if container:
-                for p in CSSSelect('p')(container[0]):
-                    desc = self.tag_to_string(p)
-                    if 'Subscriber Content' not in desc:
-                        break
+                for p in container.xpath('descendant::p'):
+                    q = self.tag_to_string(p)
+                    if 'Subscriber Content' in q:
+                        continue
+                    desc += q
+                    break
 
             articles.append({'title': title, 'url': url,
                              'description': desc, 'date': ''})
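
This is the substance of the commit: article discovery now iterates over the li.mjItemMain containers and reaches the section label and link by XPath, rather than starting from a.mjLinkItem anchors, and the description is built from the first paragraph that is not the 'Subscriber Content' paywall marker (the old loop stopped at the first paragraph it saw and could leave the marker as the description). A standalone sketch of the new logic against fabricated HTML, with lxml's text_content() standing in for the recipe's self.tag_to_string() and abs_wsj_url() omitted:

# A standalone sketch of the new selection logic, run against fabricated
# HTML; text_content() stands in for self.tag_to_string().
import lxml.html

html = '''
<ul>
  <li class="mjItemMain">
    <a href="/articles/example">
      <span class="meta_sectionName">Politics</span>
      <span>Example headline</span>
    </a>
    <p>Subscriber Content</p>
    <p>A short summary of the article.</p>
  </li>
</ul>
'''
root = lxml.html.fromstring(html)

articles = []
for container in root.xpath('//li[contains(@class, "mjItemMain")]'):
    meta = container.xpath('descendant::span[@class="meta_sectionName"]')
    if not meta:
        continue  # skip list items without a section label
    meta = meta[0]
    a = meta.xpath('ancestor::a')[0]   # the link wrapping the label
    meta.getparent().remove(meta)      # strip the label out of the title
    title = a.text_content().strip()
    title += ' [%s]' % meta.text_content()
    desc = ''
    for p in container.xpath('descendant::p'):
        q = p.text_content()
        if 'Subscriber Content' in q:
            continue  # skip the paywall marker
        desc += q
        break         # keep only the first real paragraph
    articles.append({'title': title, 'url': a.get('href'),
                     'description': desc, 'date': ''})

print(articles)
# [{'title': 'Example headline [Politics]', 'url': '/articles/example',
#   'description': 'A short summary of the article.', 'date': ''}]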
@@ -217,14 +221,15 @@ class WSJ(BasicNewsRecipe):
         return articles
 
     def wsj_add_feed(self, feeds, title, url):
-        self.log('Found section:', title)
+        self.log('Found section:', title, '[' + url + ']')
         try:
             if url.endswith('whatsnews'):
                 articles = self.wsj_find_wn_articles(url)
             else:
                 articles = self.wsj_find_articles(
                     url, ahed=title == 'Front Section')
-        except:
+        except Exception:
+            self.log.exception('Failed to parse section:', title)
             articles = []
         if articles:
             feeds.append((title, articles))
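
The second change tightens error handling: the bare except becomes except Exception, so KeyboardInterrupt and SystemExit propagate, and a failed section is now logged with a traceback instead of being silently swallowed. A minimal sketch of the pattern, with the standard logging module standing in for calibre's recipe log object:

# A minimal sketch of the new error-handling pattern; the standard logging
# module stands in for calibre's self.log.
import logging

logging.basicConfig(level=logging.INFO)
log = logging.getLogger('wsj')

def add_feed(feeds, title, url, find_articles):
    log.info('Found section: %s [%s]', title, url)
    try:
        articles = find_articles(url)
    except Exception:
        # 'except Exception' (unlike a bare 'except:') lets
        # KeyboardInterrupt and SystemExit propagate; log.exception()
        # records the full traceback for debugging.
        log.exception('Failed to parse section: %s', title)
        articles = []
    if articles:
        feeds.append((title, articles))

def broken_finder(url):
    raise ValueError('simulated parse failure')

feeds = []
add_feed(feeds, 'Front Section', 'https://example.com/front', broken_finder)
print(feeds)  # [] -- the failed section is skipped, traceback logged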

File 2 (the same two changes applied to the second recipe)

@@ -109,23 +109,26 @@ class WSJ(BasicNewsRecipe):
         articles = []
 
-        for a in CSSSelect('a.mjLinkItem[href]')(root):
-            container = a.xpath('ancestor::li')
-            meta = CSSSelect('.meta_sectionName')(a)
-            if meta:
-                meta = meta[0]
-                meta.getparent().remove(meta)
-                meta = self.tag_to_string(meta)
+        for container in root.xpath('//li[contains(@class, "mjItemMain")]'):
+            meta = container.xpath('descendant::span[@class="meta_sectionName"]')
+            if not meta:
+                continue
+            meta = meta[0]
+            a = meta.xpath('ancestor::a')[0]
+            meta.getparent().remove(meta)
+            meta = self.tag_to_string(meta)
             title = self.tag_to_string(a)
             if meta:
                 title += ' [%s]' % meta
             url = self.abs_wsj_url(a.get('href'))
             desc = ''
             if container:
-                for p in CSSSelect('p')(container[0]):
-                    desc = self.tag_to_string(p)
-                    if 'Subscriber Content' not in desc:
-                        break
+                for p in container.xpath('descendant::p'):
+                    q = self.tag_to_string(p)
+                    if 'Subscriber Content' in q:
+                        continue
+                    desc += q
+                    break
 
             articles.append({'title': title, 'url': url,
                              'description': desc, 'date': ''})
@@ -176,14 +179,15 @@ class WSJ(BasicNewsRecipe):
         return articles
 
     def wsj_add_feed(self, feeds, title, url):
-        self.log('Found section:', title)
+        self.log('Found section:', title, '[' + url + ']')
         try:
             if url.endswith('whatsnews'):
                 articles = self.wsj_find_wn_articles(url)
             else:
                 articles = self.wsj_find_articles(
                     url, ahed=title == 'Front Section')
-        except:
+        except Exception:
+            self.log.exception('Failed to parse section:', title)
            articles = []
         if articles:
             feeds.append((title, articles))