Update Wall Street Journal

Kovid Goyal 2017-02-05 21:21:40 +05:30
parent 0e5279537f
commit f030b414ea
2 changed files with 35 additions and 26 deletions

View File

@@ -36,6 +36,7 @@ def classes(classes):
     return dict(attrs={
         'class': lambda x: x and frozenset(x.split()).intersection(q)})

+USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0'
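The only change in this hunk is the pinned desktop Firefox 45 user agent, presumably added so wsj.com serves the markup the rest of the recipe expects. Outside calibre's own browser plumbing, sending that header with nothing but the standard library would look roughly like this sketch (the URL is only an example):

from urllib.request import Request

USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0'

# Build a request that identifies itself with the user agent string pinned by
# this commit; no network call is made here.
req = Request('https://www.wsj.com/news/us', headers={'User-Agent': USER_AGENT})
print(req.get_header('User-agent'))  # urllib stores the header name capitalized this way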
@@ -150,11 +151,12 @@ class WSJ(BasicNewsRecipe):
         articles = []
-        for a in CSSSelect('a.mjLinkItem[href]')(root):
-            container = a.xpath('ancestor::li')
-            meta = CSSSelect('.meta_sectionName')(a)
-            if meta:
+        for container in root.xpath('//li[contains(@class, "mjItemMain")]'):
+            meta = container.xpath('descendant::span[@class="meta_sectionName"]')
+            if not meta:
+                continue
             meta = meta[0]
+            a = meta.xpath('ancestor::a')[0]
             meta.getparent().remove(meta)
             meta = self.tag_to_string(meta)
             title = self.tag_to_string(a)
@@ -163,9 +165,11 @@ class WSJ(BasicNewsRecipe):
             url = self.abs_wsj_url(a.get('href'))
             desc = ''
             if container:
-                for p in CSSSelect('p')(container[0]):
-                    desc = self.tag_to_string(p)
-                    if 'Subscriber Content' not in desc:
+                for p in container.xpath('descendant::p'):
+                    q = self.tag_to_string(p)
+                    if 'Subscriber Content' in q:
+                        continue
+                    desc += q
                     break

             articles.append({'title': title, 'url': url,
@@ -217,14 +221,15 @@ class WSJ(BasicNewsRecipe):
         return articles

     def wsj_add_feed(self, feeds, title, url):
-        self.log('Found section:', title)
+        self.log('Found section:', title, '[' + url + ']')
         try:
             if url.endswith('whatsnews'):
                 articles = self.wsj_find_wn_articles(url)
             else:
                 articles = self.wsj_find_articles(
                     url, ahed=title == 'Front Section')
-        except:
+        except Exception:
             self.log.exception('Failed to parse section:', title)
             articles = []
         if articles:
             feeds.append((title, articles))
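The bulk of the commit, repeated in both changed files, rewrites how a section page is scanned for articles: rather than CSS-selecting the a.mjLinkItem anchors and walking up to their enclosing li, the loop now iterates the li elements whose class contains mjItemMain, finds the meta_sectionName span, treats the anchor wrapping that span as the article link, and builds the description from the first paragraph that is not a 'Subscriber Content' teaser. A self-contained sketch of that logic, using made-up markup and lxml's text_content() in place of the recipe's tag_to_string():

from lxml import html

# Hypothetical markup, loosely modelled on a WSJ section page; the real
# recipes fetch these pages via self.index_to_soup(url, as_tree=True).
SECTION_PAGE = '''
<ul>
  <li class="mjItemMain headlineSummary">
    <a href="/articles/sample-story">Stocks Climb on Earnings
      <span class="meta_sectionName">Markets</span></a>
    <p>Subscriber Content teaser</p>
    <p>Shares rose after a string of strong quarterly reports.</p>
  </li>
  <li class="mjItemMain">
    <a href="/articles/no-section-label">Item without a section span</a>
  </li>
</ul>
'''

root = html.fromstring(SECTION_PAGE)
articles = []
for container in root.xpath('//li[contains(@class, "mjItemMain")]'):
    meta = container.xpath('descendant::span[@class="meta_sectionName"]')
    if not meta:
        continue  # items without a section label are skipped, as in the recipes
    meta = meta[0]
    a = meta.xpath('ancestor::a')[0]   # the anchor wrapping the section label
    meta.getparent().remove(meta)      # drop the label so it is not part of the title
    section = meta.text_content().strip()
    title = '%s [%s]' % (a.text_content().strip(), section)
    desc = ''
    for p in container.xpath('descendant::p'):
        q = p.text_content().strip()
        if 'Subscriber Content' in q:
            continue                   # skip paywall teasers
        desc += q
        break
    articles.append({'title': title, 'url': a.get('href'), 'description': desc})

print(articles)

One behavioural nuance of the rewrite: if every paragraph in a container is a teaser, the old loop left the last teaser text in desc, while the new one leaves the description empty.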

View File

@@ -109,11 +109,12 @@ class WSJ(BasicNewsRecipe):
         articles = []
-        for a in CSSSelect('a.mjLinkItem[href]')(root):
-            container = a.xpath('ancestor::li')
-            meta = CSSSelect('.meta_sectionName')(a)
-            if meta:
+        for container in root.xpath('//li[contains(@class, "mjItemMain")]'):
+            meta = container.xpath('descendant::span[@class="meta_sectionName"]')
+            if not meta:
+                continue
             meta = meta[0]
+            a = meta.xpath('ancestor::a')[0]
             meta.getparent().remove(meta)
             meta = self.tag_to_string(meta)
             title = self.tag_to_string(a)
@@ -122,9 +123,11 @@ class WSJ(BasicNewsRecipe):
             url = self.abs_wsj_url(a.get('href'))
             desc = ''
             if container:
-                for p in CSSSelect('p')(container[0]):
-                    desc = self.tag_to_string(p)
-                    if 'Subscriber Content' not in desc:
+                for p in container.xpath('descendant::p'):
+                    q = self.tag_to_string(p)
+                    if 'Subscriber Content' in q:
+                        continue
+                    desc += q
                     break

             articles.append({'title': title, 'url': url,
@@ -176,14 +179,15 @@ class WSJ(BasicNewsRecipe):
         return articles

     def wsj_add_feed(self, feeds, title, url):
-        self.log('Found section:', title)
+        self.log('Found section:', title, '[' + url + ']')
         try:
             if url.endswith('whatsnews'):
                 articles = self.wsj_find_wn_articles(url)
             else:
                 articles = self.wsj_find_articles(
                     url, ahed=title == 'Front Section')
-        except:
+        except Exception:
             self.log.exception('Failed to parse section:', title)
             articles = []
         if articles:
             feeds.append((title, articles))
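The remaining hunk in each file makes two small robustness tweaks: the 'Found section' log line now includes the section URL, and the bare except: becomes except Exception:, which still logs the traceback and falls back to an empty article list but no longer swallows KeyboardInterrupt or SystemExit. A rough standalone sketch of that pattern, with hypothetical names and stdlib logging standing in for the recipe's self.log:

import logging

logging.basicConfig(level=logging.INFO)
log = logging.getLogger('wsj-demo')


def add_feed(feeds, title, url, find_articles):
    # Mirrors wsj_add_feed(): log the section (now with its URL), try to parse
    # it, and on failure skip the section instead of aborting the whole build.
    log.info('Found section: %s [%s]', title, url)
    try:
        articles = find_articles(url)
    except Exception:
        # Narrower than a bare except:, so Ctrl-C can still stop the run.
        log.exception('Failed to parse section: %s', title)
        articles = []
    if articles:
        feeds.append((title, articles))


feeds = []
add_feed(feeds, 'Front Section', 'https://example.com/front', lambda url: [{'title': 'demo'}])
add_feed(feeds, 'Broken Section', 'https://example.com/broken', lambda url: 1 // 0)
print(feeds)  # only the section that parsed successfully is kept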