mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Update Wall Street Journal
This commit is contained in:
parent
0e5279537f
commit
f030b414ea
@ -36,6 +36,7 @@ def classes(classes):
|
|||||||
return dict(attrs={
|
return dict(attrs={
|
||||||
'class': lambda x: x and frozenset(x.split()).intersection(q)})
|
'class': lambda x: x and frozenset(x.split()).intersection(q)})
|
||||||
|
|
||||||
|
|
||||||
USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0'
|
USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0'
|
||||||
|
|
||||||
|
|
||||||
@ -150,23 +151,26 @@ class WSJ(BasicNewsRecipe):
|
|||||||
|
|
||||||
articles = []
|
articles = []
|
||||||
|
|
||||||
for a in CSSSelect('a.mjLinkItem[href]')(root):
|
for container in root.xpath('//li[contains(@class, "mjItemMain")]'):
|
||||||
container = a.xpath('ancestor::li')
|
meta = container.xpath('descendant::span[@class="meta_sectionName"]')
|
||||||
meta = CSSSelect('.meta_sectionName')(a)
|
if not meta:
|
||||||
if meta:
|
continue
|
||||||
meta = meta[0]
|
meta = meta[0]
|
||||||
meta.getparent().remove(meta)
|
a = meta.xpath('ancestor::a')[0]
|
||||||
meta = self.tag_to_string(meta)
|
meta.getparent().remove(meta)
|
||||||
|
meta = self.tag_to_string(meta)
|
||||||
title = self.tag_to_string(a)
|
title = self.tag_to_string(a)
|
||||||
if meta:
|
if meta:
|
||||||
title += ' [%s]' % meta
|
title += ' [%s]' % meta
|
||||||
url = self.abs_wsj_url(a.get('href'))
|
url = self.abs_wsj_url(a.get('href'))
|
||||||
desc = ''
|
desc = ''
|
||||||
if container:
|
if container:
|
||||||
for p in CSSSelect('p')(container[0]):
|
for p in container.xpath('descendant::p'):
|
||||||
desc = self.tag_to_string(p)
|
q = self.tag_to_string(p)
|
||||||
if 'Subscriber Content' not in desc:
|
if 'Subscriber Content' in q:
|
||||||
break
|
continue
|
||||||
|
desc += q
|
||||||
|
break
|
||||||
|
|
||||||
articles.append({'title': title, 'url': url,
|
articles.append({'title': title, 'url': url,
|
||||||
'description': desc, 'date': ''})
|
'description': desc, 'date': ''})
|
||||||
@ -217,14 +221,15 @@ class WSJ(BasicNewsRecipe):
|
|||||||
return articles
|
return articles
|
||||||
|
|
||||||
def wsj_add_feed(self, feeds, title, url):
|
def wsj_add_feed(self, feeds, title, url):
|
||||||
self.log('Found section:', title)
|
self.log('Found section:', title, '[' + url + ']')
|
||||||
try:
|
try:
|
||||||
if url.endswith('whatsnews'):
|
if url.endswith('whatsnews'):
|
||||||
articles = self.wsj_find_wn_articles(url)
|
articles = self.wsj_find_wn_articles(url)
|
||||||
else:
|
else:
|
||||||
articles = self.wsj_find_articles(
|
articles = self.wsj_find_articles(
|
||||||
url, ahed=title == 'Front Section')
|
url, ahed=title == 'Front Section')
|
||||||
except:
|
except Exception:
|
||||||
|
self.log.exception('Failed to parse section:', title)
|
||||||
articles = []
|
articles = []
|
||||||
if articles:
|
if articles:
|
||||||
feeds.append((title, articles))
|
feeds.append((title, articles))
|
||||||
|
@ -109,23 +109,26 @@ class WSJ(BasicNewsRecipe):
|
|||||||
|
|
||||||
articles = []
|
articles = []
|
||||||
|
|
||||||
for a in CSSSelect('a.mjLinkItem[href]')(root):
|
for container in root.xpath('//li[contains(@class, "mjItemMain")]'):
|
||||||
container = a.xpath('ancestor::li')
|
meta = container.xpath('descendant::span[@class="meta_sectionName"]')
|
||||||
meta = CSSSelect('.meta_sectionName')(a)
|
if not meta:
|
||||||
if meta:
|
continue
|
||||||
meta = meta[0]
|
meta = meta[0]
|
||||||
meta.getparent().remove(meta)
|
a = meta.xpath('ancestor::a')[0]
|
||||||
meta = self.tag_to_string(meta)
|
meta.getparent().remove(meta)
|
||||||
|
meta = self.tag_to_string(meta)
|
||||||
title = self.tag_to_string(a)
|
title = self.tag_to_string(a)
|
||||||
if meta:
|
if meta:
|
||||||
title += ' [%s]' % meta
|
title += ' [%s]' % meta
|
||||||
url = self.abs_wsj_url(a.get('href'))
|
url = self.abs_wsj_url(a.get('href'))
|
||||||
desc = ''
|
desc = ''
|
||||||
if container:
|
if container:
|
||||||
for p in CSSSelect('p')(container[0]):
|
for p in container.xpath('descendant::p'):
|
||||||
desc = self.tag_to_string(p)
|
q = self.tag_to_string(p)
|
||||||
if 'Subscriber Content' not in desc:
|
if 'Subscriber Content' in q:
|
||||||
break
|
continue
|
||||||
|
desc += q
|
||||||
|
break
|
||||||
|
|
||||||
articles.append({'title': title, 'url': url,
|
articles.append({'title': title, 'url': url,
|
||||||
'description': desc, 'date': ''})
|
'description': desc, 'date': ''})
|
||||||
@ -176,14 +179,15 @@ class WSJ(BasicNewsRecipe):
|
|||||||
return articles
|
return articles
|
||||||
|
|
||||||
def wsj_add_feed(self, feeds, title, url):
|
def wsj_add_feed(self, feeds, title, url):
|
||||||
self.log('Found section:', title)
|
self.log('Found section:', title, '[' + url + ']')
|
||||||
try:
|
try:
|
||||||
if url.endswith('whatsnews'):
|
if url.endswith('whatsnews'):
|
||||||
articles = self.wsj_find_wn_articles(url)
|
articles = self.wsj_find_wn_articles(url)
|
||||||
else:
|
else:
|
||||||
articles = self.wsj_find_articles(
|
articles = self.wsj_find_articles(
|
||||||
url, ahed=title == 'Front Section')
|
url, ahed=title == 'Front Section')
|
||||||
except:
|
except Exception:
|
||||||
|
self.log.exception('Failed to parse section:', title)
|
||||||
articles = []
|
articles = []
|
||||||
if articles:
|
if articles:
|
||||||
feeds.append((title, articles))
|
feeds.append((title, articles))
|
||||||
|
Loading…
x
Reference in New Issue
Block a user