Update The Wall Street Journal

This commit is contained in:
Kovid Goyal 2019-06-25 20:19:52 +05:30
parent f663fff3ad
commit 43f8e0926d
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 15 additions and 11 deletions

View File

@@ -5,11 +5,11 @@
from __future__ import absolute_import, division, print_function, unicode_literals from __future__ import absolute_import, division, print_function, unicode_literals
import json import json
try: try:
from urllib.parse import quote from urllib.parse import quote
except ImportError: except ImportError:
from urllib import quote from urllib import quote
from mechanize import Request from mechanize import Request
from calibre import random_user_agent from calibre import random_user_agent
@@ -150,13 +150,13 @@ class WSJ(BasicNewsRecipe):
root = self.index_to_soup(url, as_tree=True) root = self.index_to_soup(url, as_tree=True)
CSSSelect = Select(root) CSSSelect = Select(root)
articles = [] articles = []
for container in root.xpath('descendant::div[contains(@class, "WSJTheme__list-item_")]'): for container in root.xpath('descendant::div[contains(@class, "WSJTheme--list-item-")]'):
heading = next(CSSSelect('h2, h3', container)) heading = next(CSSSelect('h2, h3', container))
a = next(CSSSelect('a', heading)) a = next(CSSSelect('a', heading))
title = self.tag_to_string(a) title = self.tag_to_string(a)
url = self.abs_wsj_url(a.get('href')) url = self.abs_wsj_url(a.get('href'))
desc = '' desc = ''
for p in container.xpath('descendant::p[contains(@class, "WSJTheme__description_")]'): for p in container.xpath('descendant::p[contains(@class, "WSJTheme--description-")]'):
q = self.tag_to_string(p) q = self.tag_to_string(p)
if 'Subscriber Content' in q: if 'Subscriber Content' in q:
continue continue
@@ -173,7 +173,7 @@ class WSJ(BasicNewsRecipe):
def wsj_find_wn_articles(self, feeds, root, CSSSelect): def wsj_find_wn_articles(self, feeds, root, CSSSelect):
articles = [] articles = []
for a in CSSSelect('.style__strap_2m6gCW_c_6WZKkU--eRUWv'): for a in CSSSelect('.style--strap--3DsLojSy'):
if 'WHAT\'S NEWS' in self.tag_to_string(a).upper(): if 'WHAT\'S NEWS' in self.tag_to_string(a).upper():
whats_news = a.getparent() whats_news = a.getparent()
break break
@@ -196,7 +196,6 @@ class WSJ(BasicNewsRecipe):
return articles return articles
def wsj_add_feed(self, feeds, title, url): def wsj_add_feed(self, feeds, title, url):
self.log('Found section:', title, '[' + url + ']')
try: try:
articles = self.wsj_find_articles(url) articles = self.wsj_find_articles(url)
if not articles: if not articles:
@@ -212,18 +211,21 @@ class WSJ(BasicNewsRecipe):
# return self.test_wsj_index() # return self.test_wsj_index()
root = self.index_to_soup(self.wsj_itp_page, as_tree=True) root = self.index_to_soup(self.wsj_itp_page, as_tree=True)
CSSSelect = Select(root) CSSSelect = Select(root)
# from calibre.utils.ipython import ipython
# ipython({'root': root, 'CSSSelect': CSSSelect, 'raw': self.wsj_itp_page})
for inp in CSSSelect('.DayPickerInput > input'): for inp in CSSSelect('.DayPickerInput > input'):
if inp.get('placeholder'): if inp.get('placeholder'):
self.timefmt = inp.get('placeholder') self.timefmt = inp.get('placeholder')
break break
feeds = [] feeds = []
for a in CSSSelect('.WSJTheme__nav-container_sPVwT3FiPlWjFGtr5KH3d .WSJTheme__section-link_XGDsdx5qPlnC8BZPxQ63R'): for a in CSSSelect('.WSJTheme--nav-container--sPVwT3Fi .WSJTheme--section-link--XGDsdx5q'):
frontpage = a.get('href').endswith('frontpage') frontpage = a.get('href').endswith('frontpage')
title = self.tag_to_string(a).capitalize().strip().replace('U.s.', 'U.S.') title = self.tag_to_string(a).capitalize().strip().replace('U.s.', 'U.S.')
if not title: if not title:
continue continue
url = self.abs_wsj_url(a.get('href')) url = self.abs_wsj_url(a.get('href'))
self.log('Found section:', title, 'at', url)
self.wsj_add_feed(feeds, title, url) self.wsj_add_feed(feeds, title, url)
if frontpage: if frontpage:
self.wsj_find_wn_articles(feeds, root, CSSSelect) self.wsj_find_wn_articles(feeds, root, CSSSelect)

View File

@@ -150,13 +150,13 @@ class WSJ(BasicNewsRecipe):
root = self.index_to_soup(url, as_tree=True) root = self.index_to_soup(url, as_tree=True)
CSSSelect = Select(root) CSSSelect = Select(root)
articles = [] articles = []
for container in root.xpath('descendant::div[contains(@class, "WSJTheme__list-item_")]'): for container in root.xpath('descendant::div[contains(@class, "WSJTheme--list-item-")]'):
heading = next(CSSSelect('h2, h3', container)) heading = next(CSSSelect('h2, h3', container))
a = next(CSSSelect('a', heading)) a = next(CSSSelect('a', heading))
title = self.tag_to_string(a) title = self.tag_to_string(a)
url = self.abs_wsj_url(a.get('href')) url = self.abs_wsj_url(a.get('href'))
desc = '' desc = ''
for p in container.xpath('descendant::p[contains(@class, "WSJTheme__description_")]'): for p in container.xpath('descendant::p[contains(@class, "WSJTheme--description-")]'):
q = self.tag_to_string(p) q = self.tag_to_string(p)
if 'Subscriber Content' in q: if 'Subscriber Content' in q:
continue continue
@@ -173,7 +173,7 @@ class WSJ(BasicNewsRecipe):
def wsj_find_wn_articles(self, feeds, root, CSSSelect): def wsj_find_wn_articles(self, feeds, root, CSSSelect):
articles = [] articles = []
for a in CSSSelect('.style__strap_2m6gCW_c_6WZKkU--eRUWv'): for a in CSSSelect('.style--strap--3DsLojSy'):
if 'WHAT\'S NEWS' in self.tag_to_string(a).upper(): if 'WHAT\'S NEWS' in self.tag_to_string(a).upper():
whats_news = a.getparent() whats_news = a.getparent()
break break
@@ -196,7 +196,6 @@ class WSJ(BasicNewsRecipe):
return articles return articles
def wsj_add_feed(self, feeds, title, url): def wsj_add_feed(self, feeds, title, url):
self.log('Found section:', title, '[' + url + ']')
try: try:
articles = self.wsj_find_articles(url) articles = self.wsj_find_articles(url)
if not articles: if not articles:
@@ -212,18 +211,21 @@ class WSJ(BasicNewsRecipe):
# return self.test_wsj_index() # return self.test_wsj_index()
root = self.index_to_soup(self.wsj_itp_page, as_tree=True) root = self.index_to_soup(self.wsj_itp_page, as_tree=True)
CSSSelect = Select(root) CSSSelect = Select(root)
# from calibre.utils.ipython import ipython
# ipython({'root': root, 'CSSSelect': CSSSelect, 'raw': self.wsj_itp_page})
for inp in CSSSelect('.DayPickerInput > input'): for inp in CSSSelect('.DayPickerInput > input'):
if inp.get('placeholder'): if inp.get('placeholder'):
self.timefmt = inp.get('placeholder') self.timefmt = inp.get('placeholder')
break break
feeds = [] feeds = []
for a in CSSSelect('.WSJTheme__nav-container_sPVwT3FiPlWjFGtr5KH3d .WSJTheme__section-link_XGDsdx5qPlnC8BZPxQ63R'): for a in CSSSelect('.WSJTheme--nav-container--sPVwT3Fi .WSJTheme--section-link--XGDsdx5q'):
frontpage = a.get('href').endswith('frontpage') frontpage = a.get('href').endswith('frontpage')
title = self.tag_to_string(a).capitalize().strip().replace('U.s.', 'U.S.') title = self.tag_to_string(a).capitalize().strip().replace('U.s.', 'U.S.')
if not title: if not title:
continue continue
url = self.abs_wsj_url(a.get('href')) url = self.abs_wsj_url(a.get('href'))
self.log('Found section:', title, 'at', url)
self.wsj_add_feed(feeds, title, url) self.wsj_add_feed(feeds, title, url)
if frontpage: if frontpage:
self.wsj_find_wn_articles(feeds, root, CSSSelect) self.wsj_find_wn_articles(feeds, root, CSSSelect)