mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
Update The Wall Street Journal
This commit is contained in:
parent
f663fff3ad
commit
43f8e0926d
@ -5,11 +5,11 @@
|
|||||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||||
|
|
||||||
import json
|
import json
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from urllib.parse import quote
|
from urllib.parse import quote
|
||||||
except ImportError:
|
except ImportError:
|
||||||
from urllib import quote
|
from urllib import quote
|
||||||
|
|
||||||
from mechanize import Request
|
from mechanize import Request
|
||||||
|
|
||||||
from calibre import random_user_agent
|
from calibre import random_user_agent
|
||||||
@ -150,13 +150,13 @@ class WSJ(BasicNewsRecipe):
|
|||||||
root = self.index_to_soup(url, as_tree=True)
|
root = self.index_to_soup(url, as_tree=True)
|
||||||
CSSSelect = Select(root)
|
CSSSelect = Select(root)
|
||||||
articles = []
|
articles = []
|
||||||
for container in root.xpath('descendant::div[contains(@class, "WSJTheme__list-item_")]'):
|
for container in root.xpath('descendant::div[contains(@class, "WSJTheme--list-item-")]'):
|
||||||
heading = next(CSSSelect('h2, h3', container))
|
heading = next(CSSSelect('h2, h3', container))
|
||||||
a = next(CSSSelect('a', heading))
|
a = next(CSSSelect('a', heading))
|
||||||
title = self.tag_to_string(a)
|
title = self.tag_to_string(a)
|
||||||
url = self.abs_wsj_url(a.get('href'))
|
url = self.abs_wsj_url(a.get('href'))
|
||||||
desc = ''
|
desc = ''
|
||||||
for p in container.xpath('descendant::p[contains(@class, "WSJTheme__description_")]'):
|
for p in container.xpath('descendant::p[contains(@class, "WSJTheme--description-")]'):
|
||||||
q = self.tag_to_string(p)
|
q = self.tag_to_string(p)
|
||||||
if 'Subscriber Content' in q:
|
if 'Subscriber Content' in q:
|
||||||
continue
|
continue
|
||||||
@ -173,7 +173,7 @@ class WSJ(BasicNewsRecipe):
|
|||||||
|
|
||||||
def wsj_find_wn_articles(self, feeds, root, CSSSelect):
|
def wsj_find_wn_articles(self, feeds, root, CSSSelect):
|
||||||
articles = []
|
articles = []
|
||||||
for a in CSSSelect('.style__strap_2m6gCW_c_6WZKkU--eRUWv'):
|
for a in CSSSelect('.style--strap--3DsLojSy'):
|
||||||
if 'WHAT\'S NEWS' in self.tag_to_string(a).upper():
|
if 'WHAT\'S NEWS' in self.tag_to_string(a).upper():
|
||||||
whats_news = a.getparent()
|
whats_news = a.getparent()
|
||||||
break
|
break
|
||||||
@ -196,7 +196,6 @@ class WSJ(BasicNewsRecipe):
|
|||||||
return articles
|
return articles
|
||||||
|
|
||||||
def wsj_add_feed(self, feeds, title, url):
|
def wsj_add_feed(self, feeds, title, url):
|
||||||
self.log('Found section:', title, '[' + url + ']')
|
|
||||||
try:
|
try:
|
||||||
articles = self.wsj_find_articles(url)
|
articles = self.wsj_find_articles(url)
|
||||||
if not articles:
|
if not articles:
|
||||||
@ -212,18 +211,21 @@ class WSJ(BasicNewsRecipe):
|
|||||||
# return self.test_wsj_index()
|
# return self.test_wsj_index()
|
||||||
root = self.index_to_soup(self.wsj_itp_page, as_tree=True)
|
root = self.index_to_soup(self.wsj_itp_page, as_tree=True)
|
||||||
CSSSelect = Select(root)
|
CSSSelect = Select(root)
|
||||||
|
# from calibre.utils.ipython import ipython
|
||||||
|
# ipython({'root': root, 'CSSSelect': CSSSelect, 'raw': self.wsj_itp_page})
|
||||||
for inp in CSSSelect('.DayPickerInput > input'):
|
for inp in CSSSelect('.DayPickerInput > input'):
|
||||||
if inp.get('placeholder'):
|
if inp.get('placeholder'):
|
||||||
self.timefmt = inp.get('placeholder')
|
self.timefmt = inp.get('placeholder')
|
||||||
break
|
break
|
||||||
|
|
||||||
feeds = []
|
feeds = []
|
||||||
for a in CSSSelect('.WSJTheme__nav-container_sPVwT3FiPlWjFGtr5KH3d .WSJTheme__section-link_XGDsdx5qPlnC8BZPxQ63R'):
|
for a in CSSSelect('.WSJTheme--nav-container--sPVwT3Fi .WSJTheme--section-link--XGDsdx5q'):
|
||||||
frontpage = a.get('href').endswith('frontpage')
|
frontpage = a.get('href').endswith('frontpage')
|
||||||
title = self.tag_to_string(a).capitalize().strip().replace('U.s.', 'U.S.')
|
title = self.tag_to_string(a).capitalize().strip().replace('U.s.', 'U.S.')
|
||||||
if not title:
|
if not title:
|
||||||
continue
|
continue
|
||||||
url = self.abs_wsj_url(a.get('href'))
|
url = self.abs_wsj_url(a.get('href'))
|
||||||
|
self.log('Found section:', title, 'at', url)
|
||||||
self.wsj_add_feed(feeds, title, url)
|
self.wsj_add_feed(feeds, title, url)
|
||||||
if frontpage:
|
if frontpage:
|
||||||
self.wsj_find_wn_articles(feeds, root, CSSSelect)
|
self.wsj_find_wn_articles(feeds, root, CSSSelect)
|
||||||
|
@ -150,13 +150,13 @@ class WSJ(BasicNewsRecipe):
|
|||||||
root = self.index_to_soup(url, as_tree=True)
|
root = self.index_to_soup(url, as_tree=True)
|
||||||
CSSSelect = Select(root)
|
CSSSelect = Select(root)
|
||||||
articles = []
|
articles = []
|
||||||
for container in root.xpath('descendant::div[contains(@class, "WSJTheme__list-item_")]'):
|
for container in root.xpath('descendant::div[contains(@class, "WSJTheme--list-item-")]'):
|
||||||
heading = next(CSSSelect('h2, h3', container))
|
heading = next(CSSSelect('h2, h3', container))
|
||||||
a = next(CSSSelect('a', heading))
|
a = next(CSSSelect('a', heading))
|
||||||
title = self.tag_to_string(a)
|
title = self.tag_to_string(a)
|
||||||
url = self.abs_wsj_url(a.get('href'))
|
url = self.abs_wsj_url(a.get('href'))
|
||||||
desc = ''
|
desc = ''
|
||||||
for p in container.xpath('descendant::p[contains(@class, "WSJTheme__description_")]'):
|
for p in container.xpath('descendant::p[contains(@class, "WSJTheme--description-")]'):
|
||||||
q = self.tag_to_string(p)
|
q = self.tag_to_string(p)
|
||||||
if 'Subscriber Content' in q:
|
if 'Subscriber Content' in q:
|
||||||
continue
|
continue
|
||||||
@ -173,7 +173,7 @@ class WSJ(BasicNewsRecipe):
|
|||||||
|
|
||||||
def wsj_find_wn_articles(self, feeds, root, CSSSelect):
|
def wsj_find_wn_articles(self, feeds, root, CSSSelect):
|
||||||
articles = []
|
articles = []
|
||||||
for a in CSSSelect('.style__strap_2m6gCW_c_6WZKkU--eRUWv'):
|
for a in CSSSelect('.style--strap--3DsLojSy'):
|
||||||
if 'WHAT\'S NEWS' in self.tag_to_string(a).upper():
|
if 'WHAT\'S NEWS' in self.tag_to_string(a).upper():
|
||||||
whats_news = a.getparent()
|
whats_news = a.getparent()
|
||||||
break
|
break
|
||||||
@ -196,7 +196,6 @@ class WSJ(BasicNewsRecipe):
|
|||||||
return articles
|
return articles
|
||||||
|
|
||||||
def wsj_add_feed(self, feeds, title, url):
|
def wsj_add_feed(self, feeds, title, url):
|
||||||
self.log('Found section:', title, '[' + url + ']')
|
|
||||||
try:
|
try:
|
||||||
articles = self.wsj_find_articles(url)
|
articles = self.wsj_find_articles(url)
|
||||||
if not articles:
|
if not articles:
|
||||||
@ -212,18 +211,21 @@ class WSJ(BasicNewsRecipe):
|
|||||||
# return self.test_wsj_index()
|
# return self.test_wsj_index()
|
||||||
root = self.index_to_soup(self.wsj_itp_page, as_tree=True)
|
root = self.index_to_soup(self.wsj_itp_page, as_tree=True)
|
||||||
CSSSelect = Select(root)
|
CSSSelect = Select(root)
|
||||||
|
# from calibre.utils.ipython import ipython
|
||||||
|
# ipython({'root': root, 'CSSSelect': CSSSelect, 'raw': self.wsj_itp_page})
|
||||||
for inp in CSSSelect('.DayPickerInput > input'):
|
for inp in CSSSelect('.DayPickerInput > input'):
|
||||||
if inp.get('placeholder'):
|
if inp.get('placeholder'):
|
||||||
self.timefmt = inp.get('placeholder')
|
self.timefmt = inp.get('placeholder')
|
||||||
break
|
break
|
||||||
|
|
||||||
feeds = []
|
feeds = []
|
||||||
for a in CSSSelect('.WSJTheme__nav-container_sPVwT3FiPlWjFGtr5KH3d .WSJTheme__section-link_XGDsdx5qPlnC8BZPxQ63R'):
|
for a in CSSSelect('.WSJTheme--nav-container--sPVwT3Fi .WSJTheme--section-link--XGDsdx5q'):
|
||||||
frontpage = a.get('href').endswith('frontpage')
|
frontpage = a.get('href').endswith('frontpage')
|
||||||
title = self.tag_to_string(a).capitalize().strip().replace('U.s.', 'U.S.')
|
title = self.tag_to_string(a).capitalize().strip().replace('U.s.', 'U.S.')
|
||||||
if not title:
|
if not title:
|
||||||
continue
|
continue
|
||||||
url = self.abs_wsj_url(a.get('href'))
|
url = self.abs_wsj_url(a.get('href'))
|
||||||
|
self.log('Found section:', title, 'at', url)
|
||||||
self.wsj_add_feed(feeds, title, url)
|
self.wsj_add_feed(feeds, title, url)
|
||||||
if frontpage:
|
if frontpage:
|
||||||
self.wsj_find_wn_articles(feeds, root, CSSSelect)
|
self.wsj_find_wn_articles(feeds, root, CSSSelect)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user