mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-31 14:33:54 -04:00
Update The Wall Street Journal
This commit is contained in:
parent
da2635b9df
commit
9a6671c3ce
@ -10,16 +10,9 @@ from base64 import standard_b64encode
|
|||||||
from datetime import date, timedelta
|
from datetime import date, timedelta
|
||||||
from mechanize import Request
|
from mechanize import Request
|
||||||
|
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe, classes, prefixed_classes
|
from calibre.web.feeds.news import BasicNewsRecipe, classes
|
||||||
from css_selectors import Select
|
from css_selectors import Select
|
||||||
|
|
||||||
# WSJ has started delivering the paywalled content encrypted even for logged in subscribers.
|
|
||||||
# The content is then decrypted via javascript and displayed.
|
|
||||||
# I could in theory reverse engineer their javascript and decrypt the content in the recipe,
|
|
||||||
# but this is too much effort, at least for me. If anybody wants to have a stab at it, feel free,
|
|
||||||
# the decryption code is in https://www.wsj.com/_next/static/chunks/fec483df-86515f08f3742e3f.js
|
|
||||||
# You can get the encrypted data from any wsj paywalled article page by searching for encryptedDataHash in the HTML.
|
|
||||||
#
|
|
||||||
try:
|
try:
|
||||||
import urllib.parse as urlparse
|
import urllib.parse as urlparse
|
||||||
except ImportError:
|
except ImportError:
|
||||||
@ -62,46 +55,37 @@ class WSJ(BasicNewsRecipe):
|
|||||||
timefmt = ' [%a, %b %d, %Y]'
|
timefmt = ' [%a, %b %d, %Y]'
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
ignore_duplicate_articles = {'url'}
|
ignore_duplicate_articles = {'url'}
|
||||||
remove_attributes = ['style', 'data-scrim']
|
remove_attributes = ['style','height','width']
|
||||||
needs_subscription = needs_subscription
|
needs_subscription = needs_subscription
|
||||||
WSJ_ITP = 'https://www.wsj.com/print-edition/today'
|
WSJ_ITP = 'https://www.wsj.com/print-edition/today'
|
||||||
|
|
||||||
|
extra_css = '''
|
||||||
|
.imageCaption{font-size:small; text-align:center;}
|
||||||
|
.sub-head{font-style:italic; color:#404040;}
|
||||||
|
.bylineWrap{font-size:small; text-align:left;}
|
||||||
|
'''
|
||||||
|
|
||||||
keep_only_tags = [
|
keep_only_tags = [
|
||||||
dict(substring_classes('StyledHeadline wsj-mosdo-Dek-Dek -Figure -Figcaption ArticleBodyContent__ -Paragraph')),
|
classes('wsj-article-headline-wrap articleLead bylineWrap bigTop-hero'),
|
||||||
dict(classes('wsj-article-headline-wrap article_header bigTop__hed bigTop__dek bigTop__captioncredit')),
|
dict(name='section', attrs={'subscriptions-section':'content'})
|
||||||
dict(name='span', itemprop='author', rel='author'),
|
|
||||||
dict(name='article', id='article-contents articleBody'.split()),
|
|
||||||
dict(name='div', id='article_story_body ncTitleArea snipper-ad-login cx-snippet-overlay'.split()),
|
|
||||||
classes('nc-exp-artbody errorNotFound'),
|
|
||||||
dict(attrs={'data-module-zone': 'article_snippet'}),
|
|
||||||
prefixed_classes(
|
|
||||||
'Headline__StyledHeadline- MediaLayout__Layout- ArticleByline__Container-'
|
|
||||||
' ArticleTimestamp__Timestamp- ArticleBody__Container- PaywalledContent__PaywalledContentContainer- ArticleRoadblock__Container-'),
|
|
||||||
]
|
]
|
||||||
|
|
||||||
remove_tags = [
|
remove_tags = [
|
||||||
dict(id='right-rail'),
|
classes('wsj-ad newsletter-inset media-object-video media-object-podcast podcast--iframe dynamic-inset-overflow-button'),
|
||||||
dict(id='narrator-nav'),
|
dict(name='amp-iframe') # interactive graphics
|
||||||
dict(name='div', id='ad_and_popular'),
|
|
||||||
classes('strap-container right-rail comments-count-container insetButton insettipBox author-info'
|
|
||||||
' media-object-video article_tools nc-exp-artmeta category type-InsetArticlesRelatedByType media-object-rich-text'),
|
|
||||||
dict(name='span', attrs={
|
|
||||||
'data-country-code': True, 'data-ticker-code': True}),
|
|
||||||
dict(name='meta link button'.split()),
|
|
||||||
]
|
]
|
||||||
|
|
||||||
def preprocess_soup(self, soup):
|
def preprocess_html(self, soup):
|
||||||
# Slideshow and expandable images need to be processed here to
|
for by in soup.findAll(**classes('bylineWrap')):
|
||||||
# set the src attribute correctly
|
for p in by.findAll('p'):
|
||||||
found = 0
|
p.name = 'span'
|
||||||
for img in soup.findAll('img', attrs={'data-in-base-data-lazy': True}):
|
for img in soup.findAll('amp-img'):
|
||||||
img['src'] = img['data-in-base-data-lazy']
|
img.name = 'img'
|
||||||
found += 1
|
if img['src'] == 'https://s.wsj.net/img/meta/wsj-social-share.png':
|
||||||
for img in soup.findAll('img', attrs={'data-enlarge': True}):
|
img.extract()
|
||||||
img['src'] = img['data-enlarge']
|
h2 = soup.find('h2', attrs={'class':'sub-head'})
|
||||||
found += 1
|
if h2:
|
||||||
if found:
|
h2.name = 'p'
|
||||||
self.log.debug('Found %d dynamic images in:' % found)
|
|
||||||
return soup
|
return soup
|
||||||
|
|
||||||
def get_cover_url(self):
|
def get_cover_url(self):
|
||||||
@ -215,9 +199,9 @@ class WSJ(BasicNewsRecipe):
|
|||||||
|
|
||||||
def abs_wsj_url(self, href, modify_query=True):
|
def abs_wsj_url(self, href, modify_query=True):
|
||||||
if not href.startswith('http'):
|
if not href.startswith('http'):
|
||||||
href = 'https://www.wsj.com' + href
|
href = 'https://www.wsj.com' + href.replace('/articles/', '/amp/articles/')
|
||||||
if modify_query:
|
if modify_query:
|
||||||
href = href.split('?')[0] + '?mod=djemalertNEWS'
|
href = href.replace('/articles/', '/amp/articles/')
|
||||||
return href
|
return href
|
||||||
|
|
||||||
def wsj_find_articles(self, url, ahed=False):
|
def wsj_find_articles(self, url, ahed=False):
|
||||||
@ -289,7 +273,7 @@ class WSJ(BasicNewsRecipe):
|
|||||||
self.log.warn('No articles found in', url)
|
self.log.warn('No articles found in', url)
|
||||||
|
|
||||||
def parse_index(self):
|
def parse_index(self):
|
||||||
# return self.test_wsj_index()
|
return self.test_wsj_index()
|
||||||
root = self.index_to_soup(self.wsj_itp_page, as_tree=True)
|
root = self.index_to_soup(self.wsj_itp_page, as_tree=True)
|
||||||
CSSSelect = Select(root)
|
CSSSelect = Select(root)
|
||||||
# from calibre.utils.ipython import ipython
|
# from calibre.utils.ipython import ipython
|
||||||
|
@ -10,16 +10,9 @@ from base64 import standard_b64encode
|
|||||||
from datetime import date, timedelta
|
from datetime import date, timedelta
|
||||||
from mechanize import Request
|
from mechanize import Request
|
||||||
|
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe, classes, prefixed_classes
|
from calibre.web.feeds.news import BasicNewsRecipe, classes
|
||||||
from css_selectors import Select
|
from css_selectors import Select
|
||||||
|
|
||||||
# WSJ has started delivering the paywalled content encrypted even for logged in subscribers.
|
|
||||||
# The content is then decrypted via javascript and displayed.
|
|
||||||
# I could in theory reverse engineer their javascript and decrypt the content in the recipe,
|
|
||||||
# but this is too much effort, at least for me. If anybody wants to have a stab at it, feel free,
|
|
||||||
# the decryption code is in https://www.wsj.com/_next/static/chunks/fec483df-86515f08f3742e3f.js
|
|
||||||
# You can get the encrypted data from any wsj paywalled article page by searching for encryptedDataHash in the HTML.
|
|
||||||
#
|
|
||||||
try:
|
try:
|
||||||
import urllib.parse as urlparse
|
import urllib.parse as urlparse
|
||||||
except ImportError:
|
except ImportError:
|
||||||
@ -62,46 +55,37 @@ class WSJ(BasicNewsRecipe):
|
|||||||
timefmt = ' [%a, %b %d, %Y]'
|
timefmt = ' [%a, %b %d, %Y]'
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
ignore_duplicate_articles = {'url'}
|
ignore_duplicate_articles = {'url'}
|
||||||
remove_attributes = ['style', 'data-scrim']
|
remove_attributes = ['style','height','width']
|
||||||
needs_subscription = needs_subscription
|
needs_subscription = needs_subscription
|
||||||
WSJ_ITP = 'https://www.wsj.com/print-edition/today'
|
WSJ_ITP = 'https://www.wsj.com/print-edition/today'
|
||||||
|
|
||||||
|
extra_css = '''
|
||||||
|
.imageCaption{font-size:small; text-align:center;}
|
||||||
|
.sub-head{font-style:italic; color:#404040;}
|
||||||
|
.bylineWrap{font-size:small; text-align:left;}
|
||||||
|
'''
|
||||||
|
|
||||||
keep_only_tags = [
|
keep_only_tags = [
|
||||||
dict(substring_classes('StyledHeadline wsj-mosdo-Dek-Dek -Figure -Figcaption ArticleBodyContent__ -Paragraph')),
|
classes('wsj-article-headline-wrap articleLead bylineWrap bigTop-hero'),
|
||||||
dict(classes('wsj-article-headline-wrap article_header bigTop__hed bigTop__dek bigTop__captioncredit')),
|
dict(name='section', attrs={'subscriptions-section':'content'})
|
||||||
dict(name='span', itemprop='author', rel='author'),
|
|
||||||
dict(name='article', id='article-contents articleBody'.split()),
|
|
||||||
dict(name='div', id='article_story_body ncTitleArea snipper-ad-login cx-snippet-overlay'.split()),
|
|
||||||
classes('nc-exp-artbody errorNotFound'),
|
|
||||||
dict(attrs={'data-module-zone': 'article_snippet'}),
|
|
||||||
prefixed_classes(
|
|
||||||
'Headline__StyledHeadline- MediaLayout__Layout- ArticleByline__Container-'
|
|
||||||
' ArticleTimestamp__Timestamp- ArticleBody__Container- PaywalledContent__PaywalledContentContainer- ArticleRoadblock__Container-'),
|
|
||||||
]
|
]
|
||||||
|
|
||||||
remove_tags = [
|
remove_tags = [
|
||||||
dict(id='right-rail'),
|
classes('wsj-ad newsletter-inset media-object-video media-object-podcast podcast--iframe dynamic-inset-overflow-button'),
|
||||||
dict(id='narrator-nav'),
|
dict(name='amp-iframe') # interactive graphics
|
||||||
dict(name='div', id='ad_and_popular'),
|
|
||||||
classes('strap-container right-rail comments-count-container insetButton insettipBox author-info'
|
|
||||||
' media-object-video article_tools nc-exp-artmeta category type-InsetArticlesRelatedByType media-object-rich-text'),
|
|
||||||
dict(name='span', attrs={
|
|
||||||
'data-country-code': True, 'data-ticker-code': True}),
|
|
||||||
dict(name='meta link button'.split()),
|
|
||||||
]
|
]
|
||||||
|
|
||||||
def preprocess_soup(self, soup):
|
def preprocess_html(self, soup):
|
||||||
# Slideshow and expandable images need to be processed here to
|
for by in soup.findAll(**classes('bylineWrap')):
|
||||||
# set the src attribute correctly
|
for p in by.findAll('p'):
|
||||||
found = 0
|
p.name = 'span'
|
||||||
for img in soup.findAll('img', attrs={'data-in-base-data-lazy': True}):
|
for img in soup.findAll('amp-img'):
|
||||||
img['src'] = img['data-in-base-data-lazy']
|
img.name = 'img'
|
||||||
found += 1
|
if img['src'] == 'https://s.wsj.net/img/meta/wsj-social-share.png':
|
||||||
for img in soup.findAll('img', attrs={'data-enlarge': True}):
|
img.extract()
|
||||||
img['src'] = img['data-enlarge']
|
h2 = soup.find('h2', attrs={'class':'sub-head'})
|
||||||
found += 1
|
if h2:
|
||||||
if found:
|
h2.name = 'p'
|
||||||
self.log.debug('Found %d dynamic images in:' % found)
|
|
||||||
return soup
|
return soup
|
||||||
|
|
||||||
def get_cover_url(self):
|
def get_cover_url(self):
|
||||||
@ -215,9 +199,9 @@ class WSJ(BasicNewsRecipe):
|
|||||||
|
|
||||||
def abs_wsj_url(self, href, modify_query=True):
|
def abs_wsj_url(self, href, modify_query=True):
|
||||||
if not href.startswith('http'):
|
if not href.startswith('http'):
|
||||||
href = 'https://www.wsj.com' + href
|
href = 'https://www.wsj.com' + href.replace('/articles/', '/amp/articles/')
|
||||||
if modify_query:
|
if modify_query:
|
||||||
href = href.split('?')[0] + '?mod=djemalertNEWS'
|
href = href.replace('/articles/', '/amp/articles/')
|
||||||
return href
|
return href
|
||||||
|
|
||||||
def wsj_find_articles(self, url, ahed=False):
|
def wsj_find_articles(self, url, ahed=False):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user