Update WSJ. Magazine

unkn0w7n 2025-06-08 14:32:35 +05:30
parent 31c8700f68
commit dddc7448d6
2 changed files with 124 additions and 60 deletions

recipes/wsj.recipe

@@ -32,8 +32,9 @@ class WSJ(BasicNewsRecipe):
     title = 'The Wall Street Journal'
     __author__ = 'unkn0wn'
     description = (
-        'The Print Edition of WSJ. The Wall Street Journal is your source for breaking news, analysis and insights from the U.S. and '
-        "around the world, the world's leading business and finance publication."
+        'The Print Edition of WSJ. The Wall Street Journal is your source '
+        'for breaking news, analysis and insights from the U.S. and '
+        'around the world, the world\'s leading business and finance publication.'
     )
     language = 'en_US'
     masthead_url = 'https://s.wsj.net/media/wsj_amp_masthead_lg.png'

recipes/wsj_mag.recipe

@@ -2,11 +2,32 @@
 # vim:fileencoding=utf-8
 import json
 from itertools import zip_longest
+from urllib.parse import quote, urlencode
 
+from calibre import browser
+from calibre.ptempfile import PersistentTemporaryFile
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from calibre.web.feeds.news import BasicNewsRecipe, classes
 
 
+def get_article(article_id):
+    from mechanize import Request
+
+    mat_url = 'https://mats.mobile.dowjones.io/translate/' + article_id + '/jpml'
+    headers = {
+        'User-Agent': 'okhttp/4.10.0',
+        'Accept-Encoding': 'gzip',
+        'Cache-Control': 'no-cache',
+        'x-api-key': ('e''0''5''9''9''5''f''f''4''4''2''1''4''3''2''5''5''e''b''8''3''8''1''f''7''2''d''4''9''1''3''b''f''7''5''0''3''d''6''c'),  # noqa: ISC001
+    }
+
+    br = browser()
+    req = Request(
+        mat_url,
+        headers=headers,
+    )
+    res = br.open(req)
+    return res.read()
+
+
 class WSJ(BasicNewsRecipe):
     title = 'WSJ. Magazine'
     __author__ = 'unkn0wn'
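
Note: the new get_article() helper is what replaces the old per-section manifest downloads — each story is now fetched individually from the Dow Jones MATS "translate" endpoint, which returns the article as JPML markup. A minimal standalone sketch of the same request, using only calls that appear in the diff (the article id is a made-up placeholder, and the x-api-key from the diff would also be needed for a real request):

    from calibre import browser
    from mechanize import Request

    def fetch_jpml(article_id):
        # Same endpoint and app-style User-Agent as get_article() above.
        url = 'https://mats.mobile.dowjones.io/translate/' + article_id + '/jpml'
        req = Request(url, headers={'User-Agent': 'okhttp/4.10.0'})
        return browser().open(req).read()

    jpml = fetch_jpml('some-article-id')  # hypothetical id, for illustration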
@@ -21,13 +42,14 @@ class WSJ(BasicNewsRecipe):
     no_stylesheets = True
     remove_attributes = ['style', 'height', 'width']
     resolve_internal_links = True
+    simultaneous_downloads = 20
 
     recipe_specific_options = {
         'res': {
             'short': 'For hi-res images, select a resolution from the\nfollowing options: 800, 1000, 1200 or 1500',
             'long': 'This is useful for non e-ink devices, and for a lower file size\nthan the default, use 400 or 300.',
-            'default': '600'
-        }
+            'default': '600',
+        },
     }
 
     extra_css = '''
@@ -43,15 +65,30 @@ class WSJ(BasicNewsRecipe):
         dict(name='panel', attrs={'embed': 'inner-article-ad'}),
         dict(name='span', attrs={'embed': 'ticker'}),
         classes('lamrelated-articles-inset-panel'),
-        dict(name='p', attrs={'id':[
-            'keywords', 'orig-pubdate-number', 'type', 'is-custom-flashline', 'grouphed', 'author-ids', 'article-manifest',
-            'body-extract', 'category', 'sub-category', 'socialhed', 'summary', 'deckline', 'article-flashline'
-        ]}),
+        dict(
+            name='p',
+            attrs={
+                'id': [
+                    'keywords',
+                    'orig-pubdate-number',
+                    'type',
+                    'is-custom-flashline',
+                    'grouphed',
+                    'author-ids',
+                    'article-manifest',
+                    'body-extract',
+                    'category',
+                    'sub-category',
+                    'socialhed',
+                    'summary',
+                    'deckline',
+                    'article-flashline',
+                ]
+            },
+        ),
     ]
-    remove_tags_before = [
-        dict(name='p', attrs={'id':'orig-pubdate-string'})
-    ]
+    remove_tags_before = [dict(name='p', attrs={'id': 'orig-pubdate-string'})]
 
     def media_bucket(self, x):
         res = '?width=600'
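
Note: classes() (imported from calibre.web.feeds.news) turns a space-separated list of CSS class names into a BeautifulSoup attribute filter: it matches any tag whose class list intersects the given names. Roughly, as a sketch of what it expands to:

    q = frozenset('lamrelated-articles-inset-panel'.split())
    # Matches tags whose class attribute shares at least one name with q.
    matcher = dict(attrs={'class': lambda x: x and frozenset(x.split()).intersection(q)})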
@@ -59,16 +96,24 @@ class WSJ(BasicNewsRecipe):
         if w and isinstance(w, str):
             res = '?width=' + w
         if x.get('type', '') == 'image':
-            if x.get('subtype', '') == 'graphic' or 'images.wsj.net' not in x['manifest-url']:
+            if (
+                x.get('subtype', '') == 'graphic'
+                or 'images.wsj.net' not in x['manifest-url']
+            ):
                 return '<br><img src="{}"><div class="figc">{}</div>\n'.format(
                     x['manifest-url'], x['caption'] + '<i> ' + x['credit'] + '</i>'
                 )
             return '<br><img src="{}"><div class="figc">{}</div>\n'.format(
-                x['manifest-url'].split('?')[0] + res, x['caption'] + '<i> ' + x['credit'] + '</i>'
+                x['manifest-url'].split('?')[0] + res,
+                x['caption'] + '<i> ' + x['credit'] + '</i>',
             )
         if x.get('type', '') == 'video':
-            return '<br><a href="{}"><img src="{}"></a><div class="figc">{}</div>\n'.format(
-                x['share_link'], x['thumbnail_url'].split('?')[0] + res, x['caption'] + '<i> ' + x['credit'] + '</i>'
+            return (
+                '<br><a href="{}"><img src="{}"></a><div class="figc">{}</div>\n'.format(
+                    x['share_link'],
+                    x['thumbnail_url'].split('?')[0] + res,
+                    x['caption'] + '<i> ' + x['credit'] + '</i>',
+                )
             )
         return
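
Note: media_bucket() rewrites each image URL to the user-selected resolution by dropping the existing query string and appending its own ?width=N (N comes from the 'res' recipe option, default 600). The rewrite is plain string surgery, e.g. (example URL made up):

    url = 'https://images.wsj.net/im-000000/M?width=1240&size=1.5'
    res = '?width=600'
    print(url.split('?')[0] + res)  # -> https://images.wsj.net/im-000000/M?width=600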
@@ -113,50 +158,68 @@ class WSJ(BasicNewsRecipe):
         return soup
 
     def get_browser(self, *args, **kw):
-        kw['user_agent'] = 'okhttp/4.10.0'
         br = BasicNewsRecipe.get_browser(self, *args, **kw)
         br.addheaders += [
-            ('Accept-Encoding', 'gzip'),
-            ('cache-control', 'no-cache'),
-            ('x-api-key', ('e''b''2''4''0''8''c''d''2''7''f''8''9''1''3''d''4''2''1''f''a''3''d''5''c''3''d''0''7''c''c''f''0''3''4''c''b''4''4''8')),  # noqa: ISC001
+            ('apollographql-client-name', 'wsj-mobile-android-release'),
         ]
         return br
 
     def parse_index(self):
-        index = 'https://bartender.mobile.dowjones.io'
-        catalog = json.loads(self.index_to_soup(index + '/catalogs/v1/wsj/us/catalog.json', raw=True))
-        for itm in catalog['items']:
-            if itm['type'] == 'ITP':
-                manifest = itm['manifest']
-                break
+        query = {
+            'operationName': 'IssueQuery',
+            'variables': '{"publication":"WSJ","region":"US","masthead":"ITPNEXTGEN"}',
+            'extensions': '{"persistedQuery":{"version":1,"sha256Hash":"d938226e7d1c1fff050e7d084c72179e2713dcf4736d3a442c618c55b896f847"}}',
+        }
+        url = 'https://shared-data.dowjones.io/gateway/graphql?' + urlencode(
+            query, safe='()!', quote_via=quote
+        )
+        raw = self.index_to_soup(url, raw=True)
+        cat_data = json.loads(raw)['data']['mobileIssuesByMasthead']
+        for itm in cat_data:
+            sections_ = itm['sections']
+            break
 
         feeds = []
 
-        manif = json.loads(self.index_to_soup(index + manifest, raw=True))
-        for itm in manif['items']:
-            for k, v in itm.items():
-                if '-pages_' in k:
-                    section = k.split('-pages_')[0].replace('_', ' ')
-                    if 'MAGAZINE' not in section:
-                        continue
-                    self.log(section)
-
-                    articles = []
-
-                    sec_parse = json.loads(self.index_to_soup(index + v, raw=True))
-                    data = sec_parse['articles']
-                    for art in data:
-                        title = data[art]['headline']
-                        desc = data[art]['summary']
-                        url = index + manifest.rsplit('/', 1)[0] + '/' + data[art]['filename']
-                        self.log(' ', title, '\n\t', desc)
-                        articles.append({'title': title, 'description': desc, 'url': url})
-                    feeds.append((section, articles))
+        sec = sections_[-1]
+        section = sec['label']
+        self.log(section)
+        cont_id = sec['key']
+
+        query = {
+            'operationName': 'SectionQuery',
+            'variables': '{{"id":"{}"}}'.format(cont_id),
+            'extensions': '{"persistedQuery":{"version":1,"sha256Hash":"207fe93376f379bf223ed2734cf9313a28291293366a803db923666fa6b45026"}}',
+        }
+        sec_url = 'https://shared-data.dowjones.io/gateway/graphql?' + urlencode(
+            query, safe='()!', quote_via=quote
+        )
+        sec_raw = self.index_to_soup(sec_url, raw=True)
+        sec_data = json.loads(sec_raw)['data']['summaryCollectionContent'][
+            'collectionItems'
+        ]
+
+        articles = []
+
+        for art in sec_data:
+            for arts in art['collectionItems']:
+                mobi = arts['content']['mobileSummary']
+                title = mobi['headline']['text']
+                desc = mobi['description']['content']['text']
+                art_id = arts['id']
+                self.log(' ', title, '\n\t', desc)
+                art_cont = get_article(art_id)
+                pt = PersistentTemporaryFile('.html')
+                pt.write(art_cont)
+                pt.close()
+                url = 'file:///' + pt.name
+                articles.append({'title': title, 'description': desc, 'url': url})
+        feeds.append((section, articles))
         return feeds
 
+    def preprocess_raw_html(self, raw, url):
+        return BeautifulSoup(raw).prettify()
+
     def populate_article_metadata(self, article, soup, first):
         lnk = soup.find('div', attrs={'id': 'share-link'})
         if lnk:
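
Note: the rewritten parse_index() talks to the Dow Jones GraphQL gateway via persisted queries — the server already knows the query text, so the client sends only the operation name, a pre-serialized 'variables' JSON string, and the query's sha256Hash in 'extensions'. Passing quote_via=quote changes the escaping from urlencode's default quote_plus (quote leaves '/' unescaped and uses %20 rather than '+'), which the gateway presumably requires. A sketch of the URL construction using the IssueQuery values from the diff:

    from urllib.parse import quote, urlencode

    query = {
        'operationName': 'IssueQuery',
        'variables': '{"publication":"WSJ","region":"US","masthead":"ITPNEXTGEN"}',
        'extensions': '{"persistedQuery":{"version":1,"sha256Hash":"d938226e7d1c1fff050e7d084c72179e2713dcf4736d3a442c618c55b896f847"}}',
    }
    url = 'https://shared-data.dowjones.io/gateway/graphql?' + urlencode(
        query, safe='()!', quote_via=quote
    )
    # The apollographql-client-name header added in get_browser() accompanies
    # these requests, identifying the client as the WSJ Android app.
    print(url)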