Update WSJ. Magazine

unkn0w7n 2025-06-08 14:32:35 +05:30
parent 31c8700f68
commit dddc7448d6
2 changed files with 124 additions and 60 deletions

View File

@@ -32,8 +32,9 @@ class WSJ(BasicNewsRecipe):
     title = 'The Wall Street Journal'
     __author__ = 'unkn0wn'
     description = (
-        'The Print Edition of WSJ. The Wall Street Journal is your source for breaking news, analysis and insights from the U.S. and '
-        "around the world, the world's leading business and finance publication."
+        'The Print Edition of WSJ. The Wall Street Journal is your source '
+        'for breaking news, analysis and insights from the U.S. and '
+        'around the world, the world\'s leading business and finance publication.'
     )
     language = 'en_US'
     masthead_url = 'https://s.wsj.net/media/wsj_amp_masthead_lg.png'

View File

@@ -2,11 +2,32 @@
 # vim:fileencoding=utf-8
 import json
 from itertools import zip_longest
+from urllib.parse import quote, urlencode

+from calibre import browser
+from calibre.ptempfile import PersistentTemporaryFile
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from calibre.web.feeds.news import BasicNewsRecipe, classes


+def get_article(article_id):
+    from mechanize import Request
+    mat_url = 'https://mats.mobile.dowjones.io/translate/' + article_id + '/jpml'
+    headers = {
+        'User-Agent': 'okhttp/4.10.0',
+        'Accept-Encoding': 'gzip',
+        'Cache-Control': 'no-cache',
+        'x-api-key': ('e''0''5''9''9''5''f''f''4''4''2''1''4''3''2''5''5''e''b''8''3''8''1''f''7''2''d''4''9''1''3''b''f''7''5''0''3''d''6''c'),  # noqa: ISC001
+    }
+    br = browser()
+    req = Request(
+        mat_url,
+        headers=headers,
+    )
+    res = br.open(req)
+    return res.read()
+
+
 class WSJ(BasicNewsRecipe):
     title = 'WSJ. Magazine'
     __author__ = 'unkn0wn'
@@ -21,13 +42,14 @@ class WSJ(BasicNewsRecipe):
     no_stylesheets = True
     remove_attributes = ['style', 'height', 'width']
     resolve_internal_links = True
+    simultaneous_downloads = 20

     recipe_specific_options = {
         'res': {
             'short': 'For hi-res images, select a resolution from the\nfollowing options: 800, 1000, 1200 or 1500',
             'long': 'This is useful for non e-ink devices, and for a lower file size\nthan the default, use 400 or 300.',
-            'default': '600'
-        }
+            'default': '600',
+        },
     }

     extra_css = '''
@@ -38,20 +60,35 @@ class WSJ(BasicNewsRecipe):
     '''

     remove_tags = [
-        dict(name='panel', attrs={'id':'summary-image'}),
-        dict(name='panel', attrs={'layout':'inline'}),
-        dict(name='panel', attrs={'embed':'inner-article-ad'}),
-        dict(name='span', attrs={'embed':'ticker'}),
+        dict(name='panel', attrs={'id': 'summary-image'}),
+        dict(name='panel', attrs={'layout': 'inline'}),
+        dict(name='panel', attrs={'embed': 'inner-article-ad'}),
+        dict(name='span', attrs={'embed': 'ticker'}),
         classes('lamrelated-articles-inset-panel'),
-        dict(name='p', attrs={'id':[
-            'keywords', 'orig-pubdate-number', 'type', 'is-custom-flashline', 'grouphed', 'author-ids', 'article-manifest',
-            'body-extract', 'category', 'sub-category', 'socialhed', 'summary', 'deckline', 'article-flashline'
-        ]}),
+        dict(
+            name='p',
+            attrs={
+                'id': [
+                    'keywords',
+                    'orig-pubdate-number',
+                    'type',
+                    'is-custom-flashline',
+                    'grouphed',
+                    'author-ids',
+                    'article-manifest',
+                    'body-extract',
+                    'category',
+                    'sub-category',
+                    'socialhed',
+                    'summary',
+                    'deckline',
+                    'article-flashline',
+                ]
+            },
+        ),
     ]

-    remove_tags_before = [
-        dict(name='p', attrs={'id':'orig-pubdate-string'})
-    ]
+    remove_tags_before = [dict(name='p', attrs={'id': 'orig-pubdate-string'})]

     def media_bucket(self, x):
         res = '?width=600'
@@ -59,16 +96,24 @@ class WSJ(BasicNewsRecipe):
         if w and isinstance(w, str):
             res = '?width=' + w
         if x.get('type', '') == 'image':
-            if x.get('subtype', '') == 'graphic' or 'images.wsj.net' not in x['manifest-url']:
+            if (
+                x.get('subtype', '') == 'graphic'
+                or 'images.wsj.net' not in x['manifest-url']
+            ):
                 return '<br><img src="{}"><div class="figc">{}</div>\n'.format(
                     x['manifest-url'], x['caption'] + '<i> ' + x['credit'] + '</i>'
                 )
             return '<br><img src="{}"><div class="figc">{}</div>\n'.format(
-                x['manifest-url'].split('?')[0] + res, x['caption'] + '<i> ' + x['credit'] + '</i>'
+                x['manifest-url'].split('?')[0] + res,
+                x['caption'] + '<i> ' + x['credit'] + '</i>',
             )
         if x.get('type', '') == 'video':
-            return '<br><a href="{}"><img src="{}"></a><div class="figc">{}</div>\n'.format(
-                x['share_link'], x['thumbnail_url'].split('?')[0] + res, x['caption'] + '<i> ' + x['credit'] + '</i>'
+            return (
+                '<br><a href="{}"><img src="{}"></a><div class="figc">{}</div>\n'.format(
+                    x['share_link'],
+                    x['thumbnail_url'].split('?')[0] + res,
+                    x['caption'] + '<i> ' + x['credit'] + '</i>',
+                )
             )
         return
@@ -76,32 +121,32 @@ class WSJ(BasicNewsRecipe):
         jpml = soup.find('jpml')
         if jpml:
             jpml.name = 'article'
-        h1 = soup.find('p', attrs={'id':'headline'})
+        h1 = soup.find('p', attrs={'id': 'headline'})
         if h1:
             h1.name = 'h1'
         for h2 in soup.findAll('h2'):
             h2.name = 'h4'
-        dt = soup.find('p', attrs={'id':'orig-pubdate-string'})
-        read = soup.find('p', attrs={'id':'time-to-read'})
-        byl = soup.find('p', attrs={'id':'byline'})
-        fl = soup.find('p', attrs={'id':'flashline'})
+        dt = soup.find('p', attrs={'id': 'orig-pubdate-string'})
+        read = soup.find('p', attrs={'id': 'time-to-read'})
+        byl = soup.find('p', attrs={'id': 'byline'})
+        fl = soup.find('p', attrs={'id': 'flashline'})
         if dt and byl and read and fl:
             dt.name = read.name = byl.name = fl.name = 'div'
             byl.insert(0, dt)
             byl.insert(0, read)
-        url = soup.find('p', attrs={'id':'share-link'})
+        url = soup.find('p', attrs={'id': 'share-link'})
         if url:
             url.name = 'div'
             url['title'] = self.tag_to_string(url).strip()
             url.string = ''
-        panel = soup.find('panel', attrs={'id':'metadata'})
+        panel = soup.find('panel', attrs={'id': 'metadata'})
         if panel:
-            buck = panel.find('p', attrs={'id':'media-bucket'})
+            buck = panel.find('p', attrs={'id': 'media-bucket'})
             if buck:
                 data = json.loads(buck.string)
                 buck.extract()
                 i_lst = [self.media_bucket(x) for x in data['items']]
-                m_itm = soup.findAll('panel', attrs={'class':'media-item'})
+                m_itm = soup.findAll('panel', attrs={'class': 'media-item'})
                 if i_lst and m_itm:
                     for x, y in list(zip_longest(m_itm, i_lst)):
                         x.insert_after(BeautifulSoup(y, 'html.parser'))
@@ -113,51 +158,69 @@ class WSJ(BasicNewsRecipe):
         return soup

     def get_browser(self, *args, **kw):
+        kw['user_agent'] = 'okhttp/4.10.0'
         br = BasicNewsRecipe.get_browser(self, *args, **kw)
         br.addheaders += [
-            ('Accept-Encoding', 'gzip'),
-            ('cache-control', 'no-cache'),
-            ('x-api-key', ('e''b''2''4''0''8''c''d''2''7''f''8''9''1''3''d''4''2''1''f''a''3''d''5''c''3''d''0''7''c''c''f''0''3''4''c''b''4''4''8')),  # noqa: ISC001
+            ('apollographql-client-name', 'wsj-mobile-android-release'),
         ]
         return br

     def parse_index(self):
-        index = 'https://bartender.mobile.dowjones.io'
-        catalog = json.loads(self.index_to_soup(index + '/catalogs/v1/wsj/us/catalog.json', raw=True))
-        for itm in catalog['items']:
-            if itm['type'] == 'ITP':
-                manifest = itm['manifest']
-                break
+        query = {
+            'operationName': 'IssueQuery',
+            'variables': '{"publication":"WSJ","region":"US","masthead":"ITPNEXTGEN"}',
+            'extensions': '{"persistedQuery":{"version":1,"sha256Hash":"d938226e7d1c1fff050e7d084c72179e2713dcf4736d3a442c618c55b896f847"}}',
+        }
+        url = 'https://shared-data.dowjones.io/gateway/graphql?' + urlencode(
+            query, safe='()!', quote_via=quote
+        )
+        raw = self.index_to_soup(url, raw=True)
+        cat_data = json.loads(raw)['data']['mobileIssuesByMasthead']
+        for itm in cat_data:
+            sections_ = itm['sections']
+            break

         feeds = []

-        manif = json.loads(self.index_to_soup(index + manifest, raw=True))
-        for itm in manif['items']:
-            for k, v in itm.items():
-                if '-pages_' in k:
-                    section = k.split('-pages_')[0].replace('_', ' ')
-                    if 'MAGAZINE' not in section:
-                        continue
-                    self.log(section)
-
-                    articles = []
-
-                    sec_parse = json.loads(self.index_to_soup(index + v, raw=True))
-                    data = sec_parse['articles']
-                    for art in data:
-                        title = data[art]['headline']
-                        desc = data[art]['summary']
-                        url = index + manifest.rsplit('/', 1)[0] + '/' + data[art]['filename']
-                        self.log(' ', title, '\n\t', desc)
-                        articles.append({'title': title, 'description':desc, 'url': url})
-                    feeds.append((section, articles))
+        sec = sections_[-1]
+        section = sec['label']
+        self.log(section)
+        cont_id = sec['key']
+
+        query = {
+            'operationName': 'SectionQuery',
+            'variables': '{{"id":"{}"}}'.format(cont_id),
+            'extensions': '{"persistedQuery":{"version":1,"sha256Hash":"207fe93376f379bf223ed2734cf9313a28291293366a803db923666fa6b45026"}}',
+        }
+        sec_url = 'https://shared-data.dowjones.io/gateway/graphql?' + urlencode(
+            query, safe='()!', quote_via=quote
+        )
+        sec_raw = self.index_to_soup(sec_url, raw=True)
+
+        sec_data = json.loads(sec_raw)['data']['summaryCollectionContent'][
+            'collectionItems'
+        ]
+
+        articles = []
+
+        for art in sec_data:
+            for arts in art['collectionItems']:
+                mobi = arts['content']['mobileSummary']
+                title = mobi['headline']['text']
+                desc = mobi['description']['content']['text']
+                art_id = arts['id']
+                self.log(' ', title, '\n\t', desc)
+                art_cont = get_article(art_id)
+                pt = PersistentTemporaryFile('.html')
+                pt.write(art_cont)
+                pt.close()
+                url = 'file:///' + pt.name
+                articles.append({'title': title, 'description': desc, 'url': url})
+        feeds.append((section, articles))
         return feeds

+    def preprocess_raw_html(self, raw, url):
+        return BeautifulSoup(raw).prettify()
+
     def populate_article_metadata(self, article, soup, first):
-        lnk = soup.find('div', attrs={'id':'share-link'})
+        lnk = soup.find('div', attrs={'id': 'share-link'})
         if lnk:
             article.url = lnk['title']
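
For reference, a minimal standalone sketch of how the persisted GraphQL query that the updated parse_index relies on can be built and fetched outside calibre. The endpoint, headers, response keys and sha256Hash value are copied from the diff above; the json.dumps serialisation, the urllib request and the helper names are illustrative assumptions, and Dow Jones may rotate the hashes or require additional headers at any time.

# Standalone sketch: build and fetch the WSJ 'IssueQuery' persisted GraphQL query.
# Assumptions: endpoint, headers and hash are taken verbatim from the recipe diff
# above; error handling is omitted and the server-side contract may change.
import json
from urllib.parse import quote, urlencode
from urllib.request import Request, urlopen

GATEWAY = 'https://shared-data.dowjones.io/gateway/graphql?'
HEADERS = {
    'User-Agent': 'okhttp/4.10.0',
    'apollographql-client-name': 'wsj-mobile-android-release',
}


def gateway_url(operation, variables, sha256_hash):
    # The recipe passes variables/extensions as compact JSON strings and lets
    # urlencode() percent-encode them, keeping '(', ')' and '!' literal.
    query = {
        'operationName': operation,
        'variables': json.dumps(variables, separators=(',', ':')),
        'extensions': json.dumps(
            {'persistedQuery': {'version': 1, 'sha256Hash': sha256_hash}},
            separators=(',', ':'),
        ),
    }
    return GATEWAY + urlencode(query, safe='()!', quote_via=quote)


def fetch_json(url):
    # Plain urllib request with the same headers the recipe's browser sends.
    with urlopen(Request(url, headers=HEADERS)) as res:
        return json.loads(res.read())


if __name__ == '__main__':
    issue_url = gateway_url(
        'IssueQuery',
        {'publication': 'WSJ', 'region': 'US', 'masthead': 'ITPNEXTGEN'},
        'd938226e7d1c1fff050e7d084c72179e2713dcf4736d3a442c618c55b896f847',
    )
    issues = fetch_json(issue_url)['data']['mobileIssuesByMasthead']
    # parse_index takes the first issue and uses its last section (the magazine).
    print(issues[0]['sections'][-1]['label'], issues[0]['sections'][-1]['key'])

A persisted query sends only the registered query's hash plus variables, so the client never ships the GraphQL document itself; the SectionQuery call in the recipe follows the same pattern with its own hash, while get_article() hits the separate mats.mobile.dowjones.io translate endpoint using its x-api-key header.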