Update wsj.recipe

unkn0w7n 2025-06-08 13:26:39 +05:30
parent 49934879c7
commit a0a1b8a8f3

recipes/wsj.recipe

@@ -1,10 +1,12 @@
 #!/usr/bin/env python
 # vim:fileencoding=utf-8
 import json
-import time
 from datetime import datetime, timedelta
 from itertools import zip_longest
+from urllib.parse import quote, urlencode

+from mechanize import Request
+
+from calibre.ptempfile import PersistentTemporaryFile
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from calibre.web.feeds.news import BasicNewsRecipe, classes
@@ -23,17 +25,18 @@ class WSJ(BasicNewsRecipe):
     no_stylesheets = True
     remove_attributes = ['style', 'height', 'width']
     resolve_internal_links = True
+    simultaneous_downloads = 20

     recipe_specific_options = {
         'date': {
-            'short': 'The date of the edition to download (YYYYMMDD format)\nOnly the past 6 editions will be available ',
-            'long': 'For example, 20240513'
+            'short': 'The date of the edition to download (YYYY-MM-DD format)\nOnly the past 6 editions will be available ',
+            'long': 'For example, 2024-05-13',
         },
         'res': {
             'short': 'For hi-res images, select a resolution from the\nfollowing options: 800, 1000, 1200 or 1500',
             'long': 'This is useful for non e-ink devices, and for a lower file size\nthan the default, use 400 or 300.',
-            'default': '600'
-        }
+            'default': '600',
+        },
     }

     extra_css = '''
@@ -49,15 +52,30 @@ class WSJ(BasicNewsRecipe):
         dict(name='panel', attrs={'embed': 'inner-article-ad'}),
         dict(name='span', attrs={'embed': 'ticker'}),
         classes('lamrelated-articles-inset-panel'),
-        dict(name='p', attrs={'id':[
-            'keywords', 'orig-pubdate-number', 'type', 'is-custom-flashline', 'grouphed', 'author-ids', 'article-manifest',
-            'body-extract', 'category', 'sub-category', 'socialhed', 'summary', 'deckline', 'article-flashline'
-        ]}),
+        dict(
+            name='p',
+            attrs={
+                'id': [
+                    'keywords',
+                    'orig-pubdate-number',
+                    'type',
+                    'is-custom-flashline',
+                    'grouphed',
+                    'author-ids',
+                    'article-manifest',
+                    'body-extract',
+                    'category',
+                    'sub-category',
+                    'socialhed',
+                    'summary',
+                    'deckline',
+                    'article-flashline',
+                ]
+            },
+        ),
     ]
-    remove_tags_before = [
-        dict(name='p', attrs={'id':'orig-pubdate-string'})
-    ]
+    remove_tags_before = [dict(name='p', attrs={'id': 'orig-pubdate-string'})]

     def media_bucket(self, x):
         res = '?width=600'
@@ -65,16 +83,24 @@ class WSJ(BasicNewsRecipe):
         if w and isinstance(w, str):
             res = '?width=' + w
         if x.get('type', '') == 'image':
-            if x.get('subtype', '') == 'graphic' or 'images.wsj.net' not in x['manifest-url']:
+            if (
+                x.get('subtype', '') == 'graphic'
+                or 'images.wsj.net' not in x['manifest-url']
+            ):
                 return '<br><img src="{}"><div class="figc">{}</div>\n'.format(
                     x['manifest-url'], x['caption'] + '<i> ' + x['credit'] + '</i>'
                 )
             return '<br><img src="{}"><div class="figc">{}</div>\n'.format(
-                x['manifest-url'].split('?')[0] + res, x['caption'] + '<i> ' + x['credit'] + '</i>'
+                x['manifest-url'].split('?')[0] + res,
+                x['caption'] + '<i> ' + x['credit'] + '</i>',
             )
         if x.get('type', '') == 'video':
-            return '<br><a href="{}"><img src="{}"></a><div class="figc">{}</div>\n'.format(
-                x['share_link'], x['thumbnail_url'].split('?')[0] + res, x['caption'] + '<i> ' + x['credit'] + '</i>'
-            )
+            return (
+                '<br><a href="{}"><img src="{}"></a><div class="figc">{}</div>\n'.format(
+                    x['share_link'],
+                    x['thumbnail_url'].split('?')[0] + res,
+                    x['caption'] + '<i> ' + x['credit'] + '</i>',
+                )
+            )
         return
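Note: the image branch above keys everything off the `?width=` query parameter. To make that concrete, here is a hypothetical media item (all field values invented for illustration; the field names match those media_bucket() reads) and the markup it would emit at the default 600px resolution:

item = {
    'type': 'image',
    'subtype': 'photo',
    'manifest-url': 'https://images.wsj.net/im-00000000/original.jpg?size=1',
    'caption': 'Example caption.',
    'credit': 'Example credit',
}
# media_bucket(item) strips the original query string and appends res, returning:
# '<br><img src="https://images.wsj.net/im-00000000/original.jpg?width=600"><div class="figc">Example caption.<i> Example credit</i></div>\n'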
@@ -126,10 +152,14 @@ class WSJ(BasicNewsRecipe):
         from calibre import browser
         from calibre.utils.img import save_cover_data_to

         br = browser()
         raw = br.open('https://www.frontpages.com/the-wall-street-journal/')
         soup = BeautifulSoup(raw.read())
-        cu = 'https://www.frontpages.com' + soup.find('img', attrs={'id':'giornale-img'})['src']
+        cu = (
+            'https://www.frontpages.com'
+            + soup.find('img', attrs={'id': 'giornale-img'})['src']
+        )
         self.report_progress(1, _('Downloading cover from %s') % cu)
         with closing(br.open(cu, timeout=self.timeout)) as r:
             cdata = r.read()
@@ -138,61 +168,77 @@ class WSJ(BasicNewsRecipe):
         self.cover_path = cpath

     def get_browser(self, *args, **kw):
+        kw['user_agent'] = 'okhttp/4.10.0'
         br = BasicNewsRecipe.get_browser(self, *args, **kw)
         br.addheaders += [
-            ('Accept-Encoding', 'gzip'),
-            ('cache-control', 'no-cache'),
-            ('x-api-key', ('e''b''2''4''0''8''c''d''2''7''f''8''9''1''3''d''4''2''1''f''a''3''d''5''c''3''d''0''7''c''c''f''0''3''4''c''b''4''4''8')),  # noqa: ISC001
+            ('apollographql-client-name', 'wsj-mobile-android-release'),
         ]
         return br
     def parse_index(self):
-        index = 'https://bartender.mobile.dowjones.io'
-        catalog = json.loads(self.index_to_soup(index + '/catalogs/v1/wsj/us/catalog.json', raw=True))
-        edit = [''.join([n for n in itm['key'] if n.isdigit()]) for itm in catalog['items'] if itm['type'] == 'ITP'][1:]
-        self.log('**Past Editions available :', ', '.join(edit))
+        query = {
+            'operationName': 'IssueQuery',
+            'variables': '{"publication":"WSJ","region":"US","masthead":"ITPNEXTGEN"}',
+            'extensions': '{"persistedQuery":{"version":1,"sha256Hash":"d938226e7d1c1fff050e7d084c72179e2713dcf4736d3a442c618c55b896f847"}}',
+        }
+        url = 'https://shared-data.dowjones.io/gateway/graphql?' + urlencode(
+            query, safe='()!', quote_via=quote
+        )
+        raw = self.index_to_soup(url, raw=True)
+        cat_data = json.loads(raw)['data']['mobileIssuesByMasthead']
+        edit = [x['datedLabel'] for x in cat_data][1:]
+        self.log('**Past Editions available : ' + ' | '.join(edit))

         past_edition = self.recipe_specific_options.get('date')

-        for itm in catalog['items']:
+        for itm in cat_data:
             if past_edition and isinstance(past_edition, str):
-                if past_edition in itm['key']:
-                    manifest = itm['manifest']
-                    date = itm['date']
+                if past_edition in itm['publishedDateUtc']:
+                    self.timefmt = ' [' + itm['datedLabel'] + ']'
+                    sections_ = itm['sections']
                     break
-            elif itm['type'] == 'ITP':
-                manifest = itm['manifest']
-                date = itm['date']
-                break
+                continue
+            self.timefmt = f' [{itm["datedLabel"]}]'
+            sections_ = itm['sections']
+            break

-        dt = datetime.fromisoformat(date[:-1]) + timedelta(seconds=time.timezone)
-        dt_ = dt.strftime('%b %d, %Y')
-        self.log('Downloading ', dt_)
-        self.timefmt = ' [' + dt_ + ']'
+        self.log('Downloading ', self.timefmt)

         feeds = []

-        manif = json.loads(self.index_to_soup(index + manifest, raw=True))
-        for itm in manif['items']:
-            for k, v in itm.items():
-                if '-pages_' in k:
-                    section = k.split('-pages_')[0].replace('_', ' ')
-                    if 'MAGAZINE' in section:
-                        if not (dt.day in {1, 2, 3, 4, 5, 6, 7} and dt.weekday() == 5):
-                            continue
-                        self.log('Loading Magazine section')
-                    self.log(section)
-
-                    articles = []
-
-                    sec_parse = json.loads(self.index_to_soup(index + v, raw=True))
-                    data = sec_parse['articles']
-                    for art in data:
-                        title = data[art]['headline']
-                        desc = data[art]['summary']
-                        url = index + manifest.rsplit('/', 1)[0] + '/' + data[art]['filename']
-                        self.log(' ', title, '\n\t', desc)
-                        articles.append({'title': title, 'description': desc, 'url': url})
-                    feeds.append((section, articles))
+        for sec in sections_[:-1]:
+            section = sec['label']
+            self.log(section)
+            cont_id = sec['key']
+            query = {
+                'operationName': 'SectionQuery',
+                'variables': '{{"id":"{}"}}'.format(cont_id),
+                'extensions': '{"persistedQuery":{"version":1,"sha256Hash":"207fe93376f379bf223ed2734cf9313a28291293366a803db923666fa6b45026"}}',
+            }
+            sec_url = 'https://shared-data.dowjones.io/gateway/graphql?' + urlencode(
+                query, safe='()!', quote_via=quote
+            )
+            sec_raw = self.index_to_soup(sec_url, raw=True)
+            sec_data = json.loads(sec_raw)['data']['summaryCollectionContent'][
+                'collectionItems'
+            ]

+            articles = []

+            for art in sec_data:
+                for arts in art['collectionItems']:
+                    mobi = arts['content']['mobileSummary']
+                    title = mobi['headline']['text']
+                    desc = mobi['description']['content']['text']
+                    art_id = arts['id']
+                    self.log(' ', title, '\n\t', desc)
+                    art_cont = self.get_article(art_id)
+                    pt = PersistentTemporaryFile('.html')
+                    pt.write(art_cont)
+                    pt.close()
+                    url = 'file:///' + pt.name
+                    articles.append({'title': title, 'description': desc, 'url': url})
+            feeds.append((section, articles))
         return feeds
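For reference, the IssueQuery call that replaces the old bartender catalog fetch can be exercised outside calibre. A minimal standalone sketch using only the Python standard library, with the endpoint, persisted-query hash, and headers taken verbatim from this diff (it assumes the hash and masthead are still current on the server side):

import json
from urllib.parse import quote, urlencode
from urllib.request import Request, urlopen

query = {
    'operationName': 'IssueQuery',
    'variables': '{"publication":"WSJ","region":"US","masthead":"ITPNEXTGEN"}',
    'extensions': '{"persistedQuery":{"version":1,"sha256Hash":"d938226e7d1c1fff050e7d084c72179e2713dcf4736d3a442c618c55b896f847"}}',
}
url = 'https://shared-data.dowjones.io/gateway/graphql?' + urlencode(
    query, safe='()!', quote_via=quote
)
req = Request(url, headers={
    'User-Agent': 'okhttp/4.10.0',
    'apollographql-client-name': 'wsj-mobile-android-release',
})
with urlopen(req) as r:
    issues = json.loads(r.read())['data']['mobileIssuesByMasthead']
# Each issue carries the fields the recipe reads: datedLabel,
# publishedDateUtc and sections.
for itm in issues:
    print(itm['datedLabel'], '|', itm['publishedDateUtc'])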
@@ -200,6 +246,23 @@ class WSJ(BasicNewsRecipe):
     def preprocess_raw_html(self, raw, url):
         return BeautifulSoup(raw).prettify()

+    def get_article(self, article_id):
+        from calibre import browser
+        mat_url = 'https://mats.mobile.dowjones.io/translate/' + article_id + '/jpml'
+        headers = {
+            'User-Agent': 'okhttp/4.10.0',
+            'Accept-Encoding': 'gzip',
+            'Cache-Control': 'no-cache',
+            'x-api-key': ('e''0''5''9''9''5''f''f''4''4''2''1''4''3''2''5''5''e''b''8''3''8''1''f''7''2''d''4''9''1''3''b''f''7''5''0''3''d''6''c'),  # noqa: ISC001
+        }
+        br = browser()
+        req = Request(
+            mat_url,
+            headers=headers,
+        )
+        res = br.open(req)
+        return res.read()
+
     def populate_article_metadata(self, article, soup, first):
         lnk = soup.find('div', attrs={'id': 'share-link'})
         if lnk:
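The article bodies come from a separate JPML endpoint. A hedged sketch of the request get_article() builds, again with only the standard library; ARTICLE_ID is a placeholder (real ids come from the SectionQuery results), and API_KEY is just the x-api-key constant that the recipe splits into quoted fragments:

from urllib.request import Request, urlopen

ARTICLE_ID = 'hypothetical-article-id'  # placeholder, not a real id
API_KEY = 'e05995ff442143255eb8381f72d4913bf7503d6c'  # fragments joined
req = Request(
    'https://mats.mobile.dowjones.io/translate/' + ARTICLE_ID + '/jpml',
    headers={'User-Agent': 'okhttp/4.10.0', 'x-api-key': API_KEY},
)
with urlopen(req) as r:
    jpml = r.read()  # JPML markup, later tidied by preprocess_raw_html()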