Update wsj_mag.recipe

This commit is contained in:
unkn0w7n 2024-12-01 13:38:27 +05:30 committed by Kovid Goyal
parent ef995a1ec5
commit 121e69cafe
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -1,8 +1,6 @@
#!/usr/bin/env python #!/usr/bin/env python
# vim:fileencoding=utf-8 # vim:fileencoding=utf-8
import json import json
import time
from datetime import datetime, timedelta
from itertools import zip_longest from itertools import zip_longest
from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.ebooks.BeautifulSoup import BeautifulSoup
@ -93,6 +91,7 @@ class WSJ(BasicNewsRecipe):
byl.insert(0, read) byl.insert(0, read)
url = soup.find('p', attrs={'id':'share-link'}) url = soup.find('p', attrs={'id':'share-link'})
if url: if url:
url.name = 'div'
url['title'] = self.tag_to_string(url).strip() url['title'] = self.tag_to_string(url).strip()
url.string = '' url.string = ''
panel = soup.find('panel', attrs={'id':'metadata'}) panel = soup.find('panel', attrs={'id':'metadata'})
@ -108,6 +107,11 @@ class WSJ(BasicNewsRecipe):
x.insert_after(BeautifulSoup(y, 'html.parser')) x.insert_after(BeautifulSoup(y, 'html.parser'))
return soup return soup
def postprocess_html(self, soup, first_fetch):
    """Rename every ``<panel>`` element in *soup* to ``<div>``.

    Args:
        soup: Parsed article markup; must provide ``findAll``.
        first_fetch: Not used by this method.

    Returns:
        The same ``soup`` object, modified in place.
    """
    # Only the tag name is reassigned; nothing else on the element is touched.
    for pan in soup.findAll('panel'):
        pan.name = 'div'
    return soup
def get_browser(self, *args, **kw): def get_browser(self, *args, **kw):
kw['user_agent'] = 'okhttp/4.10.0' kw['user_agent'] = 'okhttp/4.10.0'
br = BasicNewsRecipe.get_browser(self, *args, **kw) br = BasicNewsRecipe.get_browser(self, *args, **kw)
@ -122,23 +126,16 @@ class WSJ(BasicNewsRecipe):
index = 'https://bartender.mobile.dowjones.io' index = 'https://bartender.mobile.dowjones.io'
catalog = json.loads(self.index_to_soup(index + '/catalogs/v1/wsj/us/catalog.json', raw=True)) catalog = json.loads(self.index_to_soup(index + '/catalogs/v1/wsj/us/catalog.json', raw=True))
for itm in catalog['items']: for itm in catalog['items']:
if itm['type'] == 'MAG': if itm['type'] == 'ITP':
date = itm['date']
manifest = itm['manifest'] manifest = itm['manifest']
self.title = 'WSJ. Magazine: ' + itm['label'] date = itm['date']
break
dt = datetime.fromisoformat(date[:-1]) + timedelta(seconds=time.timezone)
dt = dt.strftime('%b, %Y')
self.log('Downloading ', dt)
self.timefmt = ' [' + dt + ']'
feeds = [] feeds = []
manif = json.loads(self.index_to_soup(index + manifest, raw=True)) manif = json.loads(self.index_to_soup(index + manifest, raw=True))
for itm in manif['items']: for itm in manif['items']:
for k, v in itm.items(): for k, v in itm.items():
if 'WSJMAG_PP' in k:
self.cover_url = v
if '-pages_' in k: if '-pages_' in k:
section = k.split('-pages_')[0].replace('_', ' ') section = k.split('-pages_')[0].replace('_', ' ')
if 'MAGAZINE' not in section: if 'MAGAZINE' not in section:
@ -149,13 +146,6 @@ class WSJ(BasicNewsRecipe):
sec_parse = json.loads(self.index_to_soup(index + v, raw=True)) sec_parse = json.loads(self.index_to_soup(index + v, raw=True))
data = sec_parse['articles'] data = sec_parse['articles']
for art in data:
title = data[art]['headline']
desc = data[art]['summary']
url = index + manifest.rsplit('/', 1)[0] + '/' + data[art]['filename']
self.log(' ', title, '\n\t', desc)
articles.append({'title': title, 'description':desc, 'url': url})
data = sec_parse['decos']
for art in data: for art in data:
title = data[art]['headline'] title = data[art]['headline']
desc = data[art]['summary'] desc = data[art]['summary']
@ -169,6 +159,6 @@ class WSJ(BasicNewsRecipe):
return BeautifulSoup(raw).prettify() return BeautifulSoup(raw).prettify()
def populate_article_metadata(self, article, soup, first): def populate_article_metadata(self, article, soup, first):
lnk = soup.find('p', attrs={'id':'share-link'}) lnk = soup.find('div', attrs={'id':'share-link'})
if lnk: if lnk:
article.url = lnk['title'] article.url = lnk['title']