From 121e69cafe94ba1dcae9fda942ba358a33939643 Mon Sep 17 00:00:00 2001
From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com>
Date: Sun, 1 Dec 2024 13:38:27 +0530
Subject: [PATCH] Update wsj_mag.recipe

---
 recipes/wsj_mag.recipe | 30 ++++++++++--------------------
 1 file changed, 10 insertions(+), 20 deletions(-)

diff --git a/recipes/wsj_mag.recipe b/recipes/wsj_mag.recipe
index cae5fc0b68..a69e43ef13 100644
--- a/recipes/wsj_mag.recipe
+++ b/recipes/wsj_mag.recipe
@@ -1,8 +1,6 @@
 #!/usr/bin/env python
 # vim:fileencoding=utf-8
 import json
-import time
-from datetime import datetime, timedelta
 from itertools import zip_longest
 
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
@@ -93,6 +91,7 @@ class WSJ(BasicNewsRecipe):
             byl.insert(0, read)
         url = soup.find('p', attrs={'id':'share-link'})
         if url:
+            url.name = 'div'
             url['title'] = self.tag_to_string(url).strip()
             url.string = ''
         panel = soup.find('panel', attrs={'id':'metadata'})
@@ -108,6 +107,11 @@ class WSJ(BasicNewsRecipe):
                 x.insert_after(BeautifulSoup(y, 'html.parser'))
         return soup
 
+    def postprocess_html(self, soup, first_fetch):
+        for pan in soup.findAll('panel'):
+            pan.name = 'div'
+        return soup
+
     def get_browser(self, *args, **kw):
         kw['user_agent'] = 'okhttp/4.10.0'
         br = BasicNewsRecipe.get_browser(self, *args, **kw)
@@ -122,23 +126,16 @@ class WSJ(BasicNewsRecipe):
         index = 'https://bartender.mobile.dowjones.io'
         catalog = json.loads(self.index_to_soup(index + '/catalogs/v1/wsj/us/catalog.json', raw=True))
         for itm in catalog['items']:
-            if itm['type'] == 'MAG':
-                date = itm['date']
+            if itm['type'] == 'ITP':
                 manifest = itm['manifest']
-                self.title = 'WSJ. Magazine: ' + itm['label']
-
-        dt = datetime.fromisoformat(date[:-1]) + timedelta(seconds=time.timezone)
-        dt = dt.strftime('%b, %Y')
-        self.log('Downloading ', dt)
-        self.timefmt = ' [' + dt + ']'
+                date = itm['date']
+                break
 
         feeds = []
 
         manif = json.loads(self.index_to_soup(index + manifest, raw=True))
         for itm in manif['items']:
             for k, v in itm.items():
-                if 'WSJMAG_PP' in k:
-                    self.cover_url = v
                 if '-pages_' in k:
                     section = k.split('-pages_')[0].replace('_', ' ')
                     if 'MAGAZINE' not in section:
@@ -149,13 +146,6 @@ class WSJ(BasicNewsRecipe):
                     sec_parse = json.loads(self.index_to_soup(index + v, raw=True))
                     data = sec_parse['articles']
-                    for art in data:
-                        title = data[art]['headline']
-                        desc = data[art]['summary']
-                        url = index + manifest.rsplit('/', 1)[0] + '/' + data[art]['filename']
-                        self.log(' ', title, '\n\t', desc)
-                        articles.append({'title': title, 'description':desc, 'url': url})
-
                     data = sec_parse['decos']
                     for art in data:
                         title = data[art]['headline']
                         desc = data[art]['summary']
@@ -169,6 +159,6 @@ class WSJ(BasicNewsRecipe):
         return BeautifulSoup(raw).prettify()
 
     def populate_article_metadata(self, article, soup, first):
-        lnk = soup.find('p', attrs={'id':'share-link'})
+        lnk = soup.find('div', attrs={'id':'share-link'})
         if lnk:
             article.url = lnk['title']