From 6bf762aafae51d7cdc09ea7698d2328698bf78ec Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Tue, 11 Jun 2024 18:16:22 +0530 Subject: [PATCH 1/2] ... --- recipes/wsj.recipe | 37 ++++++++++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/recipes/wsj.recipe b/recipes/wsj.recipe index ea8d158f91..7102da253d 100644 --- a/recipes/wsj.recipe +++ b/recipes/wsj.recipe @@ -1,6 +1,7 @@ import json import re import time +from itertools import zip_longest from datetime import datetime, timedelta from calibre.ebooks.BeautifulSoup import BeautifulSoup @@ -12,6 +13,17 @@ from html5_parser import parse past_edition = None +def media_bucket(x): + if x.get('type', '') == 'image': + return '
{}
\n'.format( + x['manifest-url'], x['caption'] + ' ' + x['credit'] + ) + if x.get('type', '') == 'video': + return '
{}
\n'.format( + x['thumbnail_url'], x['caption'] + ' ' + x['credit'] + ) + return + class WSJ(BasicNewsRecipe): title = 'The Wall Street Journal' __author__ = 'unkn0wn' @@ -36,13 +48,13 @@ class WSJ(BasicNewsRecipe): remove_tags = [ dict(name=['nav', 'svg', 'iframe', 'source']), - dict(name='panel', attrs={'id':'metadata'}), + dict(name='panel', attrs={'id':'summary-image'}), dict(name='panel', attrs={'layout':'inline'}), dict(name='panel', attrs={'embed':'inner-article-ad'}), dict(name='span', attrs={'embed':'ticker'}), classes('lamrelated-articles-inset-panel'), dict(name='p', attrs={'id':[ - 'keywords', 'orig-pubdate-number', 'type', 'is-custom-flashline', 'grouphed', 'author-ids', + 'keywords', 'orig-pubdate-number', 'type', 'is-custom-flashline', 'grouphed', 'author-ids', 'article-manifest', 'body-extract', 'category', 'sub-category', 'socialhed', 'summary', 'deckline', 'article-flashline' ]}), dict(attrs={'data-inset_type':'dynamic'}), @@ -76,8 +88,6 @@ class WSJ(BasicNewsRecipe): if url: url['title'] = self.tag_to_string(url).strip() url.string = '' - for img in soup.findAll('img', attrs={'location':True}): - img['src'] = img['location'] for figc in soup.findAll('figcaption'): figc['class'] = 'figc' col = soup.find('div', text = re.compile('What to Read Next')) @@ -93,6 +103,17 @@ class WSJ(BasicNewsRecipe): for img in soup.findAll('img', src=True): if img['src'].endswith('/OR'): img['src'] = img['src'][:-3] + panel = soup.find('panel', attrs={'id':'metadata'}) + if panel: + buck = panel.find('p', attrs={'id':'media-bucket'}) + if buck: + data = json.loads(buck.string) + buck.extract() + i_lst = [media_bucket(x) for x in data['items']] + m_itm = soup.findAll('panel', attrs={'class':'media-item'}) + if i_lst and m_itm: + for x, y in list(zip_longest(m_itm, i_lst)): + x.insert_after(BeautifulSoup(y, 'html.parser')) return soup if not past_edition: @@ -126,16 +147,16 @@ class WSJ(BasicNewsRecipe): def parse_index(self): index = 'https://bartender.mobile.dowjones.io' catalog = json.loads(self.index_to_soup(index + '/catalogs/v1/wsj/us/catalog.json', raw=True)) - edit = [itm['key'][3:] for itm in catalog['items'] if itm['type'] == 'ITP'][1:] + edit = [itm['key'][10:] for itm in catalog['items'] if itm['type'] == 'ITPNEXTGEN'][1:] self.log('**Past Editions available :', ', '.join(edit)) for itm in catalog['items']: if past_edition: - if itm['key'] == 'ITP' + past_edition: + if itm['key'] == 'ITPNEXTGEN' + past_edition: key = itm['key'] manifest = itm['manifest'] date = itm['date'] break - elif itm['type'] == 'ITP': + elif itm['type'] == 'ITPNEXTGEN': key = itm['key'] manifest = itm['manifest'] date = itm['date'] @@ -153,6 +174,8 @@ class WSJ(BasicNewsRecipe): for k, v in itm.items(): if '-pages_' in k: section = k.split('-pages_')[0].replace('_', ' ') + if 'MAGAZINE' in section: + continue self.log(section) articles = [] From 27bc04f77fb25035abca4c5f2b854a22cca6a957 Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Tue, 11 Jun 2024 18:17:39 +0530 Subject: [PATCH 2/2] Update the_week.recipe --- recipes/the_week.recipe | 1 - 1 file changed, 1 deletion(-) diff --git a/recipes/the_week.recipe b/recipes/the_week.recipe index 793f0089fb..040c7174af 100644 --- a/recipes/the_week.recipe +++ b/recipes/the_week.recipe @@ -24,7 +24,6 @@ class TheWeek(BasicNewsRecipe): remove_tags_after = [classes('articlecontentbody')] extra_css = ''' - img {display:block; margin:0 auto;} em, blockquote { color: #202020; } .article-image, .article-imgbox { text-align:center; font-size:small; } .article-info { font-size:small; }