diff --git a/recipes/wsj.recipe b/recipes/wsj.recipe
index ea8d158f91..6332cad019 100644
--- a/recipes/wsj.recipe
+++ b/recipes/wsj.recipe
@@ -1,6 +1,7 @@
 import json
 import re
 import time
+from itertools import zip_longest
 from datetime import datetime, timedelta
 
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
@@ -12,6 +13,17 @@ from html5_parser import parse
 past_edition = None
 
+def media_bucket(x):
+    if x.get('type', '') == 'image':
+        return '<img src="{}"><div class="figc">{}</div>\n'.format(
+            x['manifest-url'], x['caption'] + ' ' + x['credit']
+        )
+    if x.get('type', '') == 'video':
+        return '<img src="{}"><div class="figc">{}</div>\n'.format(
+            x['thumbnail_url'], x['caption'] + ' ' + x['credit']
+        )
+    return
+
 
 class WSJ(BasicNewsRecipe):
     title = 'The Wall Street Journal'
     __author__ = 'unkn0wn'
@@ -36,13 +48,13 @@ class WSJ(BasicNewsRecipe):
 
     remove_tags = [
         dict(name=['nav', 'svg', 'iframe', 'source']),
-        dict(name='panel', attrs={'id':'metadata'}),
+        dict(name='panel', attrs={'id':'summary-image'}),
         dict(name='panel', attrs={'layout':'inline'}),
         dict(name='panel', attrs={'embed':'inner-article-ad'}),
         dict(name='span', attrs={'embed':'ticker'}),
         classes('lamrelated-articles-inset-panel'),
         dict(name='p', attrs={'id':[
-            'keywords', 'orig-pubdate-number', 'type', 'is-custom-flashline', 'grouphed', 'author-ids',
+            'keywords', 'orig-pubdate-number', 'type', 'is-custom-flashline', 'grouphed', 'author-ids', 'article-manifest',
            'body-extract', 'category', 'sub-category', 'socialhed', 'summary', 'deckline', 'article-flashline'
         ]}),
         dict(attrs={'data-inset_type':'dynamic'}),
@@ -76,8 +88,6 @@ class WSJ(BasicNewsRecipe):
         if url:
             url['title'] = self.tag_to_string(url).strip()
             url.string = ''
-        for img in soup.findAll('img', attrs={'location':True}):
-            img['src'] = img['location']
         for figc in soup.findAll('figcaption'):
             figc['class'] = 'figc'
         col = soup.find('div', text = re.compile('What to Read Next'))
@@ -93,6 +103,17 @@ class WSJ(BasicNewsRecipe):
         for img in soup.findAll('img', src=True):
             if img['src'].endswith('/OR'):
                 img['src'] = img['src'][:-3]
+        panel = soup.find('panel', attrs={'id':'metadata'})
+        if panel:
+            buck = panel.find('p', attrs={'id':'media-bucket'})
+            if buck:
+                data = json.loads(buck.string)
+                buck.extract()
+                i_lst = [media_bucket(x) for x in data['items']]
+                m_itm = soup.findAll('panel', attrs={'class':'media-item'})
+                if i_lst and m_itm:
+                    for x, y in list(zip_longest(m_itm[::-1], i_lst[::-1])):
+                        x.insert_after(BeautifulSoup(y, 'html.parser'))
         return soup
 
     if not past_edition:
@@ -126,16 +147,16 @@ class WSJ(BasicNewsRecipe):
     def parse_index(self):
        index = 'https://bartender.mobile.dowjones.io'
        catalog = json.loads(self.index_to_soup(index + '/catalogs/v1/wsj/us/catalog.json', raw=True))
-        edit = [itm['key'][3:] for itm in catalog['items'] if itm['type'] == 'ITP'][1:]
+        edit = [itm['key'][10:] for itm in catalog['items'] if itm['type'] == 'ITPNEXTGEN'][1:]
        self.log('**Past Editions available :', ', '.join(edit))
        for itm in catalog['items']:
            if past_edition:
-                if itm['key'] == 'ITP' + past_edition:
+                if itm['key'] == 'ITPNEXTGEN' + past_edition:
                    key = itm['key']
                    manifest = itm['manifest']
                    date = itm['date']
                    break
-            elif itm['type'] == 'ITP':
+            elif itm['type'] == 'ITPNEXTGEN':
                key = itm['key']
                manifest = itm['manifest']
                date = itm['date']
@@ -153,6 +174,8 @@ class WSJ(BasicNewsRecipe):
         for k, v in itm.items():
             if '-pages_' in k:
                 section = k.split('-pages_')[0].replace('_', ' ')
+                if 'MAGAZINE' in section:
+                    continue
                 self.log(section)
                 articles = []
 