diff --git a/recipes/wsj.recipe b/recipes/wsj.recipe
index ea8d158f91..6332cad019 100644
--- a/recipes/wsj.recipe
+++ b/recipes/wsj.recipe
@@ -1,6 +1,7 @@
import json
import re
import time
+from itertools import zip_longest
from datetime import datetime, timedelta
from calibre.ebooks.BeautifulSoup import BeautifulSoup
@@ -12,6 +13,17 @@ from html5_parser import parse
past_edition = None
+def media_bucket(x):
+ if x.get('type', '') == 'image':
+        return '<img src="{}"><div class="figc">{}</div>\n'.format(
+ x['manifest-url'], x['caption'] + ' ' + x['credit']
+ )
+ if x.get('type', '') == 'video':
+        return '<img src="{}"><div class="figc">{}</div>\n'.format(
+ x['thumbnail_url'], x['caption'] + ' ' + x['credit']
+ )
+ return
+
class WSJ(BasicNewsRecipe):
title = 'The Wall Street Journal'
__author__ = 'unkn0wn'
@@ -36,13 +48,13 @@ class WSJ(BasicNewsRecipe):
remove_tags = [
dict(name=['nav', 'svg', 'iframe', 'source']),
- dict(name='panel', attrs={'id':'metadata'}),
+ dict(name='panel', attrs={'id':'summary-image'}),
dict(name='panel', attrs={'layout':'inline'}),
dict(name='panel', attrs={'embed':'inner-article-ad'}),
dict(name='span', attrs={'embed':'ticker'}),
classes('lamrelated-articles-inset-panel'),
dict(name='p', attrs={'id':[
- 'keywords', 'orig-pubdate-number', 'type', 'is-custom-flashline', 'grouphed', 'author-ids',
+ 'keywords', 'orig-pubdate-number', 'type', 'is-custom-flashline', 'grouphed', 'author-ids', 'article-manifest',
'body-extract', 'category', 'sub-category', 'socialhed', 'summary', 'deckline', 'article-flashline'
]}),
dict(attrs={'data-inset_type':'dynamic'}),
@@ -76,8 +88,6 @@ class WSJ(BasicNewsRecipe):
if url:
url['title'] = self.tag_to_string(url).strip()
url.string = ''
- for img in soup.findAll('img', attrs={'location':True}):
- img['src'] = img['location']
for figc in soup.findAll('figcaption'):
figc['class'] = 'figc'
col = soup.find('div', text = re.compile('What to Read Next'))
@@ -93,6 +103,17 @@ class WSJ(BasicNewsRecipe):
for img in soup.findAll('img', src=True):
if img['src'].endswith('/OR'):
img['src'] = img['src'][:-3]
+ panel = soup.find('panel', attrs={'id':'metadata'})
+ if panel:
+ buck = panel.find('p', attrs={'id':'media-bucket'})
+ if buck:
+ data = json.loads(buck.string)
+ buck.extract()
+ i_lst = [media_bucket(x) for x in data['items']]
+ m_itm = soup.findAll('panel', attrs={'class':'media-item'})
+ if i_lst and m_itm:
+ for x, y in list(zip_longest(m_itm[::-1], i_lst[::-1])):
+ x.insert_after(BeautifulSoup(y, 'html.parser'))
return soup
if not past_edition:
@@ -126,16 +147,16 @@ class WSJ(BasicNewsRecipe):
def parse_index(self):
index = 'https://bartender.mobile.dowjones.io'
catalog = json.loads(self.index_to_soup(index + '/catalogs/v1/wsj/us/catalog.json', raw=True))
- edit = [itm['key'][3:] for itm in catalog['items'] if itm['type'] == 'ITP'][1:]
+ edit = [itm['key'][10:] for itm in catalog['items'] if itm['type'] == 'ITPNEXTGEN'][1:]
self.log('**Past Editions available :', ', '.join(edit))
for itm in catalog['items']:
if past_edition:
- if itm['key'] == 'ITP' + past_edition:
+ if itm['key'] == 'ITPNEXTGEN' + past_edition:
key = itm['key']
manifest = itm['manifest']
date = itm['date']
break
- elif itm['type'] == 'ITP':
+ elif itm['type'] == 'ITPNEXTGEN':
key = itm['key']
manifest = itm['manifest']
date = itm['date']
@@ -153,6 +174,8 @@ class WSJ(BasicNewsRecipe):
for k, v in itm.items():
if '-pages_' in k:
section = k.split('-pages_')[0].replace('_', ' ')
+ if 'MAGAZINE' in section:
+ continue
self.log(section)
articles = []