Update wsj.recipe

This commit is contained in:
unkn0w7n 2024-06-11 12:32:25 +05:30
parent a8c874b398
commit 5a9b329ba0

View File

@ -1,6 +1,7 @@
import json import json
import re import re
import time import time
from itertools import zip_longest
from datetime import datetime, timedelta from datetime import datetime, timedelta
from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.ebooks.BeautifulSoup import BeautifulSoup
@ -12,6 +13,17 @@ from html5_parser import parse
past_edition = None past_edition = None
def media_bucket(x):
    """Render one media-bucket item as an HTML image-plus-caption snippet.

    ``x`` is a dict describing a single media item. Items whose ``type`` is
    ``'image'`` are rendered from their ``'manifest-url'`` entry, and items
    whose ``type`` is ``'video'`` are rendered from their ``'thumbnail_url'``
    entry. The caption text is the item's ``'caption'`` and ``'credit'``
    joined by a single space; a missing or null caption/credit is treated as
    an empty string instead of raising.

    Returns the HTML string (terminated by a newline) for supported types,
    or ``None`` for any other (or absent) type, so callers must skip ``None``
    results before parsing them.

    Raises ``KeyError`` if a supported item lacks its URL entry.
    """
    # Which key holds the picture URL for each supported item type; a video
    # is represented by its thumbnail image.
    url_keys = {'image': 'manifest-url', 'video': 'thumbnail_url'}
    url_key = url_keys.get(x.get('type', ''))
    if url_key is None:
        return None

    # Tolerate items that carry no caption and/or no credit.
    caption = x.get('caption') or ''
    credit = x.get('credit') or ''
    return '<img src="{}"><div class="figc">{}</div>\n'.format(
        x[url_key], caption + ' ' + credit
    )
class WSJ(BasicNewsRecipe): class WSJ(BasicNewsRecipe):
title = 'The Wall Street Journal' title = 'The Wall Street Journal'
__author__ = 'unkn0wn' __author__ = 'unkn0wn'
@ -36,13 +48,13 @@ class WSJ(BasicNewsRecipe):
remove_tags = [ remove_tags = [
dict(name=['nav', 'svg', 'iframe', 'source']), dict(name=['nav', 'svg', 'iframe', 'source']),
dict(name='panel', attrs={'id':'metadata'}), dict(name='panel', attrs={'id':'summary-image'}),
dict(name='panel', attrs={'layout':'inline'}), dict(name='panel', attrs={'layout':'inline'}),
dict(name='panel', attrs={'embed':'inner-article-ad'}), dict(name='panel', attrs={'embed':'inner-article-ad'}),
dict(name='span', attrs={'embed':'ticker'}), dict(name='span', attrs={'embed':'ticker'}),
classes('lamrelated-articles-inset-panel'), classes('lamrelated-articles-inset-panel'),
dict(name='p', attrs={'id':[ dict(name='p', attrs={'id':[
'keywords', 'orig-pubdate-number', 'type', 'is-custom-flashline', 'grouphed', 'author-ids', 'keywords', 'orig-pubdate-number', 'type', 'is-custom-flashline', 'grouphed', 'author-ids', 'article-manifest',
'body-extract', 'category', 'sub-category', 'socialhed', 'summary', 'deckline', 'article-flashline' 'body-extract', 'category', 'sub-category', 'socialhed', 'summary', 'deckline', 'article-flashline'
]}), ]}),
dict(attrs={'data-inset_type':'dynamic'}), dict(attrs={'data-inset_type':'dynamic'}),
@ -76,8 +88,6 @@ class WSJ(BasicNewsRecipe):
if url: if url:
url['title'] = self.tag_to_string(url).strip() url['title'] = self.tag_to_string(url).strip()
url.string = '' url.string = ''
for img in soup.findAll('img', attrs={'location':True}):
img['src'] = img['location']
for figc in soup.findAll('figcaption'): for figc in soup.findAll('figcaption'):
figc['class'] = 'figc' figc['class'] = 'figc'
col = soup.find('div', text = re.compile('What to Read Next')) col = soup.find('div', text = re.compile('What to Read Next'))
@ -93,6 +103,17 @@ class WSJ(BasicNewsRecipe):
for img in soup.findAll('img', src=True): for img in soup.findAll('img', src=True):
if img['src'].endswith('/OR'): if img['src'].endswith('/OR'):
img['src'] = img['src'][:-3] img['src'] = img['src'][:-3]
panel = soup.find('panel', attrs={'id':'metadata'})
if panel:
buck = panel.find('p', attrs={'id':'media-bucket'})
if buck:
data = json.loads(buck.string)
buck.extract()
i_lst = [media_bucket(x) for x in data['items']]
m_itm = soup.findAll('panel', attrs={'class':'media-item'})
if i_lst and m_itm:
for x, y in list(zip_longest(m_itm[::-1], i_lst[::-1])):
x.insert_after(BeautifulSoup(y, 'html.parser'))
return soup return soup
if not past_edition: if not past_edition:
@ -126,16 +147,16 @@ class WSJ(BasicNewsRecipe):
def parse_index(self): def parse_index(self):
index = 'https://bartender.mobile.dowjones.io' index = 'https://bartender.mobile.dowjones.io'
catalog = json.loads(self.index_to_soup(index + '/catalogs/v1/wsj/us/catalog.json', raw=True)) catalog = json.loads(self.index_to_soup(index + '/catalogs/v1/wsj/us/catalog.json', raw=True))
edit = [itm['key'][3:] for itm in catalog['items'] if itm['type'] == 'ITP'][1:] edit = [itm['key'][10:] for itm in catalog['items'] if itm['type'] == 'ITPNEXTGEN'][1:]
self.log('**Past Editions available :', ', '.join(edit)) self.log('**Past Editions available :', ', '.join(edit))
for itm in catalog['items']: for itm in catalog['items']:
if past_edition: if past_edition:
if itm['key'] == 'ITP' + past_edition: if itm['key'] == 'ITPNEXTGEN' + past_edition:
key = itm['key'] key = itm['key']
manifest = itm['manifest'] manifest = itm['manifest']
date = itm['date'] date = itm['date']
break break
elif itm['type'] == 'ITP': elif itm['type'] == 'ITPNEXTGEN':
key = itm['key'] key = itm['key']
manifest = itm['manifest'] manifest = itm['manifest']
date = itm['date'] date = itm['date']
@ -153,6 +174,8 @@ class WSJ(BasicNewsRecipe):
for k, v in itm.items(): for k, v in itm.items():
if '-pages_' in k: if '-pages_' in k:
section = k.split('-pages_')[0].replace('_', ' ') section = k.split('-pages_')[0].replace('_', ' ')
if 'MAGAZINE' in section:
continue
self.log(section) self.log(section)
articles = [] articles = []