Update wsj.recipe
commit 5a9b329ba0
parent a8c874b398
@@ -1,6 +1,7 @@
 import json
 import re
 import time
+from itertools import zip_longest
 from datetime import datetime, timedelta

 from calibre.ebooks.BeautifulSoup import BeautifulSoup
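The new zip_longest import backs the media pairing added to preprocess_html further down. A minimal sketch of its padding behaviour, with invented list contents:

from itertools import zip_longest

# Illustrative only: zip_longest pads the shorter sequence with None instead of
# truncating, so every media-item panel can be paired with a bucket entry even
# when the two lists differ in length.
panels = ['panel-1', 'panel-2', 'panel-3']              # hypothetical placeholders
markup = ['<img src="a.jpg">', '<img src="b.jpg">']     # hypothetical placeholders
print(list(zip_longest(panels, markup)))
# [('panel-1', '<img src="a.jpg">'), ('panel-2', '<img src="b.jpg">'), ('panel-3', None)]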
@@ -12,6 +13,17 @@ from html5_parser import parse
 past_edition = None


+def media_bucket(x):
+    if x.get('type', '') == 'image':
+        return '<img src="{}"><div class="figc">{}</div>\n'.format(
+            x['manifest-url'], x['caption'] + ' ' + x['credit']
+        )
+    if x.get('type', '') == 'video':
+        return '<img src="{}"><div class="figc">{}</div>\n'.format(
+            x['thumbnail_url'], x['caption'] + ' ' + x['credit']
+        )
+    return
+
 class WSJ(BasicNewsRecipe):
     title = 'The Wall Street Journal'
     __author__ = 'unkn0wn'
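A usage sketch for the new media_bucket() helper; the item dict below is invented, but the field names are the ones the function reads:

# Illustrative only: assumes media_bucket() from the hunk above is in scope.
item = {
    'type': 'image',
    'manifest-url': 'https://images.example.test/photo.jpg',  # hypothetical URL
    'caption': 'A caption.',
    'credit': 'PHOTO: EXAMPLE',
}
print(media_bucket(item))
# <img src="https://images.example.test/photo.jpg"><div class="figc">A caption. PHOTO: EXAMPLE</div>

Video items go through the second branch and use 'thumbnail_url' instead; items of any other type fall through to the bare return and yield None.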
@@ -36,13 +48,13 @@ class WSJ(BasicNewsRecipe):
     remove_tags = [
         dict(name=['nav', 'svg', 'iframe', 'source']),
         dict(name='panel', attrs={'id':'metadata'}),
         dict(name='panel', attrs={'id':'summary-image'}),
         dict(name='panel', attrs={'layout':'inline'}),
         dict(name='panel', attrs={'embed':'inner-article-ad'}),
         dict(name='span', attrs={'embed':'ticker'}),
         classes('lamrelated-articles-inset-panel'),
         dict(name='p', attrs={'id':[
-            'keywords', 'orig-pubdate-number', 'type', 'is-custom-flashline', 'grouphed', 'author-ids',
+            'keywords', 'orig-pubdate-number', 'type', 'is-custom-flashline', 'grouphed', 'author-ids', 'article-manifest',
             'body-extract', 'category', 'sub-category', 'socialhed', 'summary', 'deckline', 'article-flashline'
         ]}),
         dict(attrs={'data-inset_type':'dynamic'}),
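The only change here is the extra 'article-manifest' id, presumably a metadata paragraph in the newer article markup that would otherwise leak into the output. A rough sketch of what that rule matches (markup invented, using bs4 directly):

from bs4 import BeautifulSoup

# Illustrative only: any <p> whose id is in the listed set is stripped during
# cleanup; 'article-manifest' is the newly covered id.
html = '<article><p id="article-manifest">{"items": []}</p><p>Body text.</p></article>'
soup = BeautifulSoup(html, 'html.parser')
for p in soup.find_all('p', attrs={'id': ['article-manifest']}):
    p.extract()
print(soup)
# <article><p>Body text.</p></article>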
@@ -76,8 +88,6 @@ class WSJ(BasicNewsRecipe):
         if url:
             url['title'] = self.tag_to_string(url).strip()
             url.string = ''
-        for img in soup.findAll('img', attrs={'location':True}):
-            img['src'] = img['location']
         for figc in soup.findAll('figcaption'):
             figc['class'] = 'figc'
         col = soup.find('div', text = re.compile('What to Read Next'))
@@ -93,6 +103,17 @@ class WSJ(BasicNewsRecipe):
         for img in soup.findAll('img', src=True):
             if img['src'].endswith('/OR'):
                 img['src'] = img['src'][:-3]
+        panel = soup.find('panel', attrs={'id':'metadata'})
+        if panel:
+            buck = panel.find('p', attrs={'id':'media-bucket'})
+            if buck:
+                data = json.loads(buck.string)
+                buck.extract()
+                i_lst = [media_bucket(x) for x in data['items']]
+                m_itm = soup.findAll('panel', attrs={'class':'media-item'})
+                if i_lst and m_itm:
+                    for x, y in list(zip_longest(m_itm[::-1], i_lst[::-1])):
+                        x.insert_after(BeautifulSoup(y, 'html.parser'))
         return soup

     if not past_edition:
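A sketch of what the new block consumes: the media-bucket JSON payload below is invented but carries the fields the code reads, and it assumes media_bucket() from the earlier hunk is in scope.

import json
from itertools import zip_longest

# Illustrative only: a made-up media-bucket payload of the shape parsed above.
raw = '''{"items": [
  {"type": "image", "manifest-url": "https://example.test/a.jpg", "caption": "First figure.", "credit": "PHOTO: A"},
  {"type": "video", "thumbnail_url": "https://example.test/b.jpg", "caption": "Second figure.", "credit": "WSJ"}
]}'''
data = json.loads(raw)
i_lst = [media_bucket(x) for x in data['items']]
panels = ['panel-1', 'panel-2']  # stand-ins for the <panel class="media-item"> tags
# Walking both lists in reverse means each insert_after() lands right behind its
# own panel without disturbing the panels that are still to be processed.
for panel, snippet in zip_longest(panels[::-1], i_lst[::-1]):
    print(panel, '->', snippet.strip())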
@@ -126,16 +147,16 @@ class WSJ(BasicNewsRecipe):
     def parse_index(self):
         index = 'https://bartender.mobile.dowjones.io'
         catalog = json.loads(self.index_to_soup(index + '/catalogs/v1/wsj/us/catalog.json', raw=True))
-        edit = [itm['key'][3:] for itm in catalog['items'] if itm['type'] == 'ITP'][1:]
+        edit = [itm['key'][10:] for itm in catalog['items'] if itm['type'] == 'ITPNEXTGEN'][1:]
         self.log('**Past Editions available :', ', '.join(edit))
         for itm in catalog['items']:
             if past_edition:
-                if itm['key'] == 'ITP' + past_edition:
+                if itm['key'] == 'ITPNEXTGEN' + past_edition:
                     key = itm['key']
                     manifest = itm['manifest']
                     date = itm['date']
                     break
-            elif itm['type'] == 'ITP':
+            elif itm['type'] == 'ITPNEXTGEN':
                 key = itm['key']
                 manifest = itm['manifest']
                 date = itm['date']
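The catalog entries now carry the 10-character 'ITPNEXTGEN' prefix instead of the 3-character 'ITP', hence the new slice offsets. A sketch with an invented catalog (key, manifest, and date values are hypothetical):

# Illustrative only: catalog items shaped like those the code above iterates.
catalog = {'items': [
    {'type': 'ITPNEXTGEN', 'key': 'ITPNEXTGEN20250708', 'manifest': 'itp_20250708.json', 'date': '2025-07-08'},
    {'type': 'ITPNEXTGEN', 'key': 'ITPNEXTGEN20250707', 'manifest': 'itp_20250707.json', 'date': '2025-07-07'},
]}
# len('ITPNEXTGEN') == 10, so [10:] strips the prefix just as [3:] stripped 'ITP'.
edit = [itm['key'][10:] for itm in catalog['items'] if itm['type'] == 'ITPNEXTGEN'][1:]
print(edit)  # ['20250707'] -- the current edition is dropped, leaving the past editions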
@@ -153,6 +174,8 @@ class WSJ(BasicNewsRecipe):
             for k, v in itm.items():
                 if '-pages_' in k:
                     section = k.split('-pages_')[0].replace('_', ' ')
+                    if 'MAGAZINE' in section:
+                        continue
                     self.log(section)

                     articles = []
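Section names are derived from the per-page manifest keys, and anything resolving to a MAGAZINE section is now skipped. A sketch with invented key names:

# Illustrative only: the key names are hypothetical.
for k in ('US_News-pages_001', 'OPINION-pages_001', 'MAGAZINE-pages_001'):
    if '-pages_' in k:
        section = k.split('-pages_')[0].replace('_', ' ')
        if 'MAGAZINE' in section:
            continue  # magazine pages are no longer included
        print(section)
# US News
# OPINION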