mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Update wsj.recipe
This commit is contained in:
parent
a8c874b398
commit
5a9b329ba0
@ -1,6 +1,7 @@
|
|||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
import time
|
import time
|
||||||
|
from itertools import zip_longest
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
|
|
||||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
||||||
@ -12,6 +13,17 @@ from html5_parser import parse
|
|||||||
past_edition = None
|
past_edition = None
|
||||||
|
|
||||||
|
|
||||||
|
def media_bucket(x):
    """Render one media-bucket item as an ``<img>`` plus caption snippet.

    ``x`` is a dict describing a single media item. Items whose ``type``
    is ``'image'`` take their picture URL from ``'manifest-url'``; items
    whose ``type`` is ``'video'`` use ``'thumbnail_url'`` (the video is
    represented by its still thumbnail).

    Returns the HTML string, or ``None`` when the item type is not
    supported or the item carries no usable URL.
    """
    kind = x.get('type', '')
    # The key holding the picture URL depends on the item type.
    if kind == 'image':
        src = x.get('manifest-url')
    elif kind == 'video':
        src = x.get('thumbnail_url')
    else:
        return None
    if src is None:
        # Nothing to display; treat it like an unsupported item instead of
        # raising KeyError.
        return None
    # Caption and credit may be absent or null; fall back to empty text so
    # the concatenation below cannot raise KeyError/TypeError.
    caption = x.get('caption') or ''
    credit = x.get('credit') or ''
    return '<img src="{}"><div class="figc">{}</div>\n'.format(
        src, caption + ' ' + credit
    )
|
||||||
|
|
||||||
class WSJ(BasicNewsRecipe):
|
class WSJ(BasicNewsRecipe):
|
||||||
title = 'The Wall Street Journal'
|
title = 'The Wall Street Journal'
|
||||||
__author__ = 'unkn0wn'
|
__author__ = 'unkn0wn'
|
||||||
@ -36,13 +48,13 @@ class WSJ(BasicNewsRecipe):
|
|||||||
|
|
||||||
remove_tags = [
|
remove_tags = [
|
||||||
dict(name=['nav', 'svg', 'iframe', 'source']),
|
dict(name=['nav', 'svg', 'iframe', 'source']),
|
||||||
dict(name='panel', attrs={'id':'metadata'}),
|
dict(name='panel', attrs={'id':'summary-image'}),
|
||||||
dict(name='panel', attrs={'layout':'inline'}),
|
dict(name='panel', attrs={'layout':'inline'}),
|
||||||
dict(name='panel', attrs={'embed':'inner-article-ad'}),
|
dict(name='panel', attrs={'embed':'inner-article-ad'}),
|
||||||
dict(name='span', attrs={'embed':'ticker'}),
|
dict(name='span', attrs={'embed':'ticker'}),
|
||||||
classes('lamrelated-articles-inset-panel'),
|
classes('lamrelated-articles-inset-panel'),
|
||||||
dict(name='p', attrs={'id':[
|
dict(name='p', attrs={'id':[
|
||||||
'keywords', 'orig-pubdate-number', 'type', 'is-custom-flashline', 'grouphed', 'author-ids',
|
'keywords', 'orig-pubdate-number', 'type', 'is-custom-flashline', 'grouphed', 'author-ids', 'article-manifest',
|
||||||
'body-extract', 'category', 'sub-category', 'socialhed', 'summary', 'deckline', 'article-flashline'
|
'body-extract', 'category', 'sub-category', 'socialhed', 'summary', 'deckline', 'article-flashline'
|
||||||
]}),
|
]}),
|
||||||
dict(attrs={'data-inset_type':'dynamic'}),
|
dict(attrs={'data-inset_type':'dynamic'}),
|
||||||
@ -76,8 +88,6 @@ class WSJ(BasicNewsRecipe):
|
|||||||
if url:
|
if url:
|
||||||
url['title'] = self.tag_to_string(url).strip()
|
url['title'] = self.tag_to_string(url).strip()
|
||||||
url.string = ''
|
url.string = ''
|
||||||
for img in soup.findAll('img', attrs={'location':True}):
|
|
||||||
img['src'] = img['location']
|
|
||||||
for figc in soup.findAll('figcaption'):
|
for figc in soup.findAll('figcaption'):
|
||||||
figc['class'] = 'figc'
|
figc['class'] = 'figc'
|
||||||
col = soup.find('div', text = re.compile('What to Read Next'))
|
col = soup.find('div', text = re.compile('What to Read Next'))
|
||||||
@ -93,6 +103,17 @@ class WSJ(BasicNewsRecipe):
|
|||||||
for img in soup.findAll('img', src=True):
|
for img in soup.findAll('img', src=True):
|
||||||
if img['src'].endswith('/OR'):
|
if img['src'].endswith('/OR'):
|
||||||
img['src'] = img['src'][:-3]
|
img['src'] = img['src'][:-3]
|
||||||
|
panel = soup.find('panel', attrs={'id':'metadata'})
|
||||||
|
if panel:
|
||||||
|
buck = panel.find('p', attrs={'id':'media-bucket'})
|
||||||
|
if buck:
|
||||||
|
data = json.loads(buck.string)
|
||||||
|
buck.extract()
|
||||||
|
i_lst = [media_bucket(x) for x in data['items']]
|
||||||
|
m_itm = soup.findAll('panel', attrs={'class':'media-item'})
|
||||||
|
if i_lst and m_itm:
|
||||||
|
for x, y in list(zip_longest(m_itm[::-1], i_lst[::-1])):
|
||||||
|
x.insert_after(BeautifulSoup(y, 'html.parser'))
|
||||||
return soup
|
return soup
|
||||||
|
|
||||||
if not past_edition:
|
if not past_edition:
|
||||||
@ -126,16 +147,16 @@ class WSJ(BasicNewsRecipe):
|
|||||||
def parse_index(self):
|
def parse_index(self):
|
||||||
index = 'https://bartender.mobile.dowjones.io'
|
index = 'https://bartender.mobile.dowjones.io'
|
||||||
catalog = json.loads(self.index_to_soup(index + '/catalogs/v1/wsj/us/catalog.json', raw=True))
|
catalog = json.loads(self.index_to_soup(index + '/catalogs/v1/wsj/us/catalog.json', raw=True))
|
||||||
edit = [itm['key'][3:] for itm in catalog['items'] if itm['type'] == 'ITP'][1:]
|
edit = [itm['key'][10:] for itm in catalog['items'] if itm['type'] == 'ITPNEXTGEN'][1:]
|
||||||
self.log('**Past Editions available :', ', '.join(edit))
|
self.log('**Past Editions available :', ', '.join(edit))
|
||||||
for itm in catalog['items']:
|
for itm in catalog['items']:
|
||||||
if past_edition:
|
if past_edition:
|
||||||
if itm['key'] == 'ITP' + past_edition:
|
if itm['key'] == 'ITPNEXTGEN' + past_edition:
|
||||||
key = itm['key']
|
key = itm['key']
|
||||||
manifest = itm['manifest']
|
manifest = itm['manifest']
|
||||||
date = itm['date']
|
date = itm['date']
|
||||||
break
|
break
|
||||||
elif itm['type'] == 'ITP':
|
elif itm['type'] == 'ITPNEXTGEN':
|
||||||
key = itm['key']
|
key = itm['key']
|
||||||
manifest = itm['manifest']
|
manifest = itm['manifest']
|
||||||
date = itm['date']
|
date = itm['date']
|
||||||
@ -153,6 +174,8 @@ class WSJ(BasicNewsRecipe):
|
|||||||
for k, v in itm.items():
|
for k, v in itm.items():
|
||||||
if '-pages_' in k:
|
if '-pages_' in k:
|
||||||
section = k.split('-pages_')[0].replace('_', ' ')
|
section = k.split('-pages_')[0].replace('_', ' ')
|
||||||
|
if 'MAGAZINE' in section:
|
||||||
|
continue
|
||||||
self.log(section)
|
self.log(section)
|
||||||
|
|
||||||
articles = []
|
articles = []
|
||||||
|
Loading…
x
Reference in New Issue
Block a user