mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Update wsj_mag.recipe
This commit is contained in:
parent
ef995a1ec5
commit
121e69cafe
@ -1,8 +1,6 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
# vim:fileencoding=utf-8
|
# vim:fileencoding=utf-8
|
||||||
import json
|
import json
|
||||||
import time
|
|
||||||
from datetime import datetime, timedelta
|
|
||||||
from itertools import zip_longest
|
from itertools import zip_longest
|
||||||
|
|
||||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
||||||
@ -93,6 +91,7 @@ class WSJ(BasicNewsRecipe):
|
|||||||
byl.insert(0, read)
|
byl.insert(0, read)
|
||||||
url = soup.find('p', attrs={'id':'share-link'})
|
url = soup.find('p', attrs={'id':'share-link'})
|
||||||
if url:
|
if url:
|
||||||
|
url.name = 'div'
|
||||||
url['title'] = self.tag_to_string(url).strip()
|
url['title'] = self.tag_to_string(url).strip()
|
||||||
url.string = ''
|
url.string = ''
|
||||||
panel = soup.find('panel', attrs={'id':'metadata'})
|
panel = soup.find('panel', attrs={'id':'metadata'})
|
||||||
@ -108,6 +107,11 @@ class WSJ(BasicNewsRecipe):
|
|||||||
x.insert_after(BeautifulSoup(y, 'html.parser'))
|
x.insert_after(BeautifulSoup(y, 'html.parser'))
|
||||||
return soup
|
return soup
|
||||||
|
|
||||||
|
def postprocess_html(self, soup, first_fetch):
|
||||||
|
for pan in soup.findAll('panel'):
|
||||||
|
pan.name = 'div'
|
||||||
|
return soup
|
||||||
|
|
||||||
def get_browser(self, *args, **kw):
|
def get_browser(self, *args, **kw):
|
||||||
kw['user_agent'] = 'okhttp/4.10.0'
|
kw['user_agent'] = 'okhttp/4.10.0'
|
||||||
br = BasicNewsRecipe.get_browser(self, *args, **kw)
|
br = BasicNewsRecipe.get_browser(self, *args, **kw)
|
||||||
@ -122,23 +126,16 @@ class WSJ(BasicNewsRecipe):
|
|||||||
index = 'https://bartender.mobile.dowjones.io'
|
index = 'https://bartender.mobile.dowjones.io'
|
||||||
catalog = json.loads(self.index_to_soup(index + '/catalogs/v1/wsj/us/catalog.json', raw=True))
|
catalog = json.loads(self.index_to_soup(index + '/catalogs/v1/wsj/us/catalog.json', raw=True))
|
||||||
for itm in catalog['items']:
|
for itm in catalog['items']:
|
||||||
if itm['type'] == 'MAG':
|
if itm['type'] == 'ITP':
|
||||||
date = itm['date']
|
|
||||||
manifest = itm['manifest']
|
manifest = itm['manifest']
|
||||||
self.title = 'WSJ. Magazine: ' + itm['label']
|
date = itm['date']
|
||||||
|
break
|
||||||
dt = datetime.fromisoformat(date[:-1]) + timedelta(seconds=time.timezone)
|
|
||||||
dt = dt.strftime('%b, %Y')
|
|
||||||
self.log('Downloading ', dt)
|
|
||||||
self.timefmt = ' [' + dt + ']'
|
|
||||||
|
|
||||||
feeds = []
|
feeds = []
|
||||||
|
|
||||||
manif = json.loads(self.index_to_soup(index + manifest, raw=True))
|
manif = json.loads(self.index_to_soup(index + manifest, raw=True))
|
||||||
for itm in manif['items']:
|
for itm in manif['items']:
|
||||||
for k, v in itm.items():
|
for k, v in itm.items():
|
||||||
if 'WSJMAG_PP' in k:
|
|
||||||
self.cover_url = v
|
|
||||||
if '-pages_' in k:
|
if '-pages_' in k:
|
||||||
section = k.split('-pages_')[0].replace('_', ' ')
|
section = k.split('-pages_')[0].replace('_', ' ')
|
||||||
if 'MAGAZINE' not in section:
|
if 'MAGAZINE' not in section:
|
||||||
@ -149,13 +146,6 @@ class WSJ(BasicNewsRecipe):
|
|||||||
|
|
||||||
sec_parse = json.loads(self.index_to_soup(index + v, raw=True))
|
sec_parse = json.loads(self.index_to_soup(index + v, raw=True))
|
||||||
data = sec_parse['articles']
|
data = sec_parse['articles']
|
||||||
for art in data:
|
|
||||||
title = data[art]['headline']
|
|
||||||
desc = data[art]['summary']
|
|
||||||
url = index + manifest.rsplit('/', 1)[0] + '/' + data[art]['filename']
|
|
||||||
self.log(' ', title, '\n\t', desc)
|
|
||||||
articles.append({'title': title, 'description':desc, 'url': url})
|
|
||||||
data = sec_parse['decos']
|
|
||||||
for art in data:
|
for art in data:
|
||||||
title = data[art]['headline']
|
title = data[art]['headline']
|
||||||
desc = data[art]['summary']
|
desc = data[art]['summary']
|
||||||
@ -169,6 +159,6 @@ class WSJ(BasicNewsRecipe):
|
|||||||
return BeautifulSoup(raw).prettify()
|
return BeautifulSoup(raw).prettify()
|
||||||
|
|
||||||
def populate_article_metadata(self, article, soup, first):
|
def populate_article_metadata(self, article, soup, first):
|
||||||
lnk = soup.find('p', attrs={'id':'share-link'})
|
lnk = soup.find('div', attrs={'id':'share-link'})
|
||||||
if lnk:
|
if lnk:
|
||||||
article.url = lnk['title']
|
article.url = lnk['title']
|
||||||
|
Loading…
x
Reference in New Issue
Block a user