Update wsj.recipe

This commit is contained in:
unkn0w7n 2024-06-12 10:44:58 +05:30
parent 2dedc79fcd
commit 9f9cafaf1c

View File

@@ -1,12 +1,10 @@
import json
import re
import time
from itertools import zip_longest
from datetime import datetime, timedelta
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe, classes
from html5_parser import parse

# Past 6 editions are available for download.
# For available past editions see log and set date to, for example, '20240513'.
@@ -47,7 +45,6 @@ class WSJ(BasicNewsRecipe):
    '''

    remove_tags = [
        dict(name=['nav', 'svg', 'iframe', 'source']),
        dict(name='panel', attrs={'id':'summary-image'}),
        dict(name='panel', attrs={'layout':'inline'}),
        dict(name='panel', attrs={'embed':'inner-article-ad'}),
@@ -57,16 +54,11 @@ class WSJ(BasicNewsRecipe):
            'keywords', 'orig-pubdate-number', 'type', 'is-custom-flashline', 'grouphed', 'author-ids', 'article-manifest',
            'body-extract', 'category', 'sub-category', 'socialhed', 'summary', 'deckline', 'article-flashline'
        ]}),
        dict(attrs={'data-inset_type':'dynamic'}),
        dict(attrs={'data-block':'dynamic-inset'})
    ]

    remove_tags_before = [
        dict(name='p', attrs={'id':'orig-pubdate-string'})
    ]
    remove_tags_after = [
        dict(name='article')
    ]

    def preprocess_html(self, soup):
        jpml = soup.find('jpml')
@@ -88,21 +80,6 @@ class WSJ(BasicNewsRecipe):
            if url:
                url['title'] = self.tag_to_string(url).strip()
                url.string = ''
        for figc in soup.findAll('figcaption'):
            figc['class'] = 'figc'
        col = soup.find('div', text = re.compile('What to Read Next'))
        if col:
            div = col.findParent('div')
            if div:
                div.extract()
        time = soup.find('time')
        if time:
            p = time.findParent('div')
            if p:
                p.name = 'p'
        for img in soup.findAll('img', src=True):
            if img['src'].endswith('/OR'):
                img['src'] = img['src'][:-3]
        panel = soup.find('panel', attrs={'id':'metadata'})
        if panel:
            buck = panel.find('p', attrs={'id':'media-bucket'})
@@ -185,9 +162,6 @@ class WSJ(BasicNewsRecipe):
            for art in data:
                title = data[art]['headline']
                desc = data[art]['summary']
                if 'articleWebViewLink' in data[art]:
                    url = data[art]['articleWebViewLink']
                else:
                    url = index + '/contents/v1/wsj/us/' + key + '/' + data[art]['filename']
                self.log(' ', title, '\n\t', desc)
                articles.append({'title': title, 'description':desc, 'url': url})
        return feeds
@@ -195,28 +169,9 @@ class WSJ(BasicNewsRecipe):
    def preprocess_raw_html(self, raw, url):
        if '/webview/' not in url:
            root = parse(raw)
            for x in root.xpath('//image'):
                x.tag = 'img'
            return BeautifulSoup(raw).prettify()
        else:
            soup = BeautifulSoup(raw)
            url = soup.find('meta', attrs={'property':'og:url'})
            if url:
                h1 = soup.find('h1')
                if h1:
                    h1['title'] = url['content']
                h2 = soup.find('h2')
                if h2:
                    h2['id'] = 'subhed'
                    h2.name = 'p'
            return soup.prettify()

    def populate_article_metadata(self, article, soup, first):
        lnk = soup.find('p', attrs={'id':'share-link'})
        if lnk:
            article.url = lnk['title']
        art = soup.find('h1', attrs={'title':True})
        if art:
            article.url = art['title']