commit cdc5810486
Author: Kovid Goyal
Date:   2024-06-12 16:41:50 +05:30


@@ -1,12 +1,10 @@
 import json
-import re
 import time
 from itertools import zip_longest
 from datetime import datetime, timedelta

 from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from calibre.web.feeds.news import BasicNewsRecipe, classes
-from html5_parser import parse

 # Past 6 editions are available for download.
 # For available past editions see log and set date to, for example, '20240513'.
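The two comment lines kept above are the recipe's only pointer to its archive feature: WSJ exposes roughly the past six editions, and the recipe log lists which dates are available. A hypothetical illustration of how a user would pin a date follows; the variable that actually receives the value is declared outside this hunk, so the name below is an assumption, not this recipe's confirmed API.

    # Hypothetical sketch: pin the download to a past edition (YYYYMMDD).
    # 'past_edition' is an assumed name; check the full recipe source for
    # the real variable the comment above refers to.
    past_edition = '20240513'  # one of the ~6 dates reported in the log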
@@ -47,7 +45,6 @@ class WSJ(BasicNewsRecipe):
     '''

     remove_tags = [
-        dict(name=['nav', 'svg', 'iframe', 'source']),
         dict(name='panel', attrs={'id':'summary-image'}),
         dict(name='panel', attrs={'layout':'inline'}),
         dict(name='panel', attrs={'embed':'inner-article-ad'}),
@@ -57,16 +54,11 @@ class WSJ(BasicNewsRecipe):
             'keywords', 'orig-pubdate-number', 'type', 'is-custom-flashline', 'grouphed', 'author-ids', 'article-manifest',
             'body-extract', 'category', 'sub-category', 'socialhed', 'summary', 'deckline', 'article-flashline'
         ]}),
-        dict(attrs={'data-inset_type':'dynamic'}),
-        dict(attrs={'data-block':'dynamic-inset'})
     ]

     remove_tags_before = [
         dict(name='p', attrs={'id':'orig-pubdate-string'})
     ]
-    remove_tags_after = [
-        dict(name='article')
-    ]

     def preprocess_html(self, soup):
         jpml = soup.find('jpml')
@@ -88,21 +80,6 @@ class WSJ(BasicNewsRecipe):
             if url:
                 url['title'] = self.tag_to_string(url).strip()
                 url.string = ''
-        for figc in soup.findAll('figcaption'):
-            figc['class'] = 'figc'
-        col = soup.find('div', text = re.compile('What to Read Next'))
-        if col:
-            div = col.findParent('div')
-            if div:
-                div.extract()
-        time = soup.find('time')
-        if time:
-            p = time.findParent('div')
-            if p:
-                p.name = 'p'
-        for img in soup.findAll('img', src=True):
-            if img['src'].endswith('/OR'):
-                img['src'] = img['src'][:-3]
         panel = soup.find('panel', attrs={'id':'metadata'})
         if panel:
             buck = panel.find('p', attrs={'id':'media-bucket'})
@@ -175,7 +152,9 @@ class WSJ(BasicNewsRecipe):
             if '-pages_' in k:
                 section = k.split('-pages_')[0].replace('_', ' ')
                 if 'MAGAZINE' in section:
-                    continue
+                    if not datetime.now().strftime("%d") == 1:
+                        continue
+                    self.log('Loading Magazine section')
                 self.log(section)

                 articles = []
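A side note on the new gate: datetime.now().strftime("%d") returns a zero-padded string such as '01' or '12', so comparing it with the integer 1 is never true; the negated comparison is therefore always true and the continue always fires, meaning MAGAZINE sections remain skipped on every day of the month. If the intent is to load the magazine only on the first of the month, the comparison needs a string (or datetime.now().day). A minimal runnable check:

    from datetime import datetime

    # strftime('%d') yields a zero-padded string, so '== 1' (an int) is
    # always False and 'not ... == 1' is always True.
    d = datetime(2024, 6, 1)          # the 1st of a month
    print(d.strftime('%d') == 1)      # False, even on the 1st
    print(d.strftime('%d') == '01')   # True
    print(d.day == 1)                 # True, and unambiguous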
@@ -185,9 +164,6 @@ class WSJ(BasicNewsRecipe):
                 for art in data:
                     title = data[art]['headline']
                     desc = data[art]['summary']
-                    if 'articleWebViewLink' in data[art]:
-                        url = data[art]['articleWebViewLink']
-                    else:
-                        url = index + '/contents/v1/wsj/us/' + key + '/' + data[art]['filename']
+                    url = index + '/contents/v1/wsj/us/' + key + '/' + data[art]['filename']
                     self.log(' ', title, '\n\t', desc)
                     articles.append({'title': title, 'description':desc, 'url': url})
@@ -195,28 +171,9 @@ class WSJ(BasicNewsRecipe):
         return feeds

     def preprocess_raw_html(self, raw, url):
-        if '/webview/' not in url:
-            root = parse(raw)
-            for x in root.xpath('//image'):
-                x.tag = 'img'
-            return BeautifulSoup(raw).prettify()
-        else:
-            soup = BeautifulSoup(raw)
-            url = soup.find('meta', attrs={'property':'og:url'})
-            if url:
-                h1 = soup.find('h1')
-                if h1:
-                    h1['title'] = url['content']
-                h2 = soup.find('h2')
-                if h2:
-                    h2['id'] = 'subhed'
-                    h2.name = 'p'
-            return soup.prettify()
+        return BeautifulSoup(raw).prettify()

     def populate_article_metadata(self, article, soup, first):
         lnk = soup.find('p', attrs={'id':'share-link'})
         if lnk:
             article.url = lnk['title']
-        art = soup.find('h1', attrs={'title':True})
-        if art:
-            article.url = art['title']
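With the webview branch gone, the canonical article URL flows through a single path: preprocess_html copies the share link's text into its title attribute (see the context lines at the top of the fourth hunk), and populate_article_metadata reads it back from the p element with id 'share-link'. A minimal sketch of that round trip, assuming the markup shape the recipe targets; it runs inside a calibre environment because of the BeautifulSoup wrapper import:

    from calibre.ebooks.BeautifulSoup import BeautifulSoup

    # Assumed markup shape: the public URL sits in the 'title' attribute
    # of <p id="share-link">, as populate_article_metadata expects.
    soup = BeautifulSoup('<p id="share-link" title="https://www.wsj.com/articles/example">share</p>')
    lnk = soup.find('p', attrs={'id': 'share-link'})
    if lnk:
        print(lnk['title'])  # -> https://www.wsj.com/articles/example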