mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Update wsj.recipe
This commit is contained in:
parent
2dedc79fcd
commit
9f9cafaf1c
@ -1,12 +1,10 @@
|
||||
import json
|
||||
import re
|
||||
import time
|
||||
from itertools import zip_longest
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
||||
from calibre.web.feeds.news import BasicNewsRecipe, classes
|
||||
from html5_parser import parse
|
||||
|
||||
# Past 6 editions are available for download.
|
||||
# For available past editions see log and set date to, for example, '20240513'.
|
||||
@ -47,7 +45,6 @@ class WSJ(BasicNewsRecipe):
|
||||
'''
|
||||
|
||||
remove_tags = [
|
||||
dict(name=['nav', 'svg', 'iframe', 'source']),
|
||||
dict(name='panel', attrs={'id':'summary-image'}),
|
||||
dict(name='panel', attrs={'layout':'inline'}),
|
||||
dict(name='panel', attrs={'embed':'inner-article-ad'}),
|
||||
@ -57,16 +54,11 @@ class WSJ(BasicNewsRecipe):
|
||||
'keywords', 'orig-pubdate-number', 'type', 'is-custom-flashline', 'grouphed', 'author-ids', 'article-manifest',
|
||||
'body-extract', 'category', 'sub-category', 'socialhed', 'summary', 'deckline', 'article-flashline'
|
||||
]}),
|
||||
dict(attrs={'data-inset_type':'dynamic'}),
|
||||
dict(attrs={'data-block':'dynamic-inset'})
|
||||
]
|
||||
|
||||
remove_tags_before = [
|
||||
dict(name='p', attrs={'id':'orig-pubdate-string'})
|
||||
]
|
||||
remove_tags_after = [
|
||||
dict(name='article')
|
||||
]
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
jpml = soup.find('jpml')
|
||||
@ -88,21 +80,6 @@ class WSJ(BasicNewsRecipe):
|
||||
if url:
|
||||
url['title'] = self.tag_to_string(url).strip()
|
||||
url.string = ''
|
||||
for figc in soup.findAll('figcaption'):
|
||||
figc['class'] = 'figc'
|
||||
col = soup.find('div', text = re.compile('What to Read Next'))
|
||||
if col:
|
||||
div = col.findParent('div')
|
||||
if div:
|
||||
div.extract()
|
||||
time = soup.find('time')
|
||||
if time:
|
||||
p = time.findParent('div')
|
||||
if p:
|
||||
p.name = 'p'
|
||||
for img in soup.findAll('img', src=True):
|
||||
if img['src'].endswith('/OR'):
|
||||
img['src'] = img['src'][:-3]
|
||||
panel = soup.find('panel', attrs={'id':'metadata'})
|
||||
if panel:
|
||||
buck = panel.find('p', attrs={'id':'media-bucket'})
|
||||
@ -185,38 +162,16 @@ class WSJ(BasicNewsRecipe):
|
||||
for art in data:
|
||||
title = data[art]['headline']
|
||||
desc = data[art]['summary']
|
||||
if 'articleWebViewLink' in data[art]:
|
||||
url = data[art]['articleWebViewLink']
|
||||
else:
|
||||
url = index + '/contents/v1/wsj/us/' + key + '/' + data[art]['filename']
|
||||
url = index + '/contents/v1/wsj/us/' + key + '/' + data[art]['filename']
|
||||
self.log(' ', title, '\n\t', desc)
|
||||
articles.append({'title': title, 'description':desc, 'url': url})
|
||||
feeds.append((section, articles))
|
||||
return feeds
|
||||
|
||||
def preprocess_raw_html(self, raw, url):
|
||||
if '/webview/' not in url:
|
||||
root = parse(raw)
|
||||
for x in root.xpath('//image'):
|
||||
x.tag = 'img'
|
||||
return BeautifulSoup(raw).prettify()
|
||||
else:
|
||||
soup = BeautifulSoup(raw)
|
||||
url = soup.find('meta', attrs={'property':'og:url'})
|
||||
if url:
|
||||
h1 = soup.find('h1')
|
||||
if h1:
|
||||
h1['title'] = url['content']
|
||||
h2 = soup.find('h2')
|
||||
if h2:
|
||||
h2['id'] = 'subhed'
|
||||
h2.name = 'p'
|
||||
return soup.prettify()
|
||||
return BeautifulSoup(raw).prettify()
|
||||
|
||||
def populate_article_metadata(self, article, soup, first):
|
||||
lnk = soup.find('p', attrs={'id':'share-link'})
|
||||
if lnk:
|
||||
article.url = lnk['title']
|
||||
art = soup.find('h1', attrs={'title':True})
|
||||
if art:
|
||||
article.url = art['title']
|
||||
|
Loading…
x
Reference in New Issue
Block a user