Update ProPublica

Fixes #1895457 [Pro Publica news only creates Table of Contents, no articles.](https://bugs.launchpad.net/calibre/+bug/1895457)
This commit is contained in:
Kovid Goyal 2020-09-13 21:39:44 +05:30
parent 4b58b7e550
commit aa1482d782
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@@ -2,6 +2,12 @@
from calibre.web.feeds.news import BasicNewsRecipe
def classes(classes):
    """Build a keyword-argument dict that matches tags by CSS class.

    ``classes`` is a whitespace-separated string of class names. The
    returned dict has a single ``attrs`` key mapping ``'class'`` to a
    predicate. The predicate is truthy when the value it receives shares at
    least one class name with ``classes``.
    """
    # Tokenise with split() (any run of whitespace), the same way the
    # predicate tokenises the candidate value, so tabs, newlines or doubled
    # spaces in the spec cannot produce names that never match.
    wanted = frozenset(classes.split())

    def matches(value):
        # A missing or empty class value is returned as-is (falsy); otherwise
        # the result is the frozenset of shared names, truthy only when
        # non-empty.
        return value and frozenset(value.split()).intersection(wanted)

    return dict(attrs={'class': matches})
class ProPublicaRecipe(BasicNewsRecipe):
__license__ = 'GPL v3'
__author__ = 'kwetal'
@@ -21,49 +27,25 @@ class ProPublicaRecipe(BasicNewsRecipe):
# NOTE(review): in this listing keep_only_tags, remove_tags and feeds are each
# bound more than once; it looks like removed and added diff lines shown
# together — confirm which assignments belong to the final recipe.
# Presumably BasicNewsRecipe options that drop stylesheets and scripts — confirm.
no_stylesheets = True
remove_javascript = True
# Start from an empty list and add a selector for <div class="article">.
keep_only_tags = []
keep_only_tags.append(dict(name='div', attrs={'class': 'article'}))
# Rebinds keep_only_tags to a single classes() matcher for the listed class
# names; the list built just above is discarded by this assignment.
keep_only_tags = [
classes('title-wrapper content-wrapper article-header lead-art article-body')
]
# remove_tags as a single classes() matcher for the listed class names.
remove_tags = [
classes('email-signup story-tools newsletter topics-list')
]
# Matcher for the 'article-body' class; presumably marks where trailing
# content is cut off — confirm against BasicNewsRecipe.
remove_tags_after = classes('article-body')
# Rebinds remove_tags to an empty list (dropping the classes() matcher
# assigned above) and fills it with tag-name/attribute selectors.
remove_tags = []
remove_tags.append(dict(name='div', attrs={'id': 'rollups'}))
remove_tags.append(dict(name='div', attrs={'class': 'follow_info'}))
remove_tags.append(dict(name='ul', attrs={'class': 'long-tools-top'}))
remove_tags.append(dict(name='ul', attrs={'id': 'share-box'}))
remove_tags.append(dict(name='div', attrs={'class': 'tags'}))
remove_tags.append(dict(name='ul', attrs={'class': 'long-tools'}))
remove_tags.append(dict(name='ul', attrs={'id': 'share-box2'}))
remove_tags.append(dict(name='p', attrs={'id': 'original-url'}))
# feeds as a list holding one (title, URL) pair.
feeds = [
(u'Top Stories', u'http://feeds.propublica.org/propublica/main'),
]
# Rebinds feeds to an empty list and appends one (title, URL) pair per section.
feeds = []
feeds.append(
(u'Top Stories', u'http://feeds.propublica.org/propublica/main'))
feeds.append(
(u'Stimulus', u'http://feeds.propublica.org/propublica/watchdog/stimulus'))
feeds.append(
(u'Bailout', u'http://feeds.propublica.org/propublica/watchdog/bailout'))
feeds.append(
(u'Business', u'http://feeds.propublica.org/propublica/business-money'))
feeds.append(
(u'Justice', u'http://feeds.propublica.org/propublica/justice-law'))
feeds.append((u'Energy & Environment',
u'http://feeds.propublica.org/propublica/energy-environment'))
feeds.append((u'Government & Politics',
u'http://feeds.propublica.org/propublica/government-politics'))
feeds.append(
(u'Health & Science', u'http://feeds.propublica.org/propublica/health-science'))
feeds.append((u'Media & Technology',
u'http://feeds.propublica.org/propublica/media-technology'))
feeds.append((u'National Security',
u'http://feeds.propublica.org/propublica/national-security'))
# description, category and publisher are not defined in the visible lines;
# presumably attributes set earlier in the recipe — confirm.
conversion_options = {'comments': description, 'tags': category, 'language': 'en',
'publisher': publisher}
# CSS text for body, img, h1, links and the div.cat / div.info classes;
# presumably applied by calibre to the generated articles — confirm.
extra_css = '''
body{font-family:verdana,arial,helvetica,geneva,sans-serif;}
img {float: left; margin-right: 0.5em;}
h1 {text-align: left;}
a, a[href] {text-decoration: none; color: blue;}
div.cat {font-size: x-small; color: #666666; margin-bottom: 0.1em;}
div.info {font-size: small; color: #696969;}
'''
def preprocess_html(self, soup):
    """Give images a usable ``src`` attribute.

    Two passes over ``soup``:

    1. ``<img>`` tags matched by ``classes('lazyload')`` that have a
       ``<picture>`` ancestor take their ``src`` from the ``data-srcset``
       attribute of the first ``<source>`` found in that picture.
    2. ``<img>`` tags carrying a ``srcset`` attribute take its first
       whitespace-separated token as ``src``.

    Returns the same ``soup`` object that was passed in.
    """
    for img in soup.findAll('img', **classes('lazyload')):
        picture = img.findParent('picture')
        if picture is None:
            continue
        source = picture.find('source')
        if source is None:
            continue
        # source['data-srcset'] would raise KeyError when the attribute is
        # absent; leave such images untouched instead of failing.
        lazy_src = source.get('data-srcset')
        if lazy_src is not None:
            # The attribute value is copied verbatim.
            img['src'] = lazy_src

    for img in soup.findAll('img', srcset=True):
        candidates = img['srcset'].split()
        # An empty or whitespace-only srcset has no first token; skip it
        # rather than fail with IndexError.
        if candidates:
            img['src'] = candidates[0]
    return soup