From aa1482d782afb42bce3fff581d10c3e167326433 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 13 Sep 2020 21:39:44 +0530 Subject: [PATCH] Update ProPublica Fixes #1895457 [Pro Publica news only creates Table of Contents, no articles.](https://bugs.launchpad.net/calibre/+bug/1895457) --- recipes/propublica.recipe | 70 +++++++++++++++------------------------ 1 file changed, 26 insertions(+), 44 deletions(-) diff --git a/recipes/propublica.recipe b/recipes/propublica.recipe index 19d63d4215..f19e4928b2 100644 --- a/recipes/propublica.recipe +++ b/recipes/propublica.recipe @@ -2,6 +2,12 @@ from calibre.web.feeds.news import BasicNewsRecipe +def classes(classes): + q = frozenset(classes.split(' ')) + return dict(attrs={ + 'class': lambda x: x and frozenset(x.split()).intersection(q)}) + + class ProPublicaRecipe(BasicNewsRecipe): __license__ = 'GPL v3' __author__ = 'kwetal' @@ -21,49 +27,25 @@ class ProPublicaRecipe(BasicNewsRecipe): no_stylesheets = True remove_javascript = True - keep_only_tags = [] - keep_only_tags.append(dict(name='div', attrs={'class': 'article'})) + keep_only_tags = [ + classes('title-wrapper content-wrapper article-header lead-art article-body') + ] + remove_tags = [ + classes('email-signup story-tools newsletter topics-list') + ] + remove_tags_after = classes('article-body') - remove_tags = [] - remove_tags.append(dict(name='div', attrs={'id': 'rollups'})) - remove_tags.append(dict(name='div', attrs={'class': 'follow_info'})) - remove_tags.append(dict(name='ul', attrs={'class': 'long-tools-top'})) - remove_tags.append(dict(name='ul', attrs={'id': 'share-box'})) - remove_tags.append(dict(name='div', attrs={'class': 'tags'})) - remove_tags.append(dict(name='ul', attrs={'class': 'long-tools'})) - remove_tags.append(dict(name='ul', attrs={'id': 'share-box2'})) - remove_tags.append(dict(name='p', attrs={'id': 'original-url'})) + feeds = [ + (u'Top Stories', u'http://feeds.propublica.org/propublica/main'), + ] - feeds = [] - feeds.append( - (u'Top Stories', u'http://feeds.propublica.org/propublica/main')) - feeds.append( - (u'Stimulus', u'http://feeds.propublica.org/propublica/watchdog/stimulus')) - feeds.append( - (u'Bailout', u'http://feeds.propublica.org/propublica/watchdog/bailout')) - feeds.append( - (u'Business', u'http://feeds.propublica.org/propublica/business-money')) - feeds.append( - (u'Justice', u'http://feeds.propublica.org/propublica/justice-law')) - feeds.append((u'Energy & Environment', - u'http://feeds.propublica.org/propublica/energy-environment')) - feeds.append((u'Government & Politics', - u'http://feeds.propublica.org/propublica/government-politics')) - feeds.append( - (u'Health & Science', u'http://feeds.propublica.org/propublica/health-science')) - feeds.append((u'Media & Technology', - u'http://feeds.propublica.org/propublica/media-technology')) - feeds.append((u'National Security', - u'http://feeds.propublica.org/propublica/national-security')) - - conversion_options = {'comments': description, 'tags': category, 'language': 'en', - 'publisher': publisher} - - extra_css = ''' - body{font-family:verdana,arial,helvetica,geneva,sans-serif;} - img {float: left; margin-right: 0.5em;} - h1 {text-align: left;} - a, a[href] {text-decoration: none; color: blue;} - div.cat {font-size: x-small; color: #666666; margin-bottom: 0.1em;} - div.info {font-size: small; color: #696969;} - ''' + def preprocess_html(self, soup): + for img in soup.findAll('img', **classes('lazyload')): + source = img.findParent('picture') + if source is not None: + source = source.find('source') + if source is not None: + img['src'] = source['data-srcset'] + for img in soup.findAll('img', srcset=True): + img['src'] = img['srcset'].split()[0] + return soup