From 0c678a1dc4f099202c18a123d2ba44f3bed21097 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sun, 27 Dec 2020 13:38:22 +0530
Subject: [PATCH] Update Miami Herald

Fixes #1906178 [[Enhancement] Fetch news: 30+ McClatchy newspapers](https://bugs.launchpad.net/calibre/+bug/1906178)
---
 recipes/miami_herald.recipe | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/recipes/miami_herald.recipe b/recipes/miami_herald.recipe
index f8242e3d77..c5d0328dff 100644
--- a/recipes/miami_herald.recipe
+++ b/recipes/miami_herald.recipe
@@ -8,6 +8,12 @@ miamiherald.com
 from calibre.web.feeds.news import BasicNewsRecipe
 
 
+def classes(classes):
+    q = frozenset(classes.split(' '))
+    return dict(attrs={
+        'class': lambda x: x and frozenset(x.split()).intersection(q)})
+
+
 class TheMiamiHerald(BasicNewsRecipe):
     title = 'The Miami Herald'
     __author__ = 'Kovid Goyal'
@@ -25,9 +31,10 @@ class TheMiamiHerald(BasicNewsRecipe):
     }
 
     keep_only_tags = [
+        classes('story-body')
     ]
-
     remove_tags = [
+        classes('social-network-macro social-media')
     ]
 
     feeds = [
@@ -47,3 +54,19 @@ class TheMiamiHerald(BasicNewsRecipe):
         (u'Environment', u'https://www.miamiherald.com/news/local/environment/?widgetName=rssfeed&widgetContentId=712015&getXmlFeed=true'),
         (u'Sports', u'https://www.miamiherald.com/sports/?widgetName=rssfeed&widgetContentId=712015&getXmlFeed=true'),
     ]
+
+    def get_browser(self, *a, **kw):
+        # McClatchy servers don't like the user-agent header, they hang forever
+        # when it is present
+        br = BasicNewsRecipe.get_browser(self, *a, **kw)
+        br.addheaders = [x for x in br.addheaders if x[0].lower() != 'user-agent']
+        return br
+
+    def preprocess_html(self, soup):
+        for picture in soup.findAll('picture'):
+            img = picture.find('img')
+            for i, source in enumerate(picture.findAll('source')):
+                if i == 0:
+                    img['src'] = source['srcset'].split()[0]
+                source.extract()
+        return soup