From 6b69b78eadce57463eab19d8b8aa2079abcd75a2 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 29 Sep 2018 10:32:33 +0530 Subject: [PATCH] Update The New Zealand Herald --- recipes/nzherald.recipe | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/recipes/nzherald.recipe b/recipes/nzherald.recipe index f416536cbe..631089309e 100644 --- a/recipes/nzherald.recipe +++ b/recipes/nzherald.recipe @@ -1,5 +1,10 @@ from calibre.web.feeds.recipes import BasicNewsRecipe -import re + + +def classes(classes): + q = frozenset(classes.split(' ')) + return dict(attrs={ + 'class': lambda x: x and frozenset(x.split()).intersection(q)}) class NewZealandHerald(BasicNewsRecipe): @@ -11,6 +16,15 @@ class NewZealandHerald(BasicNewsRecipe): language = 'en_NZ' oldest_article = 2.5 + keep_only_tags = [ + classes('article-header'), + dict(id='article-content'), + ] + + remove_tags = [ + classes('ad-container pb-f-video-video-player pb-f-article-related-articles social-shares') + ] + feeds = [ ('Business', 'http://rss.nzherald.co.nz/rss/xml/nzhrsscid_000000003.xml'), @@ -36,8 +50,7 @@ class NewZealandHerald(BasicNewsRecipe): 'http://rss.nzherald.co.nz/rss/xml/nzhrsscid_000000008.xml'), ] - def print_version(self, url): - m = re.search(r'objectid=(\d+)', url) - if m is None: - return url - return 'http://www.nzherald.co.nz/news/print.cfm?pnum=1&objectid=' + m.group(1) + def preprocess_html(self, soup, *a): + for img in soup.findAll('img', attrs={'data-srcset': True}): + img['src'] = img['data-srcset'].split()[0] + return soup