From be6a1e9921ef2d1ff19187e02d0dc6f0d4b61186 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sun, 5 Feb 2017 13:45:47 +0530
Subject: [PATCH] Update Bangkok Post

Fixes #1661925 [No news content when fetching news from Bangkok Post & The Nation](https://bugs.launchpad.net/calibre/+bug/1661925)
---
Reviewer notes (this section is ignored by `git am` and is not part of the
commit message):

* Adds a module-level helper `classes(names)`. It returns a matcher dict whose
  'class' predicate is truthy when an element's class attribute shares at
  least one whitespace-separated token with the space-separated names given.
* `keep_only_tags` now keeps the element with itemprop='headline' plus any
  element carrying the 'articleContents' class, instead of `div.entry`.
* The whole `remove_tags` list (article-features, socialBookmark, main-sns,
  iframe) is dropped.
* New `print_version()` returns None for URLs containing '.com/vdo/' and the
  URL unchanged otherwise. Presumably this makes calibre skip video pages --
  TODO confirm against the BasicNewsRecipe documentation.
* NOTE(review): the unchanged 'Tech' feed URL ends in `tect.xml`; this may be
  a typo for `tech.xml`, but it is a context line and is left untouched here.

 recipes/bangkokpost.recipe | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/recipes/bangkokpost.recipe b/recipes/bangkokpost.recipe
index 8a17cb6d37..a276a55496 100644
--- a/recipes/bangkokpost.recipe
+++ b/recipes/bangkokpost.recipe
@@ -1,6 +1,12 @@
 from calibre.web.feeds.news import BasicNewsRecipe
 
 
+def classes(classes):
+    q = frozenset(classes.split(' '))
+    return dict(attrs={
+        'class': lambda x: x and frozenset(x.split()).intersection(q)})
+
+
 class BangkokPostRecipe(BasicNewsRecipe):
     __license__ = 'GPL v3'
     __author__ = 'kwetal'
@@ -40,15 +46,12 @@ class BangkokPostRecipe(BasicNewsRecipe):
     feeds.append((u'Life', u'http://www.bangkokpost.com/rss/data/life.xml'))
     feeds.append((u'Tech', u'http://www.bangkokpost.com/rss/data/tect.xml'))
 
-    keep_only_tags = []
-    keep_only_tags.append(dict(name='div', attrs={'class': 'entry'}))
+    keep_only_tags = [
+        dict(itemprop='headline'),
+        classes('articleContents'),
+    ]
 
-    remove_tags = []
-    remove_tags.append(dict(name='div', attrs={'class': 'article-features'}))
-    remove_tags.append(dict(name='div', attrs={'class': 'socialBookmark'}))
-    remove_tags.append(dict(name='div', attrs={'id': 'main-sns'}))
-    # Their YouTube movies are displayed in an iframe, if you want those you will have to parse the articles by hand.
-    # Setting self.recursion to 1, which might resolve this, makes calibre
-    # downloading a lot of PDF files, which will cause a very, very very, long
-    # download time
-    remove_tags.append(dict(name='iframe'))
+    def print_version(self, url):
+        if '.com/vdo/' in url:
+            url = None
+        return url