diff --git a/recipes/forbes.recipe b/recipes/forbes.recipe index a6d7192fce..5ff708618f 100644 --- a/recipes/forbes.recipe +++ b/recipes/forbes.recipe @@ -1,7 +1,4 @@ from __future__ import absolute_import, division, print_function, unicode_literals -import html5lib -import json -import re from calibre.web.feeds.news import BasicNewsRecipe @@ -33,47 +30,37 @@ class Forbes(BasicNewsRecipe): } ''' feeds = [ - (u'Latest', u'http://www.forbes.com/news/index.xml'), - (u'Most Popular', u'http://www.forbes.com/feeds/popstories.xml'), - (u'Technology', u'http://www.forbes.com/technology/index.xml'), - (u'Business', u'http://www.forbes.com/business/index.xml'), - (u'Sports Money', u'http://www.forbes.com/sportsmoney/index.xml'), - (u'Leadership', u'http://www.forbes.com/leadership/index.xml'), + (u'Latest', u'https://www.forbes.com/news/index.xml'), + (u'Most Popular', u'https://www.forbes.com/feeds/popstories.xml'), + (u'Technology', u'https://www.forbes.com/technology/index.xml'), + (u'Business', u'https://www.forbes.com/business/index.xml'), + (u'Sports Money', u'https://www.forbes.com/sportsmoney/index.xml'), + (u'Leadership', u'https://www.forbes.com/leadership/index.xml'), ] + keep_only_tags = [ + classes('article-headline-container hero-image-block article-body bottom-contrib-block') + ] + + remove_tags = [ + classes('article-sharing'), + dict(name='button'), + ] + + def preprocess_html(self, soup): + h = soup.find(**classes('hero-image-block')) + if h is not None: + h1 = soup.find(**classes('article-headline-container')) + h.extract() + h1.append(h) + return soup + def get_browser(self): br = BasicNewsRecipe.get_browser(self) br.set_cookie('dailyWelcomeCookie', 'true', '.forbes.com') br.set_cookie('welcomeAd', 'true', '.forbes.com') return br - def preprocess_raw_html(self, raw, url): - root = html5lib.parse( - raw, namespaceHTMLElements=False, treebuilder='lxml') - for script in root.xpath('//script'): - if script.text and script.text.startswith('try {'): - idx = script.text.find('fbs_settings.content = {') - if idx > -1: - text = script.text.partition('=')[2].lstrip() - ridx = text.rfind('} catch(err)') - text = text[:ridx].rstrip().rstrip(';') - data = json.loads(text) - # from pprint import pformat - # print(pformat(data), file=open('/t/data.py', 'w')) - break - else: - raise ValueError('Failed to find serialized JSON content') - title = data['brandVoiceTitle'] - body = data['body'] - - def cap(m): - val = m.group() - if val.startswith('[/'): - return '' - return '