From 7933c43e7aca2c5d5d5afd84a4a8841788171a4c Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 15 Mar 2020 18:57:40 +0530 Subject: [PATCH] Update Forbes --- recipes/forbes.recipe | 59 +++++++++++++++++-------------------------- 1 file changed, 23 insertions(+), 36 deletions(-) diff --git a/recipes/forbes.recipe b/recipes/forbes.recipe index a6d7192fce..5ff708618f 100644 --- a/recipes/forbes.recipe +++ b/recipes/forbes.recipe @@ -1,7 +1,4 @@ from __future__ import absolute_import, division, print_function, unicode_literals -import html5lib -import json -import re from calibre.web.feeds.news import BasicNewsRecipe @@ -33,47 +30,37 @@ class Forbes(BasicNewsRecipe): } ''' feeds = [ - (u'Latest', u'http://www.forbes.com/news/index.xml'), - (u'Most Popular', u'http://www.forbes.com/feeds/popstories.xml'), - (u'Technology', u'http://www.forbes.com/technology/index.xml'), - (u'Business', u'http://www.forbes.com/business/index.xml'), - (u'Sports Money', u'http://www.forbes.com/sportsmoney/index.xml'), - (u'Leadership', u'http://www.forbes.com/leadership/index.xml'), + (u'Latest', u'https://www.forbes.com/news/index.xml'), + (u'Most Popular', u'https://www.forbes.com/feeds/popstories.xml'), + (u'Technology', u'https://www.forbes.com/technology/index.xml'), + (u'Business', u'https://www.forbes.com/business/index.xml'), + (u'Sports Money', u'https://www.forbes.com/sportsmoney/index.xml'), + (u'Leadership', u'https://www.forbes.com/leadership/index.xml'), ] + keep_only_tags = [ + classes('article-headline-container hero-image-block article-body bottom-contrib-block') + ] + + remove_tags = [ + classes('article-sharing'), + dict(name='button'), + ] + + def preprocess_html(self, soup): + h = soup.find(**classes('hero-image-block')) + if h is not None: + h1 = soup.find(**classes('article-headline-container')) + h.extract() + h1.append(h) + return soup + def get_browser(self): br = BasicNewsRecipe.get_browser(self) br.set_cookie('dailyWelcomeCookie', 'true', '.forbes.com') br.set_cookie('welcomeAd', 'true', '.forbes.com') return br - def preprocess_raw_html(self, raw, url): - root = html5lib.parse( - raw, namespaceHTMLElements=False, treebuilder='lxml') - for script in root.xpath('//script'): - if script.text and script.text.startswith('try {'): - idx = script.text.find('fbs_settings.content = {') - if idx > -1: - text = script.text.partition('=')[2].lstrip() - ridx = text.rfind('} catch(err)') - text = text[:ridx].rstrip().rstrip(';') - data = json.loads(text) - # from pprint import pformat - # print(pformat(data), file=open('/t/data.py', 'w')) - break - else: - raise ValueError('Failed to find serialized JSON content') - title = data['brandVoiceTitle'] - body = data['body'] - - def cap(m): - val = m.group() - if val.startswith('[/'): - return '' - return '
' - body = re.sub(r'\[/?caption[^\]]*\]', cap, body) - return '''{0}

{0}

{1}
'''.format(title, body) - # def parse_index(self): # return [('Articles', [{'title':'Test', 'url': # 'http://www.forbes.com/sites/hamdiraini/2016/04/25/bazin-seeks-startups-to-accelerate-accorhotels-transformation/'}])]