# Mirror of https://github.com/kovidgoyal/calibre.git
# Synced 2025-07-09 03:04:10 -04:00
# 50 lines, 1.7 KiB, Plaintext
import re
|
|
from calibre.web.feeds.news import BasicNewsRecipe
|
|
|
|
class Forbes(BasicNewsRecipe):
    """Calibre news recipe for Forbes business and financial news.

    Articles come from the RSS feeds listed in ``feeds``. Links found in a
    downloaded page are followed one level deep (``recursions = 1``), but
    only when ``is_link_wanted`` accepts them, i.e. when they look like a
    further page of a multi-page article.
    """

    title = u'Forbes'
    description = 'Business and Financial News'
    __author__ = 'Kovid Goyal'
    oldest_article = 30
    max_articles_per_feed = 20
    language = 'en'
    encoding = 'utf-8'
    # Link-following depth; which links qualify is decided by is_link_wanted().
    recursions = 1

    no_stylesheets = True

    cover_url = u'http://www.forbes.com/media/current_covers/forbes_120_160.gif'

    # (section title, feed URL) pairs.
    feeds = [(u'Latest', u'http://www.forbes.com/news/index.xml'),
             (u'Most Popular', u'http://www.forbes.com/feeds/popstories.xml'),
             (u'Technology', u'http://www.forbes.com/technology/index.xml'),
             (u'Business', u'http://www.forbes.com/business/index.xml'),
             (u'Sports Money', u'http://www.forbes.com/sportsmoney/index.xml'),
             (u'Leadership', u'http://www.forbes.com/leadership/index.xml'),]

    # Match any element whose whitespace-separated class list shares at least
    # one name with the set below. The lambda returns a falsy value when the
    # class attribute is missing/empty or when the intersection is empty.
    keep_only_tags = \
        {'class':lambda x: x and (set(x.split()) & {'body', 'pagination',
            'articleHead', 'article_head'})}
    remove_tags_before = {'name':'h1'}
    remove_tags = [
        {'class':['comment_bug', 'engagement_block',
                  'video_promo_block', 'article_actions']},
        {'id':'comments'}
    ]

    def is_link_wanted(self, url, tag):
        """Return True if ``url`` looks like page 2-9 of a multi-page article.

        A link qualifies when it is an absolute http(s) URL that contains a
        path segment consisting of a single digit from 2 to 9 (``/2/`` ...
        ``/9/``). Both schemes are accepted so that continuation pages served
        over https are followed as well. Multi-digit segments such as a year
        (``/2013/``) deliberately do not match.

        :param url: The absolute URL of the candidate link.
        :param tag: The tag the link came from (unused here).
        :return: ``True`` to follow the link, ``False`` to skip it.
        """
        ans = re.match(r'https?://.*/[2-9]/', url) is not None
        if ans:
            self.log('Following multipage link: %s'%url)
        return ans

    def postprocess_html(self, soup, first_fetch):
        """Strip pagination widgets and, on followed pages, the first <h1>.

        :param soup: Parsed HTML of the downloaded page; modified in place.
        :param first_fetch: True for the first page of an article, False for
            pages reached by following links.
        :return: The same ``soup`` object after cleanup.
        """
        # Elements with the 'pagination' class are kept by keep_only_tags
        # (so their links can be followed) and removed here from the output.
        for pag in soup.findAll(True, 'pagination'):
            pag.extract()
        if not first_fetch:
            # Presumably the continuation page repeats the article headline;
            # drop the first <h1> so it does not appear again.
            h1 = soup.find('h1')
            if h1 is not None:
                h1.extract()
        return soup
|
|
|