calibre/recipes/cnn.recipe
Kovid Goyal 567040ee1e Perform PEP8 compliance checks on the entire codebase
Some bits of PEP 8 are turned off via setup.cfg
2016-07-29 21:25:17 +05:30

96 lines
3.6 KiB
Plaintext

__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
Profile to download CNN
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
class CNN(BasicNewsRecipe):
title = 'CNN'
description = 'Global news'
timefmt = ' [%d %b %Y]'
__author__ = 'Kovid Goyal'
language = 'en'
no_stylesheets = True
use_embedded_content = False
oldest_article = 15
ignore_duplicate_articles = {'url'}
# recursions = 1
# match_regexps = [r'http://sportsillustrated.cnn.com/.*/[1-9].html']
max_articles_per_feed = 25
compress_news_images = True
compress_news_images_auto_size = 12
extra_css = '''
h1 {font-size:xx-large; font-family:Arial,Helvetica,sans-serif;}
.cnn_story_author, .cnn_stryathrtmp {font-size:xx-small; color:#4D4D4D; font-family:Arial,Helvetica,sans-serif;}
.cnn_strycaptiontxt, .cnnArticleGalleryPhotoContainer {font-size:xx-small; color:#4D4D4D; font-family:Arial,Helvetica,sans-serif;}
.cnn_strycbftrtxt, .cnnEditorialNote {font-size:xx-small; color:#4D4D4D; font-family:Arial,Helvetica,sans-serif;}
.cnn_strycntntlft {font-size:medium; font-family:Arial,Helvetica,sans-serif;}
'''
preprocess_regexps = [
(re.compile(r'<!--\[if.*if\]-->', re.DOTALL), lambda m: ''),
(re.compile(r'<script.*?</script>', re.DOTALL), lambda m: ''),
(re.compile(r'<style.*?</style>', re.DOTALL), lambda m: ''),
]
keep_only_tags = [
dict(id=['body-text', 'storycontent']),
dict(attrs={'class': ['pg-headline', 'metadata']}),
]
remove_tags = [
dict(attrs={'class': lambda x: x and bool({
'video__end-slate', 'owl-filmstrip', 'el-embed-instagram',
}.intersection(set(x.split())))}),
]
feeds = [
('Top News', 'http://rss.cnn.com/rss/cnn_topstories.rss'),
('World', 'http://rss.cnn.com/rss/cnn_world.rss'),
('U.S.', 'http://rss.cnn.com/rss/cnn_us.rss'),
# ('Sports', 'http://rss.cnn.com/rss/si_topstories.rss'),
('Business', 'http://rss.cnn.com/rss/money_latest.rss'),
('Politics', 'http://rss.cnn.com/rss/cnn_allpolitics.rss'),
('Law', 'http://rss.cnn.com/rss/cnn_law.rss'),
('Technology', 'http://rss.cnn.com/rss/cnn_tech.rss'),
('Science & Space', 'http://rss.cnn.com/rss/cnn_space.rss'),
('Health', 'http://rss.cnn.com/rss/cnn_health.rss'),
('Entertainment', 'http://rss.cnn.com/rss/cnn_showbiz.rss'),
('Education', 'http://rss.cnn.com/rss/cnn_education.rss'),
('Offbeat', 'http://rss.cnn.com/rss/cnn_offbeat.rss'),
('Most Popular', 'http://rss.cnn.com/rss/cnn_mostpopular.rss')
]
def preprocess_html(self, soup):
body = soup.find('body')
for h2 in soup.findAll(attrs={'class': 'pg-headline'}):
h2.extract()
body.insert(0, h2)
for img in soup.findAll('img', attrs={'data-src-medium': True}):
img['src'] = img['data-src-medium']
return soup
def get_article_url(self, article):
ans = BasicNewsRecipe.get_article_url(self, article)
ans = ans.partition('?')[0]
if '.com/videos/' in ans:
ans = None
return ans
def get_masthead_url(self):
masthead = 'http://i.cdn.turner.com/cnn/.element/img/3.0/global/header/intl/hdr-globe-central.gif'
br = BasicNewsRecipe.get_browser(self)
try:
br.open(masthead)
except:
self.log("\nCover unavailable")
masthead = None
return masthead