Improved recipe for CNN

This commit is contained in:
Kovid Goyal 2009-08-31 13:05:13 -06:00
parent c1eceec7e7
commit 330779fa13

View File

@ -3,7 +3,6 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
''' '''
Profile to download CNN Profile to download CNN
''' '''
import re
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class CNN(BasicNewsRecipe): class CNN(BasicNewsRecipe):
@ -11,28 +10,53 @@ class CNN(BasicNewsRecipe):
title = 'CNN' title = 'CNN'
description = 'Global news' description = 'Global news'
timefmt = ' [%d %b %Y]' timefmt = ' [%d %b %Y]'
__author__ = 'Kovid Goyal' __author__ = 'Kovid Goyal and Sujata Raman'
language = _('English') language = _('English')
no_stylesheets = True no_stylesheets = True
use_embedded_content = False use_embedded_content = False
oldest_article = 15 oldest_article = 15
preprocess_regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in [ extra_css = '''
(r'<head>.*?<title', lambda match : '<head><title'), h1{font-family :Arial,Helvetica,sans-serif; font-size:large}
(r'</title>.*?</head>', lambda match : '</title></head>'), h2{font-family :Arial,Helvetica,sans-serif; font-size:x-small}
(r'<body.*?<\!\-\-Article.*?>', lambda match : '<body>'), .cnnTxtCmpnt{font-family :Arial,Helvetica,sans-serif; font-size:x-small}
(r'<\!\-\-Article End\-\->.*?</body>', lambda match : '</body>'), .cnnTMcontent{font-family :Arial,Helvetica,sans-serif; font-size:xx-small;color:#575757}
(r'(</h\d>)<ul>.*?</ul>', lambda match : match.group(1)), # drop story highlights .storytext{font-family :Arial,Helvetica,sans-serif; font-size:x-small}
(r'<h2>(.*?)</h2><h1>(.*?)</h1>', lambda match : '<h1>' + match.group(1) + '</h1><h2>' + match.group(2) + '</h2>'), # sports uses h2 for main title and h1 for subtitle (???) switch these around .storybyline{font-family :Arial,Helvetica,sans-serif; font-size:xx-small; color:#575757}
(r'<span class="cnnEmbeddedMosLnk">.*?</span>', lambda match : ''), # drop 'watch more' links .credit{font-family :Arial,Helvetica,sans-serif; font-size:xx-small; color:#575757}
(r'(<div class="cnnstorybody">).*?(<p)', lambda match : match.group(1) + match.group(2)), # drop sports photos .storyBrandingBanner{font-family :Arial,Helvetica,sans-serif; font-size:xx-small; color:#575757}
(r'</?table.*?>|</?tr.*?>|</?td.*?>', lambda match : ''), # drop table formatting .storytimestamp{font-family :Arial,Helvetica,sans-serif; font-size:xx-small; color:#575757}
(r'<div class="cnnendofstorycontent".*?>.*?</div>', lambda match : ''), # drop extra business links .timestamp{font-family :Arial,Helvetica,sans-serif; font-size:xx-small; color:#575757}
(r'<a href="#TOP">.*?</a>', lambda match : '') # drop business 'to top' link .subhead p{font-family :Arial,Helvetica,sans-serif; font-size:xx-small;}
] ] .cnnStoryContent{font-family :Arial,Helvetica,sans-serif; font-size:xx-small}
.cnnContentContainer{font-family :Arial,Helvetica,sans-serif; font-size:xx-small}
.col1{font-family :Arial,Helvetica,sans-serif; font-size:x-small; color:#666666;}
.col3{color:#333333; font-family :Arial,Helvetica,sans-serif; font-size:x-small;font-weight:bold;}
.cnnInlineT1Caption{font-family :Arial,Helvetica,sans-serif; font-size:xx-small;font-weight:bold;}
.cnnInlineT1Credit{font-family :Arial,Helvetica,sans-serif; font-size:xx-small;color:#333333;}
.col10{color:#5A637E}
.cnnTimeStamp{font-family :Arial,Helvetica,sans-serif; font-size:xx-small;color:#333333;}
.galleryhedDek{font-family :Arial,Helvetica,sans-serif; font-size:xx-small;color:#575757;}
.galleryWidgetHeader{font-family :Arial,Helvetica,sans-serif; font-size:xx-small;color:#004276;}
.article-content{font-family :Arial,Helvetica,sans-serif; font-size:xx-small}
.cnnRecapStory{font-family :Arial,Helvetica,sans-serif; font-size:xx-small}
'''
# Tag matchers (name + attrs) selecting the containers that hold the article
# body. Per calibre's BasicNewsRecipe API these are the elements to keep;
# each 'attrs' value lists the alternative class/id values to match.
keep_only_tags = [
    dict(
        name='div',
        attrs={'class': ["cnnWCBoxContent", "cnnContent", "cnnMainBodySecs"]},
    ),
    dict(
        name='div',
        attrs={'id': ["contentBody", "content"]},
    ),
    dict(
        name='td',
        attrs={'id': ["cnnRecapStory"]},
    ),
]
# Tag matchers (name + attrs) for page furniture to strip from the article:
# navigation, share/RSS menus, comment containers, related-story boxes, etc.
# Each 'attrs' value lists the alternative class/id values to match; every
# value appears exactly once (earlier duplicates of "cnnFooterBox" and
# "rightColumn" were redundant and have been dropped).
remove_tags = [
    dict(name='div', attrs={'class': [
        "storyLink",
        "article-tools clearfix",
        "widget video related-video vList",
        "cnnFooterBox",
        "scrollArrows",
        "boxHeading",
        "cnnInlineMailbag",
        "mainCol_lastBlock",
        "cnn_bookmarks",
        "cnnEndOfStory",
        "cnnInlineSL",
        "cnnStoryHighlights",
        "cnnFooterClick",
        "cnnSnapShotHeader",
        "cnnStoryToolsFooter",
        "cnnWsnr",
        "cnnUGCBox",
        "cnnTopNewsModule",
        "cnnStoryElementBox",
        "cnnStoryPhotoBoxNavigation",
    ]}),
    dict(name='span', attrs={'class': ["cnnEmbeddedMosLnk"]}),
    dict(name='div', attrs={'id': [
        "cnnIncldHlder",
        "articleCommentsContainer",
        "featuredContent",
        "superstarsWidget",
        "shareMenuContainer",
        "rssMenuContainer",
        "storyBrandingBanner",
        "cnnRightCol",
        "siteFeatures",
        "quigo628",
        "rightColumn",
        "clickIncludeBox",
        "cnnHeaderRightCol",
        "cnnSCFontLabel",
        "cnnSnapShotBottomRight",
        "cnnSCFontButtons",
    ]}),
    dict(name='p', attrs={'class': ["cnnTopics"]}),
    dict(name='td', attrs={'class': ["cnnRightRail"]}),
    dict(name='table', attrs={'class': ["cnnTMbox"]}),
    dict(name='ul', attrs={'id': ["cnnTopNav", "cnnBotNav", "cnnSBNav"]}),
]
def print_version(self, url): # def print_version(self, url):
return 'http://www.printthis.clickability.com/pt/printThis?clickMap=printThis&fb=Y&url=' + url # return 'http://www.printthis.clickability.com/pt/printThis?clickMap=printThis&fb=Y&url=' + url
feeds = [ feeds = [
('Top News', 'http://rss.cnn.com/rss/cnn_topstories.rss'), ('Top News', 'http://rss.cnn.com/rss/cnn_topstories.rss'),