From 330779fa13700f2e2d7b4745e348a94fbc64b703 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 31 Aug 2009 13:05:13 -0600 Subject: [PATCH] Improved recipe for CNN --- src/calibre/web/feeds/recipes/recipe_cnn.py | 58 +++++++++++++++------ 1 file changed, 41 insertions(+), 17 deletions(-) diff --git a/src/calibre/web/feeds/recipes/recipe_cnn.py b/src/calibre/web/feeds/recipes/recipe_cnn.py index 369aff2e99..f9aef380f9 100644 --- a/src/calibre/web/feeds/recipes/recipe_cnn.py +++ b/src/calibre/web/feeds/recipes/recipe_cnn.py @@ -3,7 +3,6 @@ __copyright__ = '2008, Kovid Goyal ' ''' Profile to download CNN ''' -import re from calibre.web.feeds.news import BasicNewsRecipe class CNN(BasicNewsRecipe): @@ -11,28 +10,53 @@ class CNN(BasicNewsRecipe): title = 'CNN' description = 'Global news' timefmt = ' [%d %b %Y]' - __author__ = 'Kovid Goyal' + __author__ = 'Kovid Goyal and Sujata Raman' language = _('English') no_stylesheets = True use_embedded_content = False oldest_article = 15 - preprocess_regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in [ - (r'.*?.*?', lambda match : ''), - (r'', lambda match : ''), - (r'<\!\-\-Article End\-\->.*?', lambda match : ''), - (r'()
    .*?
', lambda match : match.group(1)), # drop story highlights - (r'

(.*?)

(.*?)

', lambda match : '

' + match.group(1) + '

' + match.group(2) + '

'), # sports uses h2 for main title and h1 for subtitle (???) switch these around - (r'.*?', lambda match : ''), # drop 'watch more' links - (r'(
).*?(||', lambda match : ''), # drop table formatting - (r'
.*?
', lambda match : ''), # drop extra business links - (r'.*?', lambda match : '') # drop business 'to top' link - ] ] + extra_css = ''' + h1{font-family :Arial,Helvetica,sans-serif; font-size:large} + h2{font-family :Arial,Helvetica,sans-serif; font-size:x-small} + .cnnTxtCmpnt{font-family :Arial,Helvetica,sans-serif; font-size:x-small} + .cnnTMcontent{font-family :Arial,Helvetica,sans-serif; font-size:xx-small;color:#575757} + .storytext{font-family :Arial,Helvetica,sans-serif; font-size:x-small} + .storybyline{font-family :Arial,Helvetica,sans-serif; font-size:xx-small; color:#575757} + .credit{font-family :Arial,Helvetica,sans-serif; font-size:xx-small; color:#575757} + .storyBrandingBanner{font-family :Arial,Helvetica,sans-serif; font-size:xx-small; color:#575757} + .storytimestamp{font-family :Arial,Helvetica,sans-serif; font-size:xx-small; color:#575757} + .timestamp{font-family :Arial,Helvetica,sans-serif; font-size:xx-small; color:#575757} + .subhead p{font-family :Arial,Helvetica,sans-serif; font-size:xx-small;} + .cnnStoryContent{font-family :Arial,Helvetica,sans-serif; font-size:xx-small} + .cnnContentContainer{font-family :Arial,Helvetica,sans-serif; font-size:xx-small} + .col1{font-family :Arial,Helvetica,sans-serif; font-size:x-small; color:#666666;} + .col3{color:#333333; font-family :Arial,Helvetica,sans-serif; font-size:x-small;font-weight:bold;} + .cnnInlineT1Caption{font-family :Arial,Helvetica,sans-serif; font-size:xx-small;font-weight:bold;} + .cnnInlineT1Credit{font-family :Arial,Helvetica,sans-serif; font-size:xx-small;color:#333333;} + .col10{color:#5A637E} + .cnnTimeStamp{font-family :Arial,Helvetica,sans-serif; font-size:xx-small;color:#333333;} + .galleryhedDek{font-family :Arial,Helvetica,sans-serif; font-size:xx-small;color:#575757;} + .galleryWidgetHeader{font-family :Arial,Helvetica,sans-serif; font-size:xx-small;color:#004276;} + .article-content{font-family :Arial,Helvetica,sans-serif; font-size:xx-small} + .cnnRecapStory{font-family :Arial,Helvetica,sans-serif; font-size:xx-small} + ''' + keep_only_tags = [ + dict(name='div', attrs={'class':["cnnWCBoxContent","cnnContent","cnnMainBodySecs"]}), + dict(name='div', attrs={'id':["contentBody","content"]}), + dict(name='td', attrs={'id':["cnnRecapStory"]}),] + remove_tags = [ + dict(name='div', attrs={'class':["storyLink","article-tools clearfix","widget video related-video vList","cnnFooterBox","scrollArrows","boxHeading","cnnInlineMailbag","mainCol_lastBlock","cnn_bookmarks","cnnFooterBox","cnnEndOfStory","cnnInlineSL","cnnStoryHighlights","cnnFooterClick","cnnSnapShotHeader","cnnStoryToolsFooter","cnnWsnr","cnnUGCBox","cnnTopNewsModule","cnnStoryElementBox","cnnStoryPhotoBoxNavigation"]}), + dict(name='span', attrs={'class':["cnnEmbeddedMosLnk"]}), + dict(name='div', attrs={'id':["cnnIncldHlder","articleCommentsContainer","featuredContent","superstarsWidget","shareMenuContainer","rssMenuContainer","storyBrandingBanner","cnnRightCol","siteFeatures","quigo628","rightColumn","clickIncludeBox","cnnHeaderRightCol","cnnSCFontLabel","cnnSnapShotBottomRight","cnnSCFontButtons","rightColumn"]}), + dict(name='p', attrs={'class':["cnnTopics"]}), + dict(name='td', attrs={'class':["cnnRightRail"]}), + dict(name='table', attrs={'class':["cnnTMbox"]}), + dict(name='ul', attrs={'id':["cnnTopNav","cnnBotNav","cnnSBNav"]}), + ] - def print_version(self, url): - return 'http://www.printthis.clickability.com/pt/printThis?clickMap=printThis&fb=Y&url=' + url + # def print_version(self, url): + # return 'http://www.printthis.clickability.com/pt/printThis?clickMap=printThis&fb=Y&url=' + url feeds = [ ('Top News', 'http://rss.cnn.com/rss/cnn_topstories.rss'),