diff --git a/resources/recipes/telegraph_uk.recipe b/resources/recipes/telegraph_uk.recipe
index 5d51bebba4..2c261987b2 100644
--- a/resources/recipes/telegraph_uk.recipe
+++ b/resources/recipes/telegraph_uk.recipe
@@ -9,8 +9,8 @@ from calibre.web.feeds.news import BasicNewsRecipe
 
 class TelegraphUK(BasicNewsRecipe):
     title = u'Telegraph.co.uk'
-    __author__ = 'Darko Miletic'
-    description = 'News from United Kingdom'
+    __author__ = 'Darko Miletic and Sujata Raman'
+    description = 'News from United Kingdom'
     oldest_article = 7
     max_articles_per_feed = 100
     no_stylesheets = True
@@ -18,23 +18,26 @@ class TelegraphUK(BasicNewsRecipe):
 
     use_embedded_content = False
 
-    extra_css = '''
-        h1{font-family :Arial,Helvetica,sans-serif; font-size:large; }
-        h2{font-family :Arial,Helvetica,sans-serif; font-size:x-small; color:#444444}
-        .story{font-family :Arial,Helvetica,sans-serif; font-size: x-small;}
-        .byline{color:#666666; font-family :Arial,Helvetica,sans-serif; font-size: xx-small;}
-        a{color:#234B7B; }
-        .imageExtras{color:#666666; font-family :Arial,Helvetica,sans-serif; font-size: xx-small;}
-        '''
-
-    keep_only_tags = [
+    extra_css = '''
+        h1{font-family :Arial,Helvetica,sans-serif; font-size:large; }
+        h2{font-family :Arial,Helvetica,sans-serif; font-size:x-small; color:#444444;}
+        .story{font-family :Arial,Helvetica,sans-serif; font-size: x-small;}
+        .byline{color:#666666; font-family :Arial,Helvetica,sans-serif; font-size: xx-small;}
+        a{color:#234B7B; }
+        .imageExtras{color:#666666; font-family :Arial,Helvetica,sans-serif; font-size: xx-small;}
+        '''
+
+    keep_only_tags = [
         dict(name='div', attrs={'class':'storyHead'})
         ,dict(name='div', attrs={'class':'story' })
-        #,dict(name='div', attrs={'class':['slideshowHD gutterUnder',"twoThirds gutter","caption" ] })
-        ]
-    remove_tags = [dict(name='div', attrs={'class':['related_links_inline',"imgindex","next","prev","gutterUnder"]})]
-
-    feeds = [
+        #,dict(name='div', attrs={'class':['slideshowHD gutterUnder',"twoThirds gutter","caption" ] })
+        ]
+    remove_tags = [dict(name='div', attrs={'class':['related_links_inline',"imgindex","next","prev","gutterUnder",'ssImgHide','imageExtras','ssImg hide']})
+        #,dict(name='div', attrs={'class':['toolshideoneQuarter']})
+        ,dict(name='span', attrs={'class':['num','placeComment']})
+        ]
+
+    feeds = [
         (u'UK News' , u'http://www.telegraph.co.uk/news/uknews/rss' )
         ,(u'World News' , u'http://www.telegraph.co.uk/news/worldnews/rss' )
         ,(u'Politics' , u'http://www.telegraph.co.uk/news/newstopics/politics/rss' )
@@ -45,15 +48,41 @@ class TelegraphUK(BasicNewsRecipe):
         ,(u'Earth News' , u'http://www.telegraph.co.uk/earth/earthnews/rss' )
         ,(u'Comment' , u'http://www.telegraph.co.uk/comment/rss' )
         ,(u'How about that?', u'http://www.telegraph.co.uk/news/newstopics/howaboutthat/rss' )
-        ]
+        ]
 
 
     def get_article_url(self, article):
-
+
         url = article.get('guid', None)
-
+
         if 'picture-galleries' in url or 'pictures' in url or 'picturegalleries' in url :
             url = None
-
+
         return url
-
+
+    def postprocess_html(self, soup, first):
+        '''
+        Remove <p> children of <div class="byline"> blocks that pass the
+        getattr() test on their first child node.
+
+        :param soup: parsed document to clean up
+        :param first: not used by this method
+        :return: the soup that was passed in, after the removals
+        '''
+        for byline_tag in soup.findAll(name='div', attrs={'class':'byline'}):
+            for p_tag in byline_tag.findAll(name='p'):
+                # An empty <p> has no first child; skip it instead of
+                # raising IndexError on contents[0].
+                if not p_tag.contents:
+                    continue
+                # NOTE(review): getattr() falls back to True whenever the
+                # first child has no attribute named "Comments", so this is
+                # not a comparison with the paragraph text - confirm intent.
+                if getattr(p_tag.contents[0], "Comments", True):
+                    p_tag.extract()
+        return soup
+
+
+
+
+
diff --git a/resources/recipes/toronto_sun.recipe b/resources/recipes/toronto_sun.recipe
index 996b27c1bd..6fd9438c42 100644
--- a/resources/recipes/toronto_sun.recipe
+++ b/resources/recipes/toronto_sun.recipe
@@ -10,7 +10,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
 
 class TorontoSun(BasicNewsRecipe):
     title = 'Toronto SUN'
-    __author__ = 'Darko Miletic'
+    __author__ = 'Darko Miletic and Sujata Raman'
     description = 'News from Canada'
     publisher = 'Toronto Sun'
     category = 'news, politics, Canada'
@@ -21,25 +21,50 @@ class TorontoSun(BasicNewsRecipe):
     encoding = 'cp1252'
     language = 'en_CA'
 
-    conversion_options = {
-        'comment' : description
-        , 'tags' : category
-        , 'publisher' : publisher
-        , 'language' : language
-        }
+    conversion_options = {
+        'comment' : description
+        , 'tags' : category
+        , 'publisher' : publisher
+        , 'language' : language
+        }
 
-    keep_only_tags =[
-        dict(name='div', attrs={'class':'articleHead'})
-        ,dict(name='div', attrs={'id':'channelContent'})
-        ]
-    remove_tags = [
-        dict(name='div',attrs={'class':['leftBox','bottomBox clear','bottomBox','breadCrumb']})
-        ,dict(name=['link','iframe','object'])
-        ,dict(name='a',attrs={'rel':'swap'})
-        ,dict(name='ul',attrs={'class':'tabs dl contentSwap'})
-        ]
+    keep_only_tags = [
+        dict(name='div', attrs={'class':['articleHead','leftBox']})
+        ,dict(name='div', attrs={'id':'channelContent'})
+        ,dict(name='div', attrs={'id':'rotateBox'})
+        ,dict(name='img')
+        ]
+    remove_tags = [
+        dict(name='div',attrs={'class':['bottomBox clear','bottomBox','breadCrumb','articleControls thin','articleControls thin short','extraVideoList']})
+        ,dict(name='h2',attrs={'class':'microhead'})
+        ,dict(name='div',attrs={'id':'commentsBottom'})
+        ,dict(name=['link','iframe','object'])
+        ,dict(name='a',attrs={'rel':'swap'})
+        ,dict(name='a',attrs={'href':'/news/haiti/'})
+        ,dict(name='ul',attrs={'class':['tabs dl contentSwap','micrositeNav clearIt hList','galleryNav rotateNav']})
+        ]
+
+    remove_tags_after = [
+        dict(name='div',attrs={'class':'bottomBox clear'})
+        ,dict(name='div',attrs={'class':'rotateBox'})
+        ,dict(name='div',attrs={'id':'contentSwap'})
+        ]
+
+
+    extra_css = '''
+        h1{font-family :Arial,Helvetica,sans-serif; font-size:large;}
+        h2{font-family :Arial,Helvetica,sans-serif; font-size:medium; color:#666666;}
+        h3{font-family :Arial,Helvetica,sans-serif; font-size:medium; color:#000000;}
+        p{font-family :Arial,Helvetica,sans-serif; font-size:x-small;}
+        .bold{font-family :Arial,Helvetica,sans-serif; font-size: xx-small;color:#444444;margin-left: 0px;}
+        .subheading{font-family :Arial,Helvetica,sans-serif; font-size:medium; color:#000000; font-weight: bold;}
+        .byline{color:#666666; font-family :Arial,Helvetica,sans-serif; font-size: xx-small;}
+        .byline span{color:#666666; font-family :Arial,Helvetica,sans-serif; font-size: xx-small; text-transform: uppercase;}
+        .updated{font-family :Arial,Helvetica,sans-serif; font-size: xx-small;}
+        .galleryCaption{font-family :Arial,Helvetica,sans-serif; font-size: x-small;}
+        .galleryUpdated{font-family :Arial,Helvetica,sans-serif; font-size: x-small;}
+        '''
 
-    remove_tags_after = dict(name='div',attrs={'class':'bottomBox clear'})
 
     feeds = [
         (u'News' , u'http://www.torontosun.com/news/rss.xml' )
@@ -48,3 +73,35 @@ class TorontoSun(BasicNewsRecipe):
         ,(u'World' , u'http://www.torontosun.com/news/world/rss.xml' )
         ,(u'Money' , u'http://www.torontosun.com/money/rss.xml' )
         ]
+
+    def preprocess_html(self, soup):
+        '''
+        To fetch images from the specified source: point each <img> at the
+        address embedded in the query string of its src attribute.
+
+        The text after the first '=' of the query string is taken; the part
+        before its first '&' becomes the new src, and the value of the last
+        '&'-separated parameter is read as WIDTHxHEIGHT.
+
+        :param soup: parsed document to adjust
+        :return: the soup that was passed in, after the rewrites
+        '''
+        for img in soup.findAll('img', src=True):
+            url = img.get('src').split('?')[-1].partition('=')[-1]
+            if not url:
+                continue
+            img['src'] = url.split('&')[0].partition('=')[0]
+            # Only set the dimensions when the last parameter really holds
+            # WIDTHxHEIGHT; indexing split('x')[1] unconditionally raised
+            # IndexError for a src without a size parameter.
+            size = url.split('&')[-1].partition('=')[-1].split('x')
+            if len(size) > 1:
+                img['width'] = size[0]
+                img['height'] = size[1]
+        return soup
+
+
+
+
+
+