__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic '
'''
www.sueddeutsche.de/sz/
'''

import urllib
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe


class SueddeutcheZeitung(BasicNewsRecipe):
    """Fetch the daily print edition of the Sueddeutsche Zeitung.

    Requires a subscription (paid content); credentials are posted to the
    paper's login box before fetching.  The index of today's edition lives
    at http://www.sueddeutsche.de/sz/YYYY-MM-DD/.
    """
    # NOTE(review): 'Sueddeutche' was a misspelling of 'Sueddeutsche' in the
    # user-facing title/publisher strings; corrected here.  The class name is
    # kept unchanged so external references keep working.
    title = 'Sueddeutsche Zeitung'
    __author__ = 'Darko Miletic'
    description = 'News from Germany. Access to paid content.'
    publisher = 'Sueddeutsche Zeitung'
    category = 'news, politics, Germany'
    no_stylesheets = True
    oldest_article = 2
    encoding = 'cp1252'
    needs_subscription = True
    remove_empty_feeds = True
    PREFIX = 'http://www.sueddeutsche.de'
    # Index page of today's print edition, e.g. .../sz/2010-02-01/
    INDEX = PREFIX + strftime('/sz/%Y-%m-%d/')
    LOGIN = PREFIX + '/app/lbox/index.html'
    use_embedded_content = False
    masthead_url = 'http://pix.sueddeutsche.de/img/g_.gif'
    language = 'de_DE'
    extra_css = ' body{font-family: Arial,Helvetica,sans-serif} '

    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
        'linearize_tables': True,
    }

    remove_attributes = ['height', 'width']

    def get_browser(self):
        """Return a browser, logged in to the paid area when credentials are set."""
        br = BasicNewsRecipe.get_browser()
        br.open(self.INDEX)
        if self.username is not None and self.password is not None:
            # Field names mirror the site's login form exactly.
            data = urllib.urlencode({
                'login_name': self.username,
                'login_passwort': self.password,
                'lboxaction': 'doLogin',
                'passtxt': 'Passwort',
                'referer': self.INDEX,
                'x': '22',
                'y': '7',
            })
            br.open(self.LOGIN, data)
        return br

    remove_tags = [
        dict(attrs={'class': 'hidePrint'}),
        dict(name=['link', 'object', 'embed', 'base', 'iframe']),
    ]
    remove_tags_before = dict(name='h2')
    remove_tags_after = dict(attrs={'class': 'author'})

    # Section index pages of the print edition (not RSS feeds).
    feeds = [
        (u'Politik'      , INDEX + 'politik/'      ),
        (u'Seite drei'   , INDEX + 'seitedrei/'    ),
        (u'Meinungsseite', INDEX + 'meinungsseite/'),
        (u'Wissen'       , INDEX + 'wissen/'       ),
        (u'Panorama'     , INDEX + 'panorama/'     ),
        (u'Feuilleton'   , INDEX + 'feuilleton/'   ),
        (u'Medien'       , INDEX + 'medien/'       ),
        (u'Wirtschaft'   , INDEX + 'wirtschaft/'   ),
        (u'Sport'        , INDEX + 'sport/'        ),
        (u'Bayern'       , INDEX + 'bayern/'       ),
        (u'Muenchen'     , INDEX + 'muenchen/'     ),
        (u'jetzt.de'     , INDEX + 'jetzt.de/'     ),
    ]

    def parse_index(self):
        """Scrape each section page of today's edition into article lists.

        Returns a list of (section title, articles) tuples in the shape
        BasicNewsRecipe expects.
        """
        totalfeeds = []
        lfeeds = self.get_feeds()
        for feedobj in lfeeds:
            feedtitle, feedurl = feedobj
            self.report_progress(0, _('Fetching feed') + ' %s...' % (feedtitle if feedtitle else feedurl))
            articles = []
            soup = self.index_to_soup(feedurl)
            tbl = soup.find(attrs={'class': 'szprintd'})
            if tbl is None:
                # Robustness fix: a section page without the print table used
                # to raise AttributeError; report an empty section instead
                # (remove_empty_feeds drops it later).
                totalfeeds.append((feedtitle, articles))
                continue
            for item in tbl.findAll(name='td', attrs={'class': 'topthema'}):
                titel = item.find(attrs={'class': 'Titel'})
                if titel is None or titel.a is None:
                    continue  # malformed cell without a headline link
                atag = titel.a
                ptag = item.find('p')
                # Drop inline <script> inside the teaser before extracting text.
                stag = ptag.find('script') if ptag is not None else None
                if stag:
                    stag.extract()
                url = self.PREFIX + atag['href']
                title = self.tag_to_string(atag)
                description = self.tag_to_string(ptag) if ptag is not None else u''
                articles.append({
                    'title': title,
                    'date': strftime(self.timefmt),
                    'url': url,
                    'description': description,
                })
            totalfeeds.append((feedtitle, articles))
        return totalfeeds

    def print_version(self, url):
        # The print-friendly page lives at <article-url>print.html
        return url + 'print.html'
extra_css = '''
    h1{font-family :Arial,Helvetica,sans-serif; font-size:large; }
    h2{font-family :Arial,Helvetica,sans-serif; font-size:x-small; color:#444444;}
    .story{font-family :Arial,Helvetica,sans-serif; font-size: x-small;}
    .byline{color:#666666; font-family :Arial,Helvetica,sans-serif; font-size: xx-small;}
    a{color:#234B7B; }
    .imageExtras{color:#666666; font-family :Arial,Helvetica,sans-serif; font-size: xx-small;}
    '''

# Keep only the headline block and the story body.
keep_only_tags = [
    dict(name='div', attrs={'class':'storyHead'})
    ,dict(name='div', attrs={'class':'story' })
    #,dict(name='div', attrs={'class':['slideshowHD gutterUnder',"twoThirds gutter","caption" ] })
]
# Strip related-link boxes, slideshow chrome and comment widgets.
remove_tags = [
    dict(name='div', attrs={'class':['related_links_inline',"imgindex","next","prev","gutterUnder",'ssImgHide','imageExtras','ssImg hide']})
    #,dict(name='div', attrs={'class':['toolshideoneQuarter']})
    ,dict(name='span', attrs={'class':['num','placeComment']})
]

def get_article_url(self, article):
    """Return the article URL from the feed entry's guid, or None to skip it.

    Picture-gallery entries are skipped entirely.  Robustness fix: the
    original applied ``in`` tests to a guid that may be absent (None),
    raising TypeError.
    """
    url = article.get('guid', None)
    if url is None:
        return None
    if 'picture-galleries' in url or 'pictures' in url or 'picturegalleries' in url:
        return None
    return url

def postprocess_html(self, soup, first):
    """Remove the paragraphs inside the byline block (share/comment links).

    NOTE(review): the original test ``getattr(pTag.contents[0], "Comments",
    True)`` is truthy for every BeautifulSoup node, so every <p> was
    extracted anyway; it also raised IndexError on empty paragraphs.  This
    keeps the effective behaviour (drop all byline paragraphs) without the
    crash.
    """
    for bylineTag in soup.findAll(name='div', attrs={'class':'byline'}):
        for pTag in bylineTag.findAll(name='p'):
            pTag.extract()
    return soup
# Keep the headline area, the article body, rotating galleries and images.
keep_only_tags = [
    dict(name='div', attrs={'class':['articleHead','leftBox']})
    ,dict(name='div', attrs={'id':'channelContent'})
    ,dict(name='div', attrs={'id':'rotateBox'})
    ,dict(name='img')
]
# Strip page chrome, comment widgets and micro-site navigation.
remove_tags = [
    dict(name='div', attrs={'class':['bottomBox clear','bottomBox','breadCrumb','articleControls thin','articleControls thin short','extraVideoList']})
    ,dict(name='h2', attrs={'class':'microhead'})
    ,dict(name='div', attrs={'id':'commentsBottom'})
    ,dict(name=['link','iframe','object'])
    ,dict(name='a', attrs={'rel':'swap'})
    ,dict(name='a', attrs={'href':'/news/haiti/'})
    ,dict(name='ul', attrs={'class':['tabs dl contentSwap','micrositeNav clearIt hList','galleryNav rotateNav']})
]

remove_tags_after = [
    dict(name='div', attrs={'class':'bottomBox clear'})
    ,dict(name='div', attrs={'class':'rotateBox'})
    ,dict(name='div', attrs={'id':'contentSwap'})
]

extra_css = '''
    h1{font-family :Arial,Helvetica,sans-serif; font-size:large;}
    h2{font-family :Arial,Helvetica,sans-serif; font-size:medium; color:#666666;}
    h3{font-family :Arial,Helvetica,sans-serif; font-size:medium; color:#000000;}
    p{font-family :Arial,Helvetica,sans-serif; font-size:x-small;}
    .bold{font-family :Arial,Helvetica,sans-serif; font-size: xx-small;color:#444444;margin-left: 0px;}
    .subheading{font-family :Arial,Helvetica,sans-serif; font-size:medium; color:#000000; font-weight: bold;}
    .byline{color:#666666; font-family :Arial,Helvetica,sans-serif; font-size: xx-small;}
    .byline span{color:#666666; font-family :Arial,Helvetica,sans-serif; font-size: xx-small; text-transform: uppercase;}
    .updated{font-family :Arial,Helvetica,sans-serif; font-size: xx-small;}
    .galleryCaption{font-family :Arial,Helvetica,sans-serif; font-size: x-small;}
    .galleryUpdated{font-family :Arial,Helvetica,sans-serif; font-size: x-small;}
    '''

def preprocess_html(self, soup):
    """Rewrite proxied <img> tags to point straight at the source image.

    Pages serve images through a resizer whose query string carries the real
    URL and the pixel size, e.g. ``...?url=http://real/pic.jpg&s=620x400``.
    Robustness fix: the original indexed ``split('x')[1]`` unconditionally
    and raised IndexError whenever the size parameter was missing.
    """
    for img in soup.findAll('img', src=True):
        # Everything after the first '=' of the query string: the real URL
        # plus, optionally, '&<name>=WIDTHxHEIGHT'.
        url = img.get('src').split('?')[-1].partition('=')[-1]
        if not url:
            continue  # no '=' in the query string: leave the tag alone
        img['src'] = url.split('&')[0].partition('=')[0]
        dims = url.split('&')[-1].partition('=')[-1].split('x')
        if len(dims) >= 2:  # size parameter present and well formed
            img['width'] = dims[0]
            img['height'] = dims[1]
    return soup