from calibre.web.feeds.news import BasicNewsRecipe


class Cyberpresse(BasicNewsRecipe):
    """Recipe for Cyberpresse: Canadian news in French, pulled from RSS feeds."""

    # --- Publication metadata -------------------------------------------
    title = u'Cyberpresse'
    __author__ = 'balok and Sujata Raman'
    description = 'Canadian news in French'
    language = 'fr'

    # --- Fetch limits and source handling --------------------------------
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    remove_javascript = True
    html2lrf_options = [
        '--left-margin=0',
        '--right-margin=0',
        '--top-margin=0',
        '--bottom-margin=0',
    ]
    encoding = 'utf-8'

    # Article containers to retain: one selected by class, one by id.
    keep_only_tags = [
        {'name': 'div', 'attrs': {'class': 'article-page'}},
        {'name': 'div', 'attrs': {'id': 'articlePage'}},
    ]

    # Stylesheet for the generated book. The rules are joined with single
    # spaces (adjacent literals) so the resulting string is unchanged.
    extra_css = (
        ' .photodata{font-family:Arial,Helvetica,Verdana,sans-serif;color: #999999; font-size: 90%; }'
        ' h1{font-family:Georgia,Times,serif ; font-size: large; }'
        ' .amorce{font-family:Arial,Helvetica,Verdana,sans-serif; font-weight:bold;}'
        ' .article-page{font-family:Arial,Helvetica,Verdana,sans-serif; font-size: x-small;}'
        ' #articlePage{font-family:Arial,Helvetica,Verdana,sans-serif; font-size: x-small;}'
        ' .auteur{font-family:Georgia,Times,sans-serif; font-size: 90%; color:#006699 ;}'
        ' .bodyText{font-family:Arial,Helvetica,Verdana,sans-serif; font-size: x-small;}'
        ' .byLine{font-family:Arial,Helvetica,Verdana,sans-serif; font-size: 90%;}'
        ' .entry{font-family:Arial,Helvetica,Verdana,sans-serif; font-size: x-small;}'
        ' .minithumb-auteurs{font-family:Arial,Helvetica,Verdana,sans-serif; font-size: 90%; }'
        ' a{color:#003399; font-weight:bold; }'
        ' '
    )

    # Elements stripped from each article, matched by tag name and class.
    remove_tags = [
        {'name': 'div', 'attrs': {'class': ['centerbar', 'colspan', 'share-module']}},
        {'name': 'p', 'attrs': {'class': ['zoom']}},
        {'name': 'ul', 'attrs': {'class': ['stories']}},
        {'name': 'h4', 'attrs': {'class': ['general-cat']}},
    ]

    # (section title, RSS URL) pairs.
    feeds = [
        (u'Manchettes', u'http://www.cyberpresse.ca/rss/225.xml'),
        (u'Capitale nationale', u'http://www.cyberpresse.ca/rss/501.xml'),
        (u'Opinions', u'http://www.cyberpresse.ca/rss/977.xml'),
        (u'Insolite', u'http://www.cyberpresse.ca/rss/279.xml'),
    ]

    def postprocess_html(self, soup, first):
        """Rename every <i> and <strong> element in *soup* to <div>.

        The soup is modified in place and the same object is returned.
        The *first* argument is accepted but not used here.
        """
        inline_names = ['i', 'strong']
        for node in soup.findAll(name=inline_names):
            node.name = 'div'
        return soup