diff --git a/resources/recipes/cyberpresse.recipe b/resources/recipes/cyberpresse.recipe
index 9e20c11502..5de953df3a 100644
--- a/resources/recipes/cyberpresse.recipe
+++ b/resources/recipes/cyberpresse.recipe
@@ -7,18 +7,51 @@ class Cyberpresse(BasicNewsRecipe):
     __author__ = 'balok'
     description = 'Canadian news in French'
     language = 'fr'
-
+
     oldest_article = 7
     max_articles_per_feed = 100
     no_stylesheets = True
+    remove_javascript = True
     html2lrf_options = ['--left-margin=0','--right-margin=0','--top-margin=0','--bottom-margin=0']
+    encoding = 'utf-8'
 
-    preprocess_regexps = [
-        (re.compile(r'', re.IGNORECASE | re.DOTALL), lambda match : ''),
-        (re.compile(r'.*?', re.IGNORECASE | re.DOTALL), lambda match : ''),
-        (re.compile(r'Agrandir.*?', re.IGNORECASE | re.DOTALL), lambda match : '
'),
-        ]
-
-
-    feeds = [(u'Manchettes', u'http://www.cyberpresse.ca/rss/225.xml'),(u'Capitale nationale', u'http://www.cyberpresse.ca/rss/501.xml'),(u'Opinions', u'http://www.cyberpresse.ca/rss/977.xml'),(u'Insolite', u'http://www.cyberpresse.ca/rss/279.xml')]
+
+    keep_only_tags = [dict(name='div', attrs={'class':'article-page'}),
+                      dict(name='div', attrs={'id':'articlePage'}),
+                     ]
+
+    extra_css = '''
+    .photodata{font-family:Arial,Helvetica,Verdana,sans-serif;color: #999999; font-size: 90%; }
+    h1{font-family:Georgia,Times,serif ; font-size: large; }
+    .amorce{font-family:Arial,Helvetica,Verdana,sans-serif; font-weight:bold;}
+    .article-page{font-family:Arial,Helvetica,Verdana,sans-serif; font-size: x-small;}
+    #articlePage{font-family:Arial,Helvetica,Verdana,sans-serif; font-size: x-small;}
+    .auteur{font-family:Georgia,Times,sans-serif; font-size: 90%; color:#006699 ;}
+    .bodyText{font-family:Arial,Helvetica,Verdana,sans-serif; font-size: x-small;}
+    .byLine{font-family:Arial,Helvetica,Verdana,sans-serif; font-size: 90%;}
+    .entry{font-family:Arial,Helvetica,Verdana,sans-serif; font-size: x-small;}
+    .minithumb-auteurs{font-family:Arial,Helvetica,Verdana,sans-serif; font-size: 90%; }
+    a{color:#003399; font-weight:bold; }
+    '''
+
+    remove_tags = [
+        dict(name='div', attrs={'class':['centerbar','colspan','share-module']}),
+        dict(name='p', attrs={'class':['zoom']}),
+        dict(name='ul', attrs={'class':['stories']}),
+        dict(name='h4', attrs={'class':['general-cat']}),
+        ]
+
+    feeds = [(u'Manchettes', u'http://www.cyberpresse.ca/rss/225.xml'),
+             (u'Capitale nationale', u'http://www.cyberpresse.ca/rss/501.xml'),
+             (u'Opinions', u'http://www.cyberpresse.ca/rss/977.xml'),
+             (u'Insolite', u'http://www.cyberpresse.ca/rss/279.xml')
+            ]
+    def postprocess_html(self, soup, first):
+
+        for tag in soup.findAll(name=['i','strong']):
+            tag.name = 'div'
+
+        return soup
+
+
 