calibre/recipes/le_monde.recipe

__license__ = 'GPL v3'
__copyright__ = '2012'
'''
lemonde.fr
'''
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
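

# News recipe for the French daily Le Monde (lemonde.fr): articles are pulled
# from the site's RSS feeds and the downloaded HTML is reworked for French
# typography before conversion.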
class LeMonde(BasicNewsRecipe):
    title = 'Le Monde'
    __author__ = 'veezh'
    description = u'Actualités'
    oldest_article = 1
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'utf-8'
    publisher = 'lemonde.fr'
    category = 'news, France, world'
    language = 'fr'
    extra_css = '''
        h1{font-size:130%;}
        h2{font-size:100%;}
        blockquote.aside {background-color: #DDD; padding: 0.5em;}
        .ariane{font-size:xx-small;}
        .source{font-size:xx-small;}
        /*.href{font-size:xx-small;}*/
        /*.figcaption style{color:#666666; font-size:x-small;}*/
        /*.main-article-info{font-family:Arial,Helvetica,sans-serif;}*/
        /*full-contents{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;}*/
        /*match-stats-summary{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;}*/
    '''

    conversion_options = {
        'comments': description, 'tags': category, 'language': language, 'publisher': publisher, 'linearize_tables': True
    }

    remove_empty_feeds = True
    filterDuplicates = True
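
    # Turn <aside> boxes into styled blockquotes and unwrap <a> tags that
    # contain only text, keeping just the text so articles read cleanly offline.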
    def preprocess_html(self, soup):
        for aside in soup.findAll('aside'):
            aside.name = 'blockquote'
            aside['class'] = "aside"
        for alink in soup.findAll('a'):
            if alink.string is not None:
                tstr = alink.string
                alink.replaceWith(tstr)
        return soup
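
    # Substitutions applied to the raw HTML before parsing: straight quotes
    # become French guillemets, the non-breaking spaces French typography
    # expects are inserted before %, :, ;, ? and !, digits in large numbers
    # are regrouped, and <img> width attributes broken by the quote
    # substitution are repaired.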
    preprocess_regexps = [
        (re.compile(r'([0-9])%'), lambda m: m.group(1) + ' %'),
        (re.compile(r'([0-9])([0-9])([0-9]) ([0-9])([0-9])([0-9])'), lambda m: m.group(
            1) + m.group(2) + m.group(3) + ' ' + m.group(4) + m.group(5) + m.group(6)),
        (re.compile(r'([0-9]) ([0-9])([0-9])([0-9])'), lambda m: m.group(1) +
         ' ' + m.group(2) + m.group(3) + m.group(4)),
        (re.compile(r'<span>'), lambda match: ' <span>'),
        (re.compile(r'\("'), lambda match: '(&laquo;&nbsp;'),
        (re.compile(r'"\)'), lambda match: '&nbsp;&raquo;)'),
        (re.compile(r'&ldquo;'), lambda match: '(&laquo;&nbsp;'),
        (re.compile(r'&rdquo;'), lambda match: '&nbsp;&raquo;)'),
        (re.compile(r'>\''), lambda match: '>&lsquo;'),
        (re.compile(r' \''), lambda match: ' &lsquo;'),
        (re.compile(r' &quot;'), lambda match: ' &laquo;&nbsp;'),
        (re.compile(r'>&quot;'), lambda match: '>&laquo;&nbsp;'),
        (re.compile(r'&quot;<'), lambda match: '&nbsp;&raquo;<'),
        (re.compile(r'&quot; '), lambda match: '&nbsp;&raquo; '),
        (re.compile(r'&quot;,'), lambda match: '&nbsp;&raquo;,'),
        (re.compile(r'\''), lambda match: '&rsquo;'),
        (re.compile(r'"<em>'), lambda match: '<em>&laquo;&nbsp;'),
        (re.compile(r'"<em>"</em><em>'), lambda match: '<em>&laquo;&nbsp;'),
        (re.compile(r'"<a href='), lambda match: '&laquo;&nbsp;<a href='),
        (re.compile(r'</em>"'), lambda match: '&nbsp;&raquo;</em>'),
        (re.compile(r'</a>"'), lambda match: '&nbsp;&raquo;</a>'),
        (re.compile(r'"</'), lambda match: '&nbsp;&raquo;</'),
        (re.compile(r'>"'), lambda match: '>&laquo;&nbsp;'),
        (re.compile(r'"<'), lambda match: '&nbsp;&raquo;<'),
        (re.compile(r'&rsquo;"'), lambda match: '&rsquo;«&nbsp;'),
        (re.compile(r' "'), lambda match: ' &laquo;&nbsp;'),
        (re.compile(r'" '), lambda match: '&nbsp;&raquo; '),
        (re.compile(r'"\.'), lambda match: '&nbsp;&raquo;.'),
        (re.compile(r'",'), lambda match: '&nbsp;&raquo;,'),
        (re.compile(r'"\?'), lambda match: '&nbsp;&raquo;?'),
        (re.compile(r'":'), lambda match: '&nbsp;&raquo;:'),
        (re.compile(r'";'), lambda match: '&nbsp;&raquo;;'),
        (re.compile(r'"\!'), lambda match: '&nbsp;&raquo;!'),
        (re.compile(r' :'), lambda match: '&nbsp;:'),
        (re.compile(r' ;'), lambda match: '&nbsp;;'),
        (re.compile(r' \?'), lambda match: '&nbsp;?'),
        (re.compile(r' \!'), lambda match: '&nbsp;!'),
        (re.compile(r'\s»'), lambda match: '&nbsp;»'),
        (re.compile(r'«\s'), lambda match: '«&nbsp;'),
        (re.compile(r' %'), lambda match: '&nbsp;%'),
        (re.compile(r'\.jpg&nbsp;&raquo; width='), lambda match: '.jpg'),
        (re.compile(r'\.png&nbsp;&raquo; width='), lambda match: '.png'),
        (re.compile(r' &ndash; '), lambda match: '&nbsp;&ndash; '),
        (re.compile(r'figcaption style="display:none"'), lambda match: 'figcaption'),
        (re.compile(r' – '), lambda match: '&nbsp;&ndash; '),
        (re.compile(r' - '), lambda match: '&nbsp;&ndash; '),
        (re.compile(r' -,'), lambda match: '&nbsp;&ndash;,'),
        (re.compile(r'&raquo;:'), lambda match: '&raquo;&nbsp;:'),
    ]
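
    # Keep only the <article> element and strip related-article boxes,
    # social-sharing widgets, same-topic blocks and "Lire" cross-references;
    # everything after the Facebook Like button is dropped as well.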
    keep_only_tags = [
        dict(name='article', attrs={})
    ]

    remove_tags = [
        dict(attrs={'class': ['rubriques_liees']}),
        dict(attrs={'class': ['sociaux']}),
        dict(attrs={'class': ['bloc_base meme_sujet']}),
        dict(name='p', attrs={'class': ['lire']})
    ]

    remove_tags_after = [dict(id='fb-like')]
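
    # Articles are fetched via their RSS GUID; chats, blogs, videos, sport,
    # portfolios and interactive "visuel" pages are skipped.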
    def get_article_url(self, article):
        url = article.get('guid', None)
        if url and ('/chat/' in url or '.blog' in url or '/video/' in url or '/sport/' in url or '/portfolio/' in url or '/visuel/' in url):
            url = None
        return url

    # def get_article_url(self, article):
    #     link = article.get('link')
    #     if 'blog' not in link and ('chat' not in link):
    #         return link
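
    # One RSS feed per section of the site.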
    feeds = [
        ('A la une', 'http://www.lemonde.fr/rss/une.xml'),
        ('International', 'http://www.lemonde.fr/rss/tag/international.xml'),
        ('Europe', 'http://www.lemonde.fr/rss/tag/europe.xml'),
        (u'Société', 'http://www.lemonde.fr/rss/tag/societe.xml'),
        ('Economie', 'http://www.lemonde.fr/rss/tag/economie.xml'),
        (u'Médias', 'http://www.lemonde.fr/rss/tag/actualite-medias.xml'),
        (u'Planète', 'http://www.lemonde.fr/rss/tag/planete.xml'),
        ('Culture', 'http://www.lemonde.fr/rss/tag/culture.xml'),
        ('Technologies', 'http://www.lemonde.fr/rss/tag/technologies.xml'),
        ('Livres', 'http://www.lemonde.fr/rss/tag/livres.xml'),
    ]
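
    # Use the front-page image from Le Monde's PDF edition page as the cover.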
    def get_cover_url(self):
        cover_url = None
        soup = self.index_to_soup(
            'http://www.lemonde.fr/web/monde_pdf/0,33-0,1-0,0.html')
        link_item = soup.find('div', attrs={'class': 'pg-gch'})
        if link_item and link_item.img:
            cover_url = link_item.img['src']
        return cover_url
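
# A quick way to try this recipe from a local checkout (assumes calibre's
# ebook-convert tool is on PATH; --test downloads only a couple of articles
# per feed):
#     ebook-convert le_monde.recipe .epub --test -vv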