mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-08-30 23:00:21 -04:00
Updated Le Monde
This commit is contained in:
parent
a48395b9db
commit
ceca434483
@ -1,106 +1,89 @@
|
|||||||
#!/usr/bin/env python
|
|
||||||
|
|
||||||
__license__ = 'GPL v3'
|
|
||||||
__copyright__ = '2008, Mathieu Godlewski <mathieu at godlewski.fr>'
|
|
||||||
'''
|
|
||||||
lemonde.fr
|
|
||||||
'''
|
|
||||||
|
|
||||||
import re
|
import re
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||||
|
|
||||||
|
|
||||||
class LeMonde(BasicNewsRecipe):
|
class LeMonde(BasicNewsRecipe):
|
||||||
title = 'LeMonde.fr'
|
title = 'Le Monde'
|
||||||
__author__ = 'Mathieu Godlewski and Sujata Raman'
|
__author__ = 'veezh'
|
||||||
description = 'Global news in french'
|
description = 'Actualités'
|
||||||
oldest_article = 3
|
oldest_article = 1
|
||||||
language = 'fr'
|
max_articles_per_feed = 100
|
||||||
|
no_stylesheets = True
|
||||||
|
#delay = 1
|
||||||
|
use_embedded_content = False
|
||||||
|
encoding = 'cp1252'
|
||||||
|
publisher = 'lemonde.fr'
|
||||||
|
language = 'fr'
|
||||||
|
conversion_options = {
|
||||||
|
'comments' : description
|
||||||
|
,'language' : language
|
||||||
|
,'publisher' : publisher
|
||||||
|
,'linearize_tables': True
|
||||||
|
}
|
||||||
|
|
||||||
max_articles_per_feed = 30
|
remove_empty_feeds = True
|
||||||
no_stylesheets = True
|
|
||||||
remove_javascript = True
|
|
||||||
|
|
||||||
|
filterDuplicates = True
|
||||||
|
|
||||||
# cover_url='http://abonnes.lemonde.fr/titresdumonde/'+date.today().strftime("%y%m%d")+'/1.jpg'
|
def preprocess_html(self, soup):
|
||||||
|
for alink in soup.findAll('a'):
|
||||||
|
if alink.string is not None:
|
||||||
|
tstr = alink.string
|
||||||
|
alink.replaceWith(tstr)
|
||||||
|
return soup
|
||||||
|
|
||||||
|
preprocess_regexps = [
|
||||||
extra_css = '''
|
(re.compile(r' \''), lambda match: ' ‘'),
|
||||||
.dateline{color:#666666;font-family:verdana,sans-serif;font-size:x-small;}
|
(re.compile(r'\''), lambda match: '’'),
|
||||||
.author{font-family:verdana,sans-serif;font-size:x-small;color:#222222;}
|
(re.compile(r'"<'), lambda match: ' »<'),
|
||||||
.articleImage{color:#666666;font-family:verdana,sans-serif;font-size:x-small;}
|
(re.compile(r'>"'), lambda match: '>« '),
|
||||||
.mainText{font-family:Georgia,serif;color:#222222;}
|
(re.compile(r'’"'), lambda match: '’« '),
|
||||||
.LM_articleText{font-family:Arial,Helvetica,sans-serif;}
|
(re.compile(r' "'), lambda match: ' « '),
|
||||||
.LM_titleZone{font-family:Arial,Helvetica,sans-serif;}
|
(re.compile(r'" '), lambda match: ' » '),
|
||||||
.mainContent{font-family:Georgia,serif;}
|
(re.compile(r'\("'), lambda match: '(« '),
|
||||||
.LM_content{font-family:Georgia,serif;}
|
(re.compile(r'"\)'), lambda match: ' »)'),
|
||||||
.LM_caption{font-family:Georgia,serif;font-size:-small;}
|
(re.compile(r'"\.'), lambda match: ' ».'),
|
||||||
.LM_imageSource{font-family:Arial,Helvetica,sans-serif;font-size:x-small;color:#666666;}
|
(re.compile(r'",'), lambda match: ' »,'),
|
||||||
h1{font-family:Arial,Helvetica,sans-serif;font-size:medium;color:#000000;}
|
(re.compile(r'"\?'), lambda match: ' »?'),
|
||||||
.post{font-family:Arial,Helvetica,sans-serif;}
|
(re.compile(r'":'), lambda match: ' »:'),
|
||||||
.mainTitle{font-family:Georgia,serif;}
|
(re.compile(r'";'), lambda match: ' »;'),
|
||||||
.content{font-family:Georgia,serif;}
|
(re.compile(r'"\!'), lambda match: ' »!'),
|
||||||
.entry{font-family:Georgia,serif;}
|
(re.compile(r' :'), lambda match: ' :'),
|
||||||
h2{font-family:Arial,Helvetica,sans-serif;font-size:large;}
|
(re.compile(r' ;'), lambda match: ' ;'),
|
||||||
small{font-family:Arial,Helvetica,sans-serif; color:#ED1B23;}
|
(re.compile(r' \?'), lambda match: ' ?'),
|
||||||
'''
|
(re.compile(r' \!'), lambda match: ' !'),
|
||||||
|
(re.compile(r'\s»'), lambda match: ' »'),
|
||||||
feeds = [
|
(re.compile(r'«\s'), lambda match: '« '),
|
||||||
('A la Une', 'http://www.lemonde.fr/rss/une.xml'),
|
(re.compile(r' %'), lambda match: ' %'),
|
||||||
('International', 'http://www.lemonde.fr/rss/sequence/0,2-3210,1-0,0.xml'),
|
(re.compile(r'\.jpg » border='), lambda match: '.jpg'),
|
||||||
('Europe', 'http://www.lemonde.fr/rss/sequence/0,2-3214,1-0,0.xml'),
|
(re.compile(r'\.png » border='), lambda match: '.png'),
|
||||||
('Societe', 'http://www.lemonde.fr/rss/sequence/0,2-3224,1-0,0.xml'),
|
|
||||||
('Economie', 'http://www.lemonde.fr/rss/sequence/0,2-3234,1-0,0.xml'),
|
|
||||||
('Medias', 'http://www.lemonde.fr/rss/sequence/0,2-3236,1-0,0.xml'),
|
|
||||||
('Rendez-vous', 'http://www.lemonde.fr/rss/sequence/0,2-3238,1-0,0.xml'),
|
|
||||||
('Sports', 'http://www.lemonde.fr/rss/sequence/0,2-3242,1-0,0.xml'),
|
|
||||||
('Planete', 'http://www.lemonde.fr/rss/sequence/0,2-3244,1-0,0.xml'),
|
|
||||||
('Culture', 'http://www.lemonde.fr/rss/sequence/0,2-3246,1-0,0.xml'),
|
|
||||||
('Technologies', 'http://www.lemonde.fr/rss/sequence/0,2-651865,1-0,0.xml'),
|
|
||||||
('Cinema', 'http://www.lemonde.fr/rss/sequence/0,2-3476,1-0,0.xml'),
|
|
||||||
('Voyages', 'http://www.lemonde.fr/rss/sequence/0,2-3546,1-0,0.xml'),
|
|
||||||
('Livres', 'http://www.lemonde.fr/rss/sequence/0,2-3260,1-0,0.xml'),
|
|
||||||
('Examens', 'http://www.lemonde.fr/rss/sequence/0,2-3404,1-0,0.xml'),
|
|
||||||
('Opinions', 'http://www.lemonde.fr/rss/sequence/0,2-3232,1-0,0.xml')
|
|
||||||
]
|
|
||||||
keep_only_tags = [dict(name='div', attrs={'id':["mainTitle","mainContent","LM_content","content"]}),
|
|
||||||
dict(name='div', attrs={'class':["post"]})
|
|
||||||
]
|
|
||||||
|
|
||||||
remove_tags = [dict(name='img', attrs={'src':'http://medias.lemonde.fr/mmpub/img/lgo/lemondefr_pet.gif'}),
|
|
||||||
dict(name='div', attrs={'id':'xiti-logo-noscript'}),
|
|
||||||
dict(name='br', attrs={}),
|
|
||||||
dict(name='iframe', attrs={}),
|
|
||||||
dict(name='table', attrs={'id':["toolBox"]}),
|
|
||||||
dict(name='table', attrs={'class':["bottomToolBox"]}),
|
|
||||||
dict(name='div', attrs={'class':["pageNavigation","LM_pagination","fenetreBoxesContainer","breakingNews","LM_toolsBottom","LM_comments","LM_tools","pave_meme_sujet_hidden","boxMemeSujet"]}),
|
|
||||||
dict(name='div', attrs={'id':["miniUne","LM_sideBar"]}),
|
|
||||||
]
|
|
||||||
|
|
||||||
preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE|re.DOTALL), i[1]) for i in
|
|
||||||
[
|
|
||||||
(r'<html.*(<div class="post".*?>.*?</div>.*?<div class="entry">.*?</div>).*You can start editing here.*</html>', lambda match : '<html><body>'+match.group(1)+'</body></html>'),
|
|
||||||
(r'<p> </p>', lambda match : ''),
|
|
||||||
(r'<img src="http://medias\.lemonde\.fr/mmpub/img/let/(.)\.gif"[^>]*><div class=ar-txt>', lambda match : '<div class=ar-txt>'+match.group(1).upper()),
|
|
||||||
(r'<img src="http://medias\.lemonde\.fr/mmpub/img/let/q(.)\.gif"[^>]*><div class=ar-txt>', lambda match : '<div class=ar-txt>"'+match.group(1).upper()),
|
|
||||||
(r'(<div class=desc><b>.*</b></div>).*</body>', lambda match : match.group(1)),
|
|
||||||
]
|
]
|
||||||
]
|
|
||||||
|
|
||||||
article_match_regexps = [ (re.compile(i)) for i in
|
keep_only_tags = [
|
||||||
[
|
dict(name='div', attrs={'class':['contenu']})
|
||||||
(r'http://www\.lemonde\.fr/\S+/article/.*'),
|
]
|
||||||
(r'http://www\.lemonde\.fr/\S+/portfolio/.*'),
|
|
||||||
(r'http://www\.lemonde\.fr/\S+/article_interactif/.*'),
|
|
||||||
(r'http://\S+\.blog\.lemonde\.fr/.*'),
|
|
||||||
]
|
|
||||||
]
|
|
||||||
|
|
||||||
# def print_version(self, url):
|
remove_tags_after = [dict(id='appel_temoignage')]
|
||||||
# return re.sub('http://www\.lemonde\.fr/.*_([0-9]+)_[0-9]+\.html.*','http://www.lemonde.fr/web/imprimer_element/0,40-0,50-\\1,0.html' ,url)
|
|
||||||
|
|
||||||
# Used to filter duplicated articles
|
def get_article_url(self, article):
|
||||||
articles_list = []
|
link = article.get('link')
|
||||||
|
if 'blog' not in link:
|
||||||
|
return link
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
feeds = [
|
||||||
|
('A la une', 'http://www.lemonde.fr/rss/une.xml'),
|
||||||
|
('International', 'http://www.lemonde.fr/rss/tag/international.xml'),
|
||||||
|
('Europe', 'http://www.lemonde.fr/rss/tag/europe.xml'),
|
||||||
|
(u'Société', 'http://www.lemonde.fr/rss/tag/societe.xml'),
|
||||||
|
('Economie', 'http://www.lemonde.fr/rss/tag/economie.xml'),
|
||||||
|
(u'Médias', 'http://www.lemonde.fr/rss/tag/actualite-medias.xml'),
|
||||||
|
(u'Planète', 'http://www.lemonde.fr/rss/tag/planete.xml'),
|
||||||
|
('Culture', 'http://www.lemonde.fr/rss/tag/culture.xml'),
|
||||||
|
('Technologies', 'http://www.lemonde.fr/rss/tag/technologies.xml'),
|
||||||
|
('Livres', 'http://www.lemonde.fr/rss/tag/livres.xml'),
|
||||||
|
|
||||||
|
]
|
||||||
|
|
||||||
def get_cover_url(self):
|
def get_cover_url(self):
|
||||||
cover_url = None
|
cover_url = None
|
||||||
@ -111,42 +94,3 @@ class LeMonde(BasicNewsRecipe):
|
|||||||
cover_url = link_item.img['src']
|
cover_url = link_item.img['src']
|
||||||
|
|
||||||
return cover_url
|
return cover_url
|
||||||
|
|
||||||
def get_article_url(self, article):
|
|
||||||
url=article.get('link', None)
|
|
||||||
url=url[0:url.find("#")]
|
|
||||||
if url in self.articles_list:
|
|
||||||
self.log_debug(_('Skipping duplicated article: %s')%url)
|
|
||||||
return False
|
|
||||||
if self.is_article_wanted(url):
|
|
||||||
self.articles_list.append(url)
|
|
||||||
if '/portfolio/' in url or '/video/' in url:
|
|
||||||
url = None
|
|
||||||
return url
|
|
||||||
self.log_debug(_('Skipping filtered article: %s')%url)
|
|
||||||
url = article.get('guid', None)
|
|
||||||
|
|
||||||
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
def is_article_wanted(self, url):
|
|
||||||
if self.article_match_regexps:
|
|
||||||
for m in self.article_match_regexps:
|
|
||||||
if m.search(url):
|
|
||||||
return True
|
|
||||||
return False
|
|
||||||
return False
|
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
|
||||||
|
|
||||||
for item in soup.findAll(style=True):
|
|
||||||
del item['style']
|
|
||||||
|
|
||||||
for item in soup.findAll(face=True):
|
|
||||||
del item['face']
|
|
||||||
for tag in soup.findAll(name=['ul','li']):
|
|
||||||
tag.name = 'div'
|
|
||||||
|
|
||||||
return soup
|
|
||||||
|
|
||||||
|
File diff suppressed because it is too large
Load Diff
Loading…
x
Reference in New Issue
Block a user