diff --git a/recipes/le_monde.recipe b/recipes/le_monde.recipe index c597f32586..a443c254c8 100644 --- a/recipes/le_monde.recipe +++ b/recipes/le_monde.recipe @@ -42,6 +42,8 @@ class LeMonde(BasicNewsRecipe): def preprocess_html(self, soup): for lgd in soup.findAll(id="lgd"): lgd.contents[-1].extract() + for img in soup.findAll('img', attrs={'data-src': True}): + img['src'] = img['data-src'] return soup def get_article_url(self, article): diff --git a/recipes/le_monde_sub.recipe b/recipes/le_monde_sub.recipe index 2b0f264d1e..a045e6cd27 100644 --- a/recipes/le_monde_sub.recipe +++ b/recipes/le_monde_sub.recipe @@ -2,13 +2,7 @@ __author__ = 'S. Durand ' __license__ = 'GPL v3' -import time -import re - -from calibre import strftime from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ptempfile import PersistentTemporaryFile -from urllib2 import HTTPError def classes(classes): @@ -27,11 +21,7 @@ class LeMonde(BasicNewsRecipe): needs_subscription = True - date_url = 'http://www.lemonde.fr/journalelectronique/donnees/libre/%Y%m%d/index.html' login_url = 'https://www.lemonde.fr/web/journal_electronique/identification/1,56-0,45-0,0.html' - journal_url = 'http://www.lemonde.fr/journalelectronique/donnees/protege/%Y%m%d/%Y%m%d_ipad.xml' - masthead_url = 'http://upload.wikimedia.org/wikipedia/fr/thumb/c/c5/Le_Monde_logo.svg/800px-Le_Monde_logo.svg.png' - couverture_url = 'http://medias.lemonde.fr/abonnes/editionelectronique/%Y%m%d/html/data/img/%y%m%d01.jpg' extra_css = ''' img{max-width:100%} @@ -47,19 +37,19 @@ class LeMonde(BasicNewsRecipe): dict(itemprop=['articleBody']), ] - def __init__(self, options, log, progress_reporter): - BasicNewsRecipe.__init__(self, options, log, progress_reporter) - br = BasicNewsRecipe.get_browser(self) - second = time.time() + 24 * 60 * 60 - for i in range(7): - self.date = time.gmtime(second) - try: - br.open(time.strftime(self.date_url, self.date)) - break - except HTTPError: - second -= 24 * 60 * 60 - self.timefmt = strftime( - u" %A %d %B %Y", self.date).replace(u' 0', u' ') + lm_sections = [ + 'international:International', + 'politique:Politique', + 'societe:Société', + 'economie:Éco', + 'culture:Culture', + 'idees:Idées', + 'planete:Planète', + 'sport:Sport', + 'sciences:Sciences', + 'pixels:Pixels', + 'campus:Campus' + ] def get_browser(self): br = BasicNewsRecipe.get_browser(self) @@ -71,55 +61,55 @@ class LeMonde(BasicNewsRecipe): return br def get_cover_url(self): - url = time.strftime(self.couverture_url, self.date) - return url + cover_url = None + soup = self.index_to_soup( + 'http://www.lemonde.fr/web/monde_pdf/0,33-0,1-0,0.html') + link_item = soup.find('div', attrs={'class': 'pg-gch'}) - def parse_index(self): - url = time.strftime(self.journal_url, self.date) - soup = self.index_to_soup(url).sommaire - sections = [] - try: - for sec in soup.findAll("section"): - articles = [] - if sec['cahier'] != "Le Monde": - for col in sec.findAll("fnts"): - col.extract() - if sec['cahier'] == "Le Monde Magazine": - continue - for art in sec.findAll("art"): - if art.txt.string and art.ttr.string: - if art.find(['url']): - art.insert(6, '
') - if art.find(['lgd']) and art.find(['lgd']).string: - art.insert(7, '
' + - art.find(['lgd']).string + '
') + if link_item and link_item.img: + cover_url = link_item.img['src'] - def guillemets(match): - if match.group(1) == u"=": - return match.group(0) - return u'%s« %s »' % (match.group(1), match.group(2)) - - article = "" + \ - unicode(art) + "" - article = article.replace( - '', '').replace(' oC ', '°C ') - article = article.replace('srttr>', 'h3>').replace( - 'ssttr>', 'h2>').replace('ttr>', 'h1>') - article = article.replace("'", u'\u2019') - article = re.sub('(.|^)"([^"]+)"', guillemets, article) - - f = PersistentTemporaryFile() - f.write(article) - articles.append( - {'title': art.ttr.string, 'url': "file:///" + f.name}) - sections.append((sec['nom'], articles)) - except AttributeError: - self.log( - "Vos identifiants sont incorrects, ou votre abonnement LeMonde.fr ne vous permet pas de télécharger le journal.") - return sections + return cover_url def preprocess_html(self, soup): for lgd in soup.findAll(id="lgd"): lgd.contents[-1].extract() + for img in soup.findAll('img', attrs={'data-src': True}): + img['src'] = img['data-src'] return soup + + def parse_index(self): + ans = [] + for x in self.lm_sections: + s, section_title = x.partition(':')[::2] + self.log('Processing section', section_title, '...') + articles = list(self.parse_section('http://www.lemonde.fr/%s/' % s)) + if articles: + ans.append((section_title, articles)) + return ans + + def parse_section(self, url): + soup = self.index_to_soup(url) + container = soup.find(attrs={'class':lambda x: x and 'grid_12 alpha' in x}) + for article in container.findAll('article'): + h2 = article.find('h2') + if h2 is None: + h2 = article.find('h3') + if h2 is None: + continue + a = h2.find('a', href=True) + if a is None: + a = h2.findParents('a', href=True) + if not a: + continue + a = a[0] + url = a['href'] + if url.startswith('/'): + url = 'http://www.lemonde.fr' + url + title = self.tag_to_string(a) + desc = '' + p = article.find('p') + if p is not None: + desc = self.tag_to_string(p) + self.log('\tFound article', title, 'at', url) + yield {'title': title, 'url': url, 'description': desc}