diff --git a/recipes/le_monde_sub.recipe b/recipes/le_monde_sub.recipe
index 56156166dc..dc9fa9d36f 100644
--- a/recipes/le_monde_sub.recipe
+++ b/recipes/le_monde_sub.recipe
@@ -1,166 +1,94 @@
-#!/usr/bin/env python
-
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
+__author__ = 'Sylvain Durand <sylvain.durand@ponts.org>'
 __license__ = 'GPL v3'
-__copyright__ = '2012, 2013, Rémi Vanicat <vanicat@debian.org>'
-'''
-Lemonde.fr: Version abonnée
-'''
+import time
 
-import os, zipfile, re, time
-from urllib2 import HTTPError
-from calibre.constants import preferred_encoding
-
+from calibre import strftime
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from calibre.ptempfile import PersistentTemporaryFile
+from urllib2 import HTTPError
 
-class LeMondeAbonne(BasicNewsRecipe):
+class LeMonde(BasicNewsRecipe):
 
-    title = u'Le Monde: Édition abonnés'
-    __author__ = u'Rémi Vanicat'
-    description = u'Actualités'
-    category = u'Actualités, France, Monde'
-    publisher = 'Le Monde'
-    language = 'fr'
-    needs_subscription = True
-    no_stylesheets = True
-    smarten_punctuation = True
-    remove_attributes = ['border', 'cellspacing', 'display', 'align', 'cellpadding', 'colspan', 'valign', 'vscape', 'hspace', 'alt', 'width', 'height']
-    extra_css = ''' li{margin:6pt 0}
-                    ul{margin:0}
+    title = u'Le Monde: Édition abonnés'
+    __author__ = 'Sylvain Durand'
+    description = u'Disponible du lundi au samedi à partir de 14 heures environ, avec tous ses cahiers.'
+    language = 'fr'
+    encoding = 'utf8'
 
-                    div.photo img{max-width:100%; border:0px transparent solid;}
-                    div.photo{font-family:inherit; color:#333; text-align:center;}
-                    div.photo p{text-align:justify;font-size:.9em; line-height:.9em;}
+    needs_subscription = True
 
-                    @page{margin:10pt}
-                    .ar-txt {color:#000; text-align:justify;}
-                    h1{text-align:left; font-size:1.25em;}
+    date_url = 'http://www.lemonde.fr/journalelectronique/donnees/libre/%Y%m%d/index.html'
+    login_url = 'http://www.lemonde.fr/web/journal_electronique/identification/1,56-0,45-0,0.html'
+    journal_url = 'http://www.lemonde.fr/journalelectronique/donnees/protege/%Y%m%d/%Y%m%d_ipad.xml'
+    masthead_url = 'http://upload.wikimedia.org/wikipedia/fr/thumb/c/c5/Le_Monde_logo.svg/300px-Le_Monde_logo.svg.png'
+    couverture_url = 'http://medias.lemonde.fr/abonnes/editionelectronique/%Y%m%d/html/data/img/%y%m%d01.jpg'
 
-                    .auteur{text-align:right; font-weight:bold}
-                    .feed{text-align:right; font-weight:bold}
-                    .po-ti2{font-weight:bold}
-                    .fen-tt{font-weight:bold;font-size:1.1em}
-                    '''
+    extra_css = '''
+                    img{max-width:100%}
+                    h1{font-size:1.2em !important; line-height:1.2em !important; }
+                    h2{font-size:1em !important; line-height:1em !important; }
+                    h3{font-size:1em !important; text-transform:uppercase !important; color:#666;}
+                    #photo{text-align:center !important; margin:10px 0 -8px;}
+                    #lgd{font-size:1em !important; line-height:1em !important; font-style:italic; color:#333;} '''
 
-    zipurl_format = 'http://medias.lemonde.fr/abonnes/editionelectronique/%Y%m%d/html/%y%m%d.zip'
-    coverurl_format = '/img/%y%m%d01.jpg'
-    path_format = "%y%m%d"
-    login_url = 'http://www.lemonde.fr/web/journal_electronique/identification/1,56-0,45-0,0.html'
+    keep_only_tags = [dict(name=['h1','h2','h3','div','txt'])]
 
-    keep_only_tags = [dict(name=['h1']), dict(name='div', attrs={'class': 'photo'}), dict(name='div', attrs={'class': 'po-ti2'}), dict(name='div', attrs={'class': 'ar-txt'}), dict(name='div', attrs={'class': 'po_rtcol'})]
-
-    remove_tags = [dict(name='div', attrs={'class': 'po-ti'}), dict(name='div', attrs={'class': 'po-copy'})]
-
-    article_id_pattern = re.compile("[0-9]+\\.html")
-    article_url_format = 'http://www.lemonde.fr/journalelectronique/donnees/protege/%Y%m%d/html/'
+    def __init__(self, options, log, progress_reporter):
+        BasicNewsRecipe.__init__(self, options, log, progress_reporter)
+        br = BasicNewsRecipe.get_browser(self)
+        second = time.time() + 24*60*60
+        for i in range(7):
+            self.date = time.gmtime(second)
+            try:
+                br.open(time.strftime(self.date_url, self.date))
+                break
+            except HTTPError:
+                second -= 24*60*60
+        self.timefmt = strftime(u" %A %d %B %Y", self.date).replace(u' 0', u' ')
 
     def get_browser(self):
         br = BasicNewsRecipe.get_browser(self)
-        if self.username is not None and self.password is not None:
-            br.open(self.login_url)
-            br.select_form(nr=0)
-            br['login'] = self.username
-            br['password'] = self.password
-            br.submit()
+        br.open(self.login_url)
+        br.select_form(nr=0)
+        br['login'] = self.username
+        br['password'] = self.password
+        br.submit()
         return br
 
-    decalage = 24 * 60 * 60  # today's Monde carries tomorrow's date
-
-    def get_cover_url(self):
-        url = time.strftime(self.coverurl_format, self.ltime)
-        return self.articles_path + url
+    def get_cover_url(self):
+        url = time.strftime(self.couverture_url, self.date)
+        return url
 
     def parse_index(self):
-        browser = self.get_browser()
-
-        second = time.time()
-        second += self.decalage
-
-        for i in range(7):
-            self.ltime = time.gmtime(second)
-            self.timefmt = time.strftime(" %A %d %B %Y", self.ltime).decode(preferred_encoding)
-            url = time.strftime(self.zipurl_format, self.ltime)
-            try:
-                response = browser.open(url)
-                continue
-            except HTTPError:
-                second -= 24*60*60
-
-        tmp = PersistentTemporaryFile(suffix='.zip')
-        self.report_progress(0.1, _('downloading zip file'))
-        tmp.write(response.read())
-        tmp.close()
-
-        zfile = zipfile.ZipFile(tmp.name, 'r')
-        self.report_progress(0.1, _('extracting zip file'))
-
-        zfile.extractall(self.output_dir)
-        zfile.close()
-
-        path = os.path.join(self.output_dir, time.strftime(self.path_format, self.ltime), "data")
-
-        self.articles_path = path
-
-        files = os.listdir(path)
-
-        nb_index_files = len([name for name in files if re.match("frame_gauche_[0-9]+.html", name)])
-
-        flux = []
-
-        article_url = time.strftime(self.article_url_format, self.ltime)
-
-        for i in range(nb_index_files):
-            filename = os.path.join(path, "selection_%d.html" % (i + 1))
-            tmp = open(filename, 'r')
-            soup = BeautifulSoup(tmp, convertEntities=BeautifulSoup.HTML_ENTITIES)
-            title = soup.find('span').contents[0]
-            if title == "Une":
-                title = "À la une"
-            if title == "Evenement":
-                title = "L'événement"
-            if title == "Planete":
-                title = "Planète"
-            if title == "Economie - Entreprises":
-                title = "Économie"
-            if title == "L'Oeil du Monde":
-                title = "L'œil du Monde"
-            if title == "Enquete":
-                title = "Enquête"
-            if title == "Editorial - Analyses":
-                title = "Analyses"
-            if title == "Le Monde Economie":
-                title = "Économie"
-            if title == "Le Monde Culture et idées":
-                title = "Idées"
-            if title == "Le Monde Géo et politique":
-                title = "Géopolitique"
-            tmp.close()
-
-            filename = os.path.join(path, "frame_gauche_%d.html" % (i + 1))
-            tmp = open(filename, 'r')
-            soup = BeautifulSoup(tmp)
+        url = time.strftime(self.journal_url, self.date)
+        soup = self.index_to_soup(url).sommaire
+        sections = []
+        for sec in soup.findAll("section"):
             articles = []
-            for link in soup.findAll("a"):
-                article_file = link['href']
-                article_id = self.article_id_pattern.search(article_file).group()
-                article = {
-                    'title': link.contents[0],
-                    'url': article_url + article_id,
-                    'description': '',
-                    'content': ''
-                }
-                articles.append(article)
-            tmp.close()
+            if sec['cahier'] != "Le Monde":
+                for col in sec.findAll("fnts"):
+                    col.extract()
+            if sec['cahier'] == "Le Monde Magazine":
+                continue
+            for art in sec.findAll("art"):
+                if art.txt.string and art.ttr.string:
+                    if art.find(['url']):
+                        art.insert(6, '<div id="photo"><img src="' + art.find(['url']).string + '" /></div>')
+                    if art.find(['lgd']) and art.find(['lgd']).string:
+                        art.insert(7, '<div id="lgd">' + art.find(['lgd']).string + '</div>')
+                    article = "<html><head></head><body>" + unicode(art) + "</body></html>"
+                    article = article.replace('<![CDATA[', '').replace(']]>', '').replace(' oC ', '°C ')
+                    article = article.replace('srttr>', 'h3>').replace('ssttr>', 'h2>').replace('ttr>', 'h1>')
+                    f = PersistentTemporaryFile()
+                    f.write(article)
+                    articles.append({'title': art.ttr.string, 'url': "file:///" + f.name})
+            sections.append((sec['nom'], articles))
+        return sections
 
-            flux.append((title, articles))
+    def preprocess_html(self, soup):
+        for lgd in soup.findAll(id="lgd"):
+            lgd.contents[-1].extract()
+        return soup
 
-        return flux
-
-
-
-# Local Variables:
-# mode: python
-# End: