diff --git a/recipes/le_monde_sub_paper.recipe b/recipes/le_monde_sub_paper.recipe
new file mode 100644
index 0000000000..a1c621ef52
--- /dev/null
+++ b/recipes/le_monde_sub_paper.recipe
@@ -0,0 +1,175 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+__license__ = 'GPL v3'
+__copyright__ = '2012, 2013, Rémi Vanicat'
+'''
+Lemonde.fr: Version abonnée
+'''
+
+import os, zipfile, re, time
+from urllib2 import HTTPError
+from calibre.constants import preferred_encoding
+
+from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
+from calibre.ptempfile import PersistentTemporaryFile
+
+
+class LeMondeAbonne(BasicNewsRecipe):
+
+    title = u'Le Monde: Édition abonnés papier'
+    __author__ = u'Rémi Vanicat'
+    description = u'Actualités'
+    category = u'Actualités, France, Monde'
+    publisher = 'Le Monde'
+    language = 'fr'
+    needs_subscription = True
+    no_stylesheets = True
+    smarten_punctuation = True
+    remove_attributes = [
+        'border', 'cellspacing', 'display', 'align', 'cellpadding', 'colspan',
+        'valign', 'vscape', 'hspace', 'alt', 'width', 'height'
+    ]
+    extra_css = ''' li{margin:6pt 0}
+                    ul{margin:0}
+
+                    div.photo img{max-width:100%; border:0px transparent solid;}
+                    div.photo{font-family:inherit; color:#333; text-align:center;}
+                    div.photo p{text-align:justify;font-size:.9em; line-height:.9em;}
+
+                    @page{margin:10pt}
+                    .ar-txt {color:#000; text-align:justify;}
+                    h1{text-align:left; font-size:1.25em;}
+
+                    .auteur{text-align:right; font-weight:bold}
+                    .feed{text-align:right; font-weight:bold}
+                    .po-ti2{font-weight:bold}
+                    .fen-tt{font-weight:bold;font-size:1.1em}
+                    '''
+
+    zipurl_format = 'http://medias.lemonde.fr/abonnes/editionelectronique/%Y%m%d/html/%y%m%d.zip'
+    coverurl_format = '/img/%y%m%d01.jpg'
+    masthead_url = 'http://upload.wikimedia.org/wikipedia/commons/thumb/5/54/Le_monde_logo.svg/800px-Le_monde_logo.svg.png'
+    path_format = "%y%m%d"
+
+    keep_only_tags = [
+        dict(name=['h1']),
+        dict(name='div', attrs={'class': 'photo'}),
+        dict(name='div', attrs={'class': 'po-ti2'}),
+        dict(name='div', attrs={'class': 'ar-txt'}),
+        dict(name='div', attrs={'class': 'po_rtcol'})
+    ]
+
+    remove_tags = [
+        dict(name='div', attrs={'class': 'po-ti'}),
+        dict(name='div', attrs={'class': 'po-copy'})
+    ]
+
+    article_id_pattern = re.compile(r"[0-9]+\.html")
+    article_url_format = 'http://www.lemonde.fr/journalelectronique/donnees/protege/%Y%m%d/html/'
+
+    def get_browser(self):
+        # Log in on the subscriber site so the edition zip can be downloaded.
+        br = BasicNewsRecipe.get_browser(self)
+        br.open('https://secure.lemonde.fr/sfuser/connexion')
+        br.select_form(name='connection')
+        br['connection[mail]'] = self.username
+        br['connection[password]'] = self.password
+        br.submit()
+        return br
+
+    decalage = 24 * 60 * 60  # today's Le Monde carries tomorrow's date
+
+    def get_cover_url(self):
+        url = time.strftime(self.coverurl_format, self.ltime)
+        return self.articles_path + url
+
+    def parse_index(self):
+        browser = self.get_browser()
+
+        second = time.time()
+        second += self.decalage
+
+        # Walk back up to one week until an available edition is found.
+        for i in range(7):
+            self.ltime = time.gmtime(second)
+            self.timefmt = time.strftime(" %A %d %B %Y",
+                                         self.ltime).decode(preferred_encoding)
+            url = time.strftime(self.zipurl_format, self.ltime)
+            try:
+                response = browser.open(url)
+                break
+            except HTTPError:
+                second -= 24 * 60 * 60
+        else:
+            raise ValueError('No edition of Le Monde found for the last seven days')
+
+        tmp = PersistentTemporaryFile(suffix='.zip')
+        self.report_progress(0.1, _('downloading zip file'))
+        tmp.write(response.read())
+        tmp.close()
+
+        zfile = zipfile.ZipFile(tmp.name, 'r')
+        self.report_progress(0.1, _('extracting zip file'))
+
+        zfile.extractall(self.output_dir)
+        zfile.close()
+
+        path = os.path.join(
+            self.output_dir, time.strftime(self.path_format, self.ltime), "data"
+        )
+
+        self.articles_path = path
+
+        files = os.listdir(path)
+
+        nb_index_files = len([
+            name for name in files if re.match(r"frame_gauche_[0-9]+\.html", name)
+        ])
+
+        flux = []
+
+        article_url = time.strftime(self.article_url_format, self.ltime)
+
+        # Map the raw section names found in the edition files to readable titles.
+        section_titles = {
+            u"Une": u"À la une",
+            u"Evenement": u"L'événement",
+            u"Planete": u"Planète",
+            u"Economie - Entreprises": u"Économie",
+            u"L'Oeil du Monde": u"L'œil du Monde",
+            u"Enquete": u"Enquête",
+            u"Editorial - Analyses": u"Analyses",
+            u"Le Monde Economie": u"Économie",
+            u"Le Monde Culture et idées": u"Idées",
+            u"Le Monde Géo et politique": u"Géopolitique",
+        }
+
+        for i in range(nb_index_files):
+            filename = os.path.join(path, "selection_%d.html" % (i + 1))
+            tmp = open(filename, 'r')
+            soup = BeautifulSoup(tmp, convertEntities=BeautifulSoup.HTML_ENTITIES)
+            title = soup.find('span').contents[0] or 'Unknown'
+            title = section_titles.get(title, title)
+            tmp.close()
+
+            filename = os.path.join(path, "frame_gauche_%d.html" % (i + 1))
+            tmp = open(filename, 'r')
+            soup = BeautifulSoup(tmp)
+            articles = []
+            for link in soup.findAll("a"):
+                article_file = link['href']
+                match = self.article_id_pattern.search(article_file)
+                if match is None:
+                    continue
+                article_id = match.group()
+                article = {
+                    'title': link.contents[0],
+                    'url': article_url + article_id,
+                    'description': '',
+                    'content': ''
+                }
+                articles.append(article)
+            tmp.close()
+
+            flux.append((title, articles))
+
+        return flux