# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
__author__ = 'Sylvain Durand '
__license__ = 'GPL v3'

import re
import time

from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile
from urllib2 import HTTPError


class LeMonde(BasicNewsRecipe):
    """Calibre recipe for the subscriber (paper) edition of Le Monde.

    The recipe looks for the most recent published edition, logs in with
    the subscriber's credentials, downloads the edition's XML index and
    converts every article of that index into a small local HTML file
    that calibre then fetches through a ``file:///`` URL.
    """

    title = u'Le Monde: Édition abonnés'
    __author__ = 'Sylvain Durand'
    description = u'La version papier du quotidien Le Monde, disponible du lundi au samedi à partir de 14 heures environ, avec tous ses cahiers.'
    language = 'fr'
    encoding = 'utf8'

    needs_subscription = True

    # All URL templates below are expanded with time.strftime() using
    # self.date, the edition date selected in __init__.
    date_url = 'http://www.lemonde.fr/journalelectronique/donnees/libre/%Y%m%d/index.html'
    login_url = 'http://www.lemonde.fr/web/journal_electronique/identification/1,56-0,45-0,0.html'
    journal_url = 'http://www.lemonde.fr/journalelectronique/donnees/protege/%Y%m%d/%Y%m%d_ipad.xml'
    masthead_url = 'http://upload.wikimedia.org/wikipedia/fr/thumb/c/c5/Le_Monde_logo.svg/800px-Le_Monde_logo.svg.png'
    couverture_url = 'http://medias.lemonde.fr/abonnes/editionelectronique/%Y%m%d/html/data/img/%y%m%d01.jpg'

    extra_css = '''
        img{max-width:100%}
        h1{font-size:1.2em !important; line-height:1.2em !important; }
        h2{font-size:1em !important; line-height:1em !important; }
        h3{font-size:1em !important; text-transform:uppercase !important; color:#666;}
        #photo{text-align:center !important; margin:10px 0 -8px;}
        #lgd{font-size:1em !important; line-height:1em !important; font-style:italic; color:#333;}
    '''

    keep_only_tags = [dict(name=['h1', 'h2', 'h3', 'div', 'txt'])]

    # Number of seconds in a day, used to step from one edition to the previous one.
    _ONE_DAY = 24 * 60 * 60

    # A double-quoted run together with the character that precedes it
    # (or the start of the string); see _guillemets().
    _QUOTED = re.compile('(.|^)"([^"]+)"')

    def __init__(self, options, log, progress_reporter):
        """Initialise the recipe and select the edition date.

        Starting from tomorrow (UTC) and walking back one day at a time,
        at most seven dates are probed; the first one whose ``date_url``
        opens without an ``HTTPError`` is kept in ``self.date``.  If every
        probe fails, ``self.date`` remains the last date tried.
        """
        BasicNewsRecipe.__init__(self, options, log, progress_reporter)
        # Call the base-class browser explicitly: the get_browser() override
        # of this class submits the login form, which is not wanted here.
        br = BasicNewsRecipe.get_browser(self)
        second = time.time() + self._ONE_DAY
        for _ in range(7):
            self.date = time.gmtime(second)
            try:
                br.open(time.strftime(self.date_url, self.date))
                break
            except HTTPError:
                second -= self._ONE_DAY
        # Drop the leading zero of the day number (" 05 " -> " 5 ").
        self.timefmt = strftime(u" %A %d %B %Y", self.date).replace(u' 0', u' ')

    def get_browser(self):
        """Return a browser on which the subscriber login form was submitted."""
        br = BasicNewsRecipe.get_browser(self)
        br.open(self.login_url)
        br.select_form(nr=0)
        br['login'] = self.username
        br['password'] = self.password
        br.submit()
        return br

    def get_cover_url(self):
        """Return the cover image URL for the selected edition date."""
        return time.strftime(self.couverture_url, self.date)

    @staticmethod
    def _guillemets(match):
        """Turn a straight double-quoted run into French « guillemets ».

        A run immediately preceded by ``=`` is returned unchanged, since it
        is the value of a markup attribute rather than quoted text.
        """
        if match.group(1) == u"=":
            return match.group(0)
        return u'%s« %s »' % (match.group(1), match.group(2))

    def _save_article(self, art):
        """Convert one ``art`` element to HTML and return the temp file path.

        The photo and its caption, when present, are injected as ``div``
        blocks with the ids ``photo`` and ``lgd`` that ``extra_css`` and
        ``preprocess_html`` refer to.
        """
        url = art.find(['url'])
        if url and url.string:
            art.insert(6, '<div id="photo"><img src="' + url.string + '" /></div>')
        lgd = art.find(['lgd'])
        if lgd and lgd.string:
            art.insert(7, '<div id="lgd">' + lgd.string + '</div>')

        article = u"<html><head></head><body>" + unicode(art) + u"</body></html>"
        # Remove the CDATA delimiters so that only their content remains.
        article = article.replace(u'<![CDATA[', u'').replace(u']]>', u'')
        # Presumably the feed spells the degree sign as a plain "o".
        article = article.replace(u' oC ', u'°C ')
        # Rename the feed's title tags to headings.  Order matters: 'ttr>'
        # is a suffix of 'srttr>' and 'ssttr>', so it must be replaced last.
        article = article.replace('srttr>', 'h3>').replace('ssttr>', 'h2>').replace('ttr>', 'h1>')
        # Typographic apostrophes and French quotation marks.
        article = article.replace("'", u'\u2019')
        article = self._QUOTED.sub(self._guillemets, article)

        f = PersistentTemporaryFile()
        try:
            # The temp file is binary; encode explicitly rather than rely
            # on the interpreter's default encoding.
            f.write(article.encode('utf-8'))
        finally:
            # Close now so the content is on disk before the file is fetched.
            f.close()
        return f.name

    def parse_index(self):
        """Build the list of ``(section name, articles)`` for the edition.

        Each article is a dict with a ``title`` and a ``file:///`` ``url``
        pointing at the HTML produced by ``_save_article``.  The section
        "Le Monde Magazine" is skipped, and ``fnts`` elements are removed
        from every section other than "Le Monde".

        An ``AttributeError`` raised while walking the index (for instance
        when the index has no ``sommaire`` element) is logged as a
        credentials/subscription problem and the sections gathered so far
        are returned.
        """
        url = time.strftime(self.journal_url, self.date)
        soup = self.index_to_soup(url).sommaire
        sections = []
        try:
            for sec in soup.findAll("section"):
                articles = []
                if sec['cahier'] != "Le Monde":
                    for col in sec.findAll("fnts"):
                        col.extract()
                if sec['cahier'] == "Le Monde Magazine":
                    continue
                for art in sec.findAll("art"):
                    # Only articles that have both a body and a title are kept.
                    if art.txt.string and art.ttr.string:
                        path = self._save_article(art)
                        articles.append({'title': art.ttr.string, 'url': "file:///" + path})
                sections.append((sec['nom'], articles))
        except AttributeError:
            self.log("Vos identifiants sont incorrects, ou votre abonnement LeMonde.fr ne vous permet pas de télécharger le journal.")
        return sections

    def preprocess_html(self, soup):
        """Remove the last child node of every element whose id is ``lgd``."""
        for lgd in soup.findAll(id="lgd"):
            lgd.contents[-1].extract()
        return soup