Update Le Monde (subscription version)

Kovid Goyal 2013-02-13 10:09:18 +05:30
parent 105fa1e779
commit 6aaee7085a


@@ -1,166 +1,94 @@
 #!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
+__author__ = 'Sylvain Durand <sylvain.durand@ponts.org>'
 __license__ = 'GPL v3'
 __copyright__ = '2012, 2013, Rémi Vanicat <vanicat at debian.org>'
 '''
 Lemonde.fr: Version abonnée
 '''
 
-import os, zipfile, re, time
-from calibre.constants import preferred_encoding
+import time
+from urllib2 import HTTPError
+from calibre import strftime
 from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from calibre.ptempfile import PersistentTemporaryFile
-from urllib2 import HTTPError
 
-class LeMondeAbonne(BasicNewsRecipe):
+class LeMonde(BasicNewsRecipe):
 
-    title = u'Le Monde: Édition abonnés'
-    __author__ = u'Rémi Vanicat'
-    description = u'Actualités'
-    category = u'Actualités, France, Monde'
-    publisher = 'Le Monde'
-    language = 'fr'
-    needs_subscription = True
-    no_stylesheets = True
-    smarten_punctuation = True
-    remove_attributes = [ 'border', 'cellspacing', 'display', 'align', 'cellpadding', 'colspan', 'valign', 'vscape', 'hspace', 'alt', 'width', 'height']
-    extra_css = ''' li{margin:6pt 0}
-                    ul{margin:0}
-                    div.photo img{max-width:100%; border:0px transparent solid;}
-                    div.photo{font-family:inherit; color:#333; text-align:center;}
-                    div.photo p{text-align:justify;font-size:.9em; line-height:.9em;}
-                    @page{margin:10pt}
-                    .ar-txt {color:#000; text-align:justify;}
-                    h1{text-align:left; font-size:1.25em;}
-                    .auteur{text-align:right; font-weight:bold}
-                    .feed{text-align:right; font-weight:bold}
-                    .po-ti2{font-weight:bold}
-                    .fen-tt{font-weight:bold;font-size:1.1em}
-                    '''
-    zipurl_format = 'http://medias.lemonde.fr/abonnes/editionelectronique/%Y%m%d/html/%y%m%d.zip'
-    coverurl_format = '/img/%y%m%d01.jpg'
-    path_format = "%y%m%d"
-    login_url = 'http://www.lemonde.fr/web/journal_electronique/identification/1,56-0,45-0,0.html'
-    keep_only_tags = [dict(name=['h1']), dict(name='div', attrs={ 'class': 'photo' }), dict(name='div', attrs={ 'class': 'po-ti2' }), dict(name='div', attrs={ 'class': 'ar-txt' }), dict(name='div', attrs={ 'class': 'po_rtcol' }) ]
-    remove_tags = [ dict(name='div', attrs={ 'class': 'po-ti' }),dict(name='div', attrs={ 'class': 'po-copy' })]
-    article_id_pattern = re.compile("[0-9]+\\.html")
-    article_url_format = 'http://www.lemonde.fr/journalelectronique/donnees/protege/%Y%m%d/html/'
+    title = u'Le Monde: Édition abonnés'
+    __author__ = 'Sylvain Durand'
+    description = u'Disponible du lundi au samedi à partir de 14 heures environ, avec tous ses cahiers.'
+    language = 'fr'
+    encoding = 'utf8'
+
+    needs_subscription = True
+
+    date_url = 'http://www.lemonde.fr/journalelectronique/donnees/libre/%Y%m%d/index.html'
+    login_url = 'http://www.lemonde.fr/web/journal_electronique/identification/1,56-0,45-0,0.html'
+    journal_url = 'http://www.lemonde.fr/journalelectronique/donnees/protege/%Y%m%d/%Y%m%d_ipad.xml'
+    masthead_url = 'http://upload.wikimedia.org/wikipedia/fr/thumb/c/c5/Le_Monde_logo.svg/300px-Le_Monde_logo.svg.png'
+    couverture_url = 'http://medias.lemonde.fr/abonnes/editionelectronique/%Y%m%d/html/data/img/%y%m%d01.jpg'
+
+    extra_css = '''
+                img{max-width:100%}
+                h1{font-size:1.2em !important; line-height:1.2em !important; }
+                h2{font-size:1em !important; line-height:1em !important; }
+                h3{font-size:1em !important; text-transform:uppercase !important; color:#666;}
+                #photo{text-align:center !important; margin:10px 0 -8px;}
+                #lgd{font-size:1em !important; line-height:1em !important; font-style:italic; color:#333;} '''
+
+    keep_only_tags = [dict(name=['h1','h2','h3','div','txt'])]
+
+    def __init__(self, options, log, progress_reporter):
+        BasicNewsRecipe.__init__(self, options, log, progress_reporter)
+        br = BasicNewsRecipe.get_browser(self)
+        second = time.time() + 24*60*60
+        for i in range(7):
+            self.date = time.gmtime(second)
+            try:
+                br.open(time.strftime(self.date_url,self.date))
+                break
+            except HTTPError:
+                second -= 24*60*60
+        self.timefmt = strftime(u" %A %d %B %Y", self.date).replace(u' 0', u' ')
 
     def get_browser(self):
         br = BasicNewsRecipe.get_browser(self)
-        if self.username is not None and self.password is not None:
-            br.open(self.login_url)
-            br.select_form(nr=0)
-            br['login'] = self.username
-            br['password'] = self.password
-            br.submit()
+        br.open(self.login_url)
+        br.select_form(nr=0)
+        br['login'] = self.username
+        br['password'] = self.password
+        br.submit()
        return br
 
-    decalage = 24 * 60 * 60 # today Monde has tomorow date
-
     def get_cover_url(self):
-        url = time.strftime(self.coverurl_format, self.ltime)
-        return self.articles_path + url
+        url = time.strftime(self.couverture_url,self.date)
+        return url
 
     def parse_index(self):
-        browser = self.get_browser()
-        second = time.time()
-        second += self.decalage
-        for i in range(7):
-            self.ltime = time.gmtime(second)
-            self.timefmt=time.strftime(" %A %d %B %Y",self.ltime).decode(preferred_encoding)
-            url = time.strftime(self.zipurl_format,self.ltime)
-            try:
-                response = browser.open(url)
-                continue
-            except HTTPError:
-                second -= 24*60*60
-        tmp = PersistentTemporaryFile(suffix='.zip')
-        self.report_progress(0.1,_('downloading zip file'))
-        tmp.write(response.read())
-        tmp.close()
-        zfile = zipfile.ZipFile(tmp.name, 'r')
-        self.report_progress(0.1,_('extracting zip file'))
-        zfile.extractall(self.output_dir)
-        zfile.close()
-        path = os.path.join(self.output_dir, time.strftime(self.path_format, self.ltime), "data")
-        self.articles_path = path
-        files = os.listdir(path)
-        nb_index_files = len([ name for name in files if re.match("frame_gauche_[0-9]+.html", name) ])
-        flux = []
-        article_url = time.strftime(self.article_url_format, self.ltime)
-        for i in range(nb_index_files):
-            filename = os.path.join(path, "selection_%d.html" % (i + 1))
-            tmp = open(filename,'r')
-            soup=BeautifulSoup(tmp,convertEntities=BeautifulSoup.HTML_ENTITIES)
-            title=soup.find('span').contents[0]
-            if title=="Une":
-                title="À la une"
-            if title=="Evenement":
-                title="L'événement"
-            if title=="Planete":
-                title="Planète"
-            if title=="Economie - Entreprises":
-                title="Économie"
-            if title=="L'Oeil du Monde":
-                title="L'œil du Monde"
-            if title=="Enquete":
-                title="Enquête"
-            if title=="Editorial - Analyses":
-                title="Analyses"
-            if title=="Le Monde Economie":
-                title="Économie"
-            if title=="Le Monde Culture et idées":
-                title="Idées"
-            if title=="Le Monde Géo et politique":
-                title="Géopolitique"
-            tmp.close()
-            filename = os.path.join(path, "frame_gauche_%d.html" % (i + 1))
-            tmp = open(filename,'r')
-            soup = BeautifulSoup(tmp)
+        url = time.strftime(self.journal_url,self.date)
+        soup = self.index_to_soup(url).sommaire
+        sections = []
+        for sec in soup.findAll("section"):
             articles = []
-            for link in soup.findAll("a"):
-                article_file = link['href']
-                article_id=self.article_id_pattern.search(article_file).group()
-                article = {
-                    'title': link.contents[0],
-                    'url': article_url + article_id,
-                    'description': '',
-                    'content': ''
-                }
-                articles.append(article)
-            tmp.close()
-            flux.append((title, articles))
-        return flux
-
-# Local Variables:
-# mode: python
-# End:
+            if sec['cahier'] != "Le Monde":
+                for col in sec.findAll("fnts"):
+                    col.extract()
+            if sec['cahier']=="Le Monde Magazine":
+                continue
+            for art in sec.findAll("art"):
+                if art.txt.string and art.ttr.string:
+                    if art.find(['url']):
+                        art.insert(6,'<div id="photo"><img src="'+art.find(['url']).string+'" /></div>')
+                    if art.find(['lgd']) and art.find(['lgd']).string:
+                        art.insert(7,'<div id="lgd">'+art.find(['lgd']).string+'</div>')
+                    article = "<html><head></head><body>"+unicode(art)+"</body></html>"
+                    article = article.replace('<![CDATA[','').replace(']]>','').replace(' oC ','°C ')
+                    article = article.replace('srttr>','h3>').replace('ssttr>','h2>').replace('ttr>','h1>')
+                    f = PersistentTemporaryFile()
+                    f.write(article)
+                    articles.append({'title':art.ttr.string,'url':"file:///"+f.name})
+            sections.append((sec['nom'], articles))
+        return sections
+
+    def preprocess_html(self, soup):
+        for lgd in soup.findAll(id="lgd"):
+            lgd.contents[-1].extract()
+        return soup
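
The heart of this change is how the recipe now locates the edition to download: instead of blindly fetching a zip of HTML frames, the new __init__ probes backwards from tomorrow's date (the printed paper is post-dated by a day) until the free index page for a date actually exists, and parse_index then reads the "ipad" XML sommaire for that date. Below is a minimal standalone sketch of just the date-probing step. It uses Python 3's urllib.request in place of the Python 2 urllib2/mechanize browser the recipe runs under, and the lemonde.fr URL is copied from the recipe; it may no longer be served today.

    import time
    from urllib.request import urlopen
    from urllib.error import HTTPError

    date_url = 'http://www.lemonde.fr/journalelectronique/donnees/libre/%Y%m%d/index.html'

    def find_edition_date():
        # Start one day ahead: today's Le Monde carries tomorrow's date.
        second = time.time() + 24*60*60
        for _ in range(7):
            date = time.gmtime(second)
            try:
                urlopen(time.strftime(date_url, date))
                return date               # index page exists, edition available
            except HTTPError:
                second -= 24*60*60        # step back one day and try again
        return None                       # no edition found in the past week

    date = find_edition_date()
    if date is not None:
        print(time.strftime('%Y-%m-%d', date))

Note that the new loop breaks on the first date that resolves, where the old parse_index used continue on success and so re-fetched the same zip on every remaining iteration; probing the small free index page also avoids downloading an entire archive just to test whether an edition exists.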