New recipe for Courrier International and updated Le Monde by Mathieu Godlewski

2025-07-09 03:04:10 -04:00 · 2009-02-25 11:08:44 -08:00 · 2009-02-25 11:08:44 -08:00 · 18c39c6e20
commit 18c39c6e20
parent 0a493da7e8
3 changed files with 88 additions and 11 deletions
--- a/src/calibre/web/feeds/recipes/init.py
+++ b/src/calibre/web/feeds/recipes/init.py
@ -31,7 +31,7 @@ recipe_modules = ['recipe_' + r for r in (
           'pobjeda', 'chicago_breaking_news', 'glasgow_herald', 'linuxdevices',
           'hindu', 'cincinnati_enquirer', 'physics_world', 'pressonline',
           'la_republica', 'physics_today', 'chicago_tribune', 'e_novine',
-           'al_jazeera', 'winsupersite', 'borba',
+           'al_jazeera', 'winsupersite', 'borba', 'courrierinternational',
          )]
 import re, imp, inspect, time, os
--- a/src/calibre/web/feeds/recipes/recipe_courrierinternational.py
+++ b/src/calibre/web/feeds/recipes/recipe_courrierinternational.py
@ -0,0 +1,41 @@
 #!/usr/bin/env  python
 __license__   = 'GPL v3'
 __copyright__ = '2009, Mathieu Godlewski <mathieu at godlewski.fr>'
 '''
 Courrier International
 '''
 import re
 from datetime import date
 from calibre.web.feeds.news import BasicNewsRecipe
 class CourrierInternational(BasicNewsRecipe):
    '''
    Recipe for the French weekly Courrier International.

    Articles come from the "A la Une" RSS feed; each one is fetched through
    the URL built by print_version() and reduced to its main table cell by
    preprocess_regexps.
    '''
    title          = 'Courrier International'
    __author__ = 'Mathieu Godlewski <mathieu at godlewski.fr>'
    description = 'Global news in french from international newspapers'
    oldest_article = 7
    language = _('French')
    max_articles_per_feed = 50
    no_stylesheets = True
    html2lrf_options = ['--base-font-size', '10']
    feeds =  [
        # Some articles that require a subscription fail to download.
        ('A la Une', 'http://www.courrierinternational.com/rss/rss_a_la_une.xml'),
    ]
    # (compiled pattern, replacement function) pairs. Every pattern is
    # compiled case-insensitively and with DOTALL so that ".*" spans lines.
    preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE|re.DOTALL), i[1]) for i in
        [
            # Dispatches ("Depeches"): keep only the cell whose content starts
            # with two digits and a slash and ends with </p>.
            (r'.*<td [^>]*>([0-9][0-9]/.*</p>)</td>.*', lambda match : '<html><body><table><tr><td>'+match.group(1)+'</td></tr></table></body></html>'),
            # Articles: keep the cell starting with "Courrier international"
            # up to (not including) the 10px spacer cell that follows it.
            (r'.*<td [^>]*>(Courrier international.*?)							<td width="10"><img src="/img/espaceur.gif"></td>.*', lambda match : '<html><body><table><tr><td>'+match.group(1)+'</body></html>'),
        ]
    ]
    def print_version(self, url):
        '''
        Return url with every "/<letters>.asp" segment replaced by
        "/imprimer.asp" ("imprimer" is French for "print").

        URLs without such a segment are returned unchanged.
        '''
        # Raw string: "\." must reach the regex engine as an escaped dot
        # rather than rely on Python passing unknown string escapes through.
        return re.sub(r'/[a-zA-Z]+\.asp', '/imprimer.asp', url)
--- a/src/calibre/web/feeds/recipes/recipe_le_monde.py
+++ b/src/calibre/web/feeds/recipes/recipe_le_monde.py
@ -7,7 +7,7 @@ lemonde.fr
 '''
 import re
-
+from datetime import date
 from calibre.web.feeds.news import BasicNewsRecipe
@ -15,10 +15,14 @@ class LeMonde(BasicNewsRecipe):
    title          = 'LeMonde.fr'
    __author__ = 'Mathieu Godlewski <mathieu at godlewski.fr>'
    description = 'Global news in french'
-    oldest_article = 7
+    oldest_article = 3
    language = _('French')
-    max_articles_per_feed = 20
+    max_articles_per_feed = 30
    no_stylesheets = True
    cover_url='http://abonnes.lemonde.fr/titresdumonde/'+date.today().strftime("%y%m%d")+'/1.jpg'
    html2lrf_options = ['--base-font-size', '10']
    feeds =  [
             ('A la Une', 'http://www.lemonde.fr/rss/une.xml'),
@ -47,16 +51,48 @@ class LeMonde(BasicNewsRecipe):
    extra_css      = '.ar-tit {font-size: x-large;} \n .dt {font-size: x-small;}'
-    filter_regexps = [r'xiti\.com']
+    preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE|re.DOTALL), i[1]) for i in
    preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
        [
            (r'<html.*(<div class="post".*?>.*?</div>.*?<div class="entry">.*?</div>).*You can start editing here.*</html>', lambda match : '<html><body>'+match.group(1)+'</body></html>'),
            (r'<p>&nbsp;</p>', lambda match : ''),
            (r'<img src="http://medias\.lemonde\.fr/mmpub/img/let/(.)\.gif"[^>]*><div class=ar-txt>', lambda match : '<div class=ar-txt>'+match.group(1).upper()),
            (r'<img src="http://medias\.lemonde\.fr/mmpub/img/let/q(.)\.gif"[^>]*><div class=ar-txt>', lambda match : '<div class=ar-txt>"'+match.group(1).upper()),
            (r'(<div class=desc><b>.*</b></div>).*</body>', lambda match : match.group(1)),
        ]
    ]
-    def print_version(self, url):
+    article_match_regexps = [ (re.compile(i)) for i in
-        return re.sub('http:.*_([0-9]+)_[0-9]+\.html.*','http://www.lemonde.fr/web/imprimer_element/0,40-0,50-\\1,0.html' ,url)
+        [
            (r'http://www\.lemonde\.fr/\S+/article/.*'),
            (r'http://www\.lemonde\.fr/\S+/portfolio/.*'),
            (r'http://www\.lemonde\.fr/\S+/article_interactif/.*'),
            (r'http://\S+\.blog\.lemonde\.fr/.*'),
        ]
    ]
    def print_version(self, url):
        '''
        Rewrite a www.lemonde.fr article URL to its "imprimer_element" page.

        The identifier is the first number in the "_<id>_<digits>.html" part
        of the URL; anything after ".html" is discarded. URLs that do not
        match the pattern are returned unchanged.
        '''
        # Raw strings keep "\." and "\1" as regex escapes instead of relying
        # on Python passing unknown string escapes through untouched.
        return re.sub(r'http://www\.lemonde\.fr/.*_([0-9]+)_[0-9]+\.html.*',
                      r'http://www.lemonde.fr/web/imprimer_element/0,40-0,50-\1,0.html',
                      url)
    # URLs already accepted by get_article_url(); used to filter duplicated
    # articles.
    # NOTE(review): the list is created once in the class body, so it looks
    # shared by every instance of the recipe -- confirm this is intended.
    articles_list = []
    def get_article_url(self, article):
        '''
        Return the URL to download for a feed entry, or False to skip it.

        The fragment (everything from the first "#") is removed from the
        entry's link. The entry is skipped when it has no link, when the
        resulting URL is already in articles_list, or when
        is_article_wanted() rejects it. Accepted URLs are appended to
        articles_list.
        '''
        url = article.get('link', None)
        if not url:
            # A missing or empty link cannot be fetched; skip the entry
            # instead of failing on None below.
            return False
        # partition() leaves a URL without "#" untouched, whereas slicing up
        # to url.find("#") == -1 would silently drop its last character.
        url = url.partition('#')[0]
        if url in self.articles_list:
            self.log_debug(_('Skipping duplicated article: %s')%url)
            return False
        if self.is_article_wanted(url):
            self.articles_list.append(url)
            return url
        self.log_debug(_('Skipping filtered article: %s')%url)
        return False
    def is_article_wanted(self, url):
        '''
        Tell whether url is matched by one of article_match_regexps.

        Returns True as soon as a pattern's search() succeeds, and False when
        none does or when the pattern list is empty.
        '''
        patterns = self.article_match_regexps
        if not patterns:
            return False
        return any(pattern.search(url) for pattern in patterns)