mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
New recipe for Courrier International and updated Le Monde by Mathieu Godlewski
This commit is contained in:
parent
0a493da7e8
commit
18c39c6e20
@ -31,7 +31,7 @@ recipe_modules = ['recipe_' + r for r in (
|
|||||||
'pobjeda', 'chicago_breaking_news', 'glasgow_herald', 'linuxdevices',
|
'pobjeda', 'chicago_breaking_news', 'glasgow_herald', 'linuxdevices',
|
||||||
'hindu', 'cincinnati_enquirer', 'physics_world', 'pressonline',
|
'hindu', 'cincinnati_enquirer', 'physics_world', 'pressonline',
|
||||||
'la_republica', 'physics_today', 'chicago_tribune', 'e_novine',
|
'la_republica', 'physics_today', 'chicago_tribune', 'e_novine',
|
||||||
'al_jazeera', 'winsupersite', 'borba',
|
'al_jazeera', 'winsupersite', 'borba', 'courrierinternational',
|
||||||
)]
|
)]
|
||||||
|
|
||||||
import re, imp, inspect, time, os
|
import re, imp, inspect, time, os
|
||||||
|
@ -0,0 +1,41 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
__license__ = 'GPL v3'
|
||||||
|
__copyright__ = '2009, Mathieu Godlewski <mathieu at godlewski.fr>'
|
||||||
|
'''
|
||||||
|
Courrier International
|
||||||
|
'''
|
||||||
|
|
||||||
|
import re
|
||||||
|
from datetime import date
|
||||||
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
|
class CourrierInternational(BasicNewsRecipe):
    '''
    Recipe for the "A la Une" RSS feed of Courrier International.

    Each article is downloaded from the ``imprimer.asp`` variant of its URL
    (see ``print_version``) and the resulting HTML is cut down by
    ``preprocess_regexps`` to the single table cell holding the text.
    '''

    title = 'Courrier International'
    __author__ = 'Mathieu Godlewski <mathieu at godlewski.fr>'
    description = 'Global news in french from international newspapers'
    oldest_article = 7
    language = _('French')
    max_articles_per_feed = 50
    no_stylesheets = True

    html2lrf_options = ['--base-font-size', '10']

    feeds = [
        # Some articles that require a subscription fail to download.
        ('A la Une', 'http://www.courrierinternational.com/rss/rss_a_la_une.xml'),
    ]

    # Each entry is (pattern, replacement callable); patterns are compiled
    # case-insensitively and with DOTALL so that '.' also spans newlines,
    # letting one pattern swallow the whole page around the captured cell.
    preprocess_regexps = [
        (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
        [
            # Handle Depeches: keep the cell whose content starts with two
            # digits and a slash and runs up to the last '</p></td>'.
            (r'.*<td [^>]*>([0-9][0-9]/.*</p>)</td>.*',
             lambda match: '<html><body><table><tr><td>' + match.group(1) + '</td></tr></table></body></html>'),
            # Handle Articles: keep everything from "Courrier international"
            # up to the spacer-image cell.
            # NOTE(review): the replacement does not close td/tr/table;
            # presumably the captured markup supplies them -- confirm.
            (r'.*<td [^>]*>(Courrier international.*?) <td width="10"><img src="/img/espaceur.gif"></td>.*',
             lambda match: '<html><body><table><tr><td>' + match.group(1) + '</body></html>'),
        ]
    ]

    def print_version(self, url):
        '''
        Return ``url`` with every ``/<letters>.asp`` page name replaced by
        ``/imprimer.asp``. URLs without such a segment are returned as is.
        '''
        # Raw string: '\.' is a regex escape, not a Python string escape.
        return re.sub(r'/[a-zA-Z]+\.asp', '/imprimer.asp', url)
|
||||||
|
|
@ -7,7 +7,7 @@ lemonde.fr
|
|||||||
'''
|
'''
|
||||||
|
|
||||||
import re
|
import re
|
||||||
|
from datetime import date
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
|
|
||||||
@ -15,10 +15,14 @@ class LeMonde(BasicNewsRecipe):
|
|||||||
title = 'LeMonde.fr'
|
title = 'LeMonde.fr'
|
||||||
__author__ = 'Mathieu Godlewski <mathieu at godlewski.fr>'
|
__author__ = 'Mathieu Godlewski <mathieu at godlewski.fr>'
|
||||||
description = 'Global news in french'
|
description = 'Global news in french'
|
||||||
oldest_article = 7
|
oldest_article = 3
|
||||||
language = _('French')
|
language = _('French')
|
||||||
max_articles_per_feed = 20
|
max_articles_per_feed = 30
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
|
cover_url='http://abonnes.lemonde.fr/titresdumonde/'+date.today().strftime("%y%m%d")+'/1.jpg'
|
||||||
|
|
||||||
|
|
||||||
|
html2lrf_options = ['--base-font-size', '10']
|
||||||
|
|
||||||
feeds = [
|
feeds = [
|
||||||
('A la Une', 'http://www.lemonde.fr/rss/une.xml'),
|
('A la Une', 'http://www.lemonde.fr/rss/une.xml'),
|
||||||
@ -47,16 +51,48 @@ class LeMonde(BasicNewsRecipe):
|
|||||||
|
|
||||||
extra_css = '.ar-tit {font-size: x-large;} \n .dt {font-size: x-small;}'
|
extra_css = '.ar-tit {font-size: x-large;} \n .dt {font-size: x-small;}'
|
||||||
|
|
||||||
filter_regexps = [r'xiti\.com']
|
preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE|re.DOTALL), i[1]) for i in
|
||||||
|
|
||||||
preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
|
|
||||||
[
|
[
|
||||||
|
(r'<html.*(<div class="post".*?>.*?</div>.*?<div class="entry">.*?</div>).*You can start editing here.*</html>', lambda match : '<html><body>'+match.group(1)+'</body></html>'),
|
||||||
(r'<p> </p>', lambda match : ''),
|
(r'<p> </p>', lambda match : ''),
|
||||||
(r'<img src="http://medias\.lemonde\.fr/mmpub/img/let/(.)\.gif"[^>]*><div class=ar-txt>', lambda match : '<div class=ar-txt>'+match.group(1).upper()),
|
(r'<img src="http://medias\.lemonde\.fr/mmpub/img/let/(.)\.gif"[^>]*><div class=ar-txt>', lambda match : '<div class=ar-txt>'+match.group(1).upper()),
|
||||||
|
(r'<img src="http://medias\.lemonde\.fr/mmpub/img/let/q(.)\.gif"[^>]*><div class=ar-txt>', lambda match : '<div class=ar-txt>"'+match.group(1).upper()),
|
||||||
(r'(<div class=desc><b>.*</b></div>).*</body>', lambda match : match.group(1)),
|
(r'(<div class=desc><b>.*</b></div>).*</body>', lambda match : match.group(1)),
|
||||||
]
|
]
|
||||||
]
|
]
|
||||||
|
|
||||||
def print_version(self, url):
|
article_match_regexps = [ (re.compile(i)) for i in
|
||||||
return re.sub('http:.*_([0-9]+)_[0-9]+\.html.*','http://www.lemonde.fr/web/imprimer_element/0,40-0,50-\\1,0.html' ,url)
|
[
|
||||||
|
(r'http://www\.lemonde\.fr/\S+/article/.*'),
|
||||||
|
(r'http://www\.lemonde\.fr/\S+/portfolio/.*'),
|
||||||
|
(r'http://www\.lemonde\.fr/\S+/article_interactif/.*'),
|
||||||
|
(r'http://\S+\.blog\.lemonde\.fr/.*'),
|
||||||
|
]
|
||||||
|
]
|
||||||
|
|
||||||
|
def print_version(self, url):
    '''
    Map a www.lemonde.fr article URL to its ``imprimer_element`` URL.

    The URL must end in ``_<id>_<number>.html`` (optionally followed by
    anything, e.g. a fragment); ``<id>`` is inserted into the
    ``imprimer_element`` address. Any URL that does not match the pattern
    is returned unchanged.
    '''
    # Raw strings keep '\.' and the '\1' back-reference intact without
    # relying on Python passing unknown escape sequences through.
    return re.sub(
        r'http://www\.lemonde\.fr/.*_([0-9]+)_[0-9]+\.html.*',
        r'http://www.lemonde.fr/web/imprimer_element/0,40-0,50-\1,0.html',
        url,
    )
|
||||||
|
|
||||||
|
# URLs already accepted by get_article_url(); used to skip duplicate articles
|
||||||
|
articles_list = []
|
||||||
|
|
||||||
|
def get_article_url(self, article):
    '''
    Return the URL to download for a feed entry, or False to skip it.

    The entry's ``link`` is stripped of any ``#fragment``. The result is
    rejected (False) when the entry has no link, when the same URL was
    already accepted earlier (it is in ``self.articles_list``), or when
    ``self.is_article_wanted`` refuses it. Accepted URLs are appended to
    ``self.articles_list``.
    '''
    url = article.get('link', None)
    if not url:
        # No link in the entry: nothing to fetch, and slicing None would raise.
        return False
    # Cut at the first '#'. split() leaves a URL without a fragment intact,
    # whereas url[0:url.find("#")] dropped its last character (find() == -1).
    url = url.split('#', 1)[0]
    if url in self.articles_list:
        self.log_debug(_('Skipping duplicated article: %s') % url)
        return False
    if self.is_article_wanted(url):
        self.articles_list.append(url)
        return url
    self.log_debug(_('Skipping filtered article: %s') % url)
    return False
|
||||||
|
|
||||||
|
|
||||||
|
def is_article_wanted(self, url):
    '''
    Tell whether ``url`` matches at least one of the compiled patterns in
    ``self.article_match_regexps``.

    Returns False when that list is empty (or otherwise falsy) or when no
    pattern is found in ``url``; True as soon as one pattern matches.
    '''
    patterns = self.article_match_regexps
    if not patterns:
        return False
    return any(regex.search(url) for regex in patterns)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user