mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Update Le Monde
This commit is contained in:
parent
525510e53d
commit
312e6388c1
@ -42,6 +42,8 @@ class LeMonde(BasicNewsRecipe):
|
|||||||
def preprocess_html(self, soup):
    """Clean up a downloaded article before conversion.

    Strips the trailing node of every photo caption (id="lgd") and
    resolves lazily-loaded images by promoting ``data-src`` to ``src``.
    Returns the modified soup.
    """
    # Captions carry an extra trailing element (presumably a credit/marker);
    # drop the last child of each caption container.
    for caption in soup.findAll(id="lgd"):
        caption.contents[-1].extract()
    # Lazy-loaded images keep the real URL in data-src; copy it into src
    # so the e-book conversion can fetch the picture.
    for image in soup.findAll('img', attrs={'data-src': True}):
        image['src'] = image['data-src']
    return soup
|
||||||
|
|
||||||
def get_article_url(self, article):
|
def get_article_url(self, article):
|
||||||
|
@ -2,13 +2,7 @@
|
|||||||
__author__ = 'S. Durand <sylvaindurand@users.noreply.github.com>'
|
__author__ = 'S. Durand <sylvaindurand@users.noreply.github.com>'
|
||||||
__license__ = 'GPL v3'
|
__license__ = 'GPL v3'
|
||||||
|
|
||||||
import time
|
|
||||||
import re
|
|
||||||
|
|
||||||
from calibre import strftime
|
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
from calibre.ptempfile import PersistentTemporaryFile
|
|
||||||
from urllib2 import HTTPError
|
|
||||||
|
|
||||||
|
|
||||||
def classes(classes):
|
def classes(classes):
|
||||||
@ -27,11 +21,7 @@ class LeMonde(BasicNewsRecipe):
|
|||||||
|
|
||||||
needs_subscription = True
|
needs_subscription = True
|
||||||
|
|
||||||
date_url = 'http://www.lemonde.fr/journalelectronique/donnees/libre/%Y%m%d/index.html'
|
|
||||||
login_url = 'https://www.lemonde.fr/web/journal_electronique/identification/1,56-0,45-0,0.html'
|
login_url = 'https://www.lemonde.fr/web/journal_electronique/identification/1,56-0,45-0,0.html'
|
||||||
journal_url = 'http://www.lemonde.fr/journalelectronique/donnees/protege/%Y%m%d/%Y%m%d_ipad.xml'
|
|
||||||
masthead_url = 'http://upload.wikimedia.org/wikipedia/fr/thumb/c/c5/Le_Monde_logo.svg/800px-Le_Monde_logo.svg.png'
|
|
||||||
couverture_url = 'http://medias.lemonde.fr/abonnes/editionelectronique/%Y%m%d/html/data/img/%y%m%d01.jpg'
|
|
||||||
|
|
||||||
extra_css = '''
|
extra_css = '''
|
||||||
img{max-width:100%}
|
img{max-width:100%}
|
||||||
@ -47,19 +37,19 @@ class LeMonde(BasicNewsRecipe):
|
|||||||
dict(itemprop=['articleBody']),
|
dict(itemprop=['articleBody']),
|
||||||
]
|
]
|
||||||
|
|
||||||
def __init__(self, options, log, progress_reporter):
|
# Sections to fetch, as 'url-slug:Display title' pairs; parse_index splits
# each entry on the first ':' to build http://www.lemonde.fr/<slug>/ URLs.
lm_sections = [
    'international:International',
    'politique:Politique',
    'societe:Société',
    'economie:Éco',
    'culture:Culture',
    'idees:Idées',
    'planete:Planète',
    'sport:Sport',
    'sciences:Sciences',
    'pixels:Pixels',
    'campus:Campus',
]
||||||
|
|
||||||
def get_browser(self):
|
def get_browser(self):
|
||||||
br = BasicNewsRecipe.get_browser(self)
|
br = BasicNewsRecipe.get_browser(self)
|
||||||
@ -71,55 +61,55 @@ class LeMonde(BasicNewsRecipe):
|
|||||||
return br
|
return br
|
||||||
|
|
||||||
def get_cover_url(self):
    """Return the URL of today's front-page image, or None if not found.

    Scrapes the Le Monde PDF-edition landing page and takes the image
    inside the first div of class 'pg-gch'.
    """
    cover_url = None
    soup = self.index_to_soup(
        'http://www.lemonde.fr/web/monde_pdf/0,33-0,1-0,0.html')
    link_item = soup.find('div', attrs={'class': 'pg-gch'})
    # Only use the block when it actually contains an <img>.
    if link_item and link_item.img:
        cover_url = link_item.img['src']
    return cover_url
||||||
if match.group(1) == u"=":
|
|
||||||
return match.group(0)
|
|
||||||
return u'%s« %s »' % (match.group(1), match.group(2))
|
|
||||||
|
|
||||||
article = "<html><head></head><body>" + \
|
|
||||||
unicode(art) + "</body></html>"
|
|
||||||
article = article.replace(
|
|
||||||
'<![CDATA[', '').replace(']]>', '').replace(' oC ', '°C ')
|
|
||||||
article = article.replace('srttr>', 'h3>').replace(
|
|
||||||
'ssttr>', 'h2>').replace('ttr>', 'h1>')
|
|
||||||
article = article.replace("'", u'\u2019')
|
|
||||||
article = re.sub('(.|^)"([^"]+)"', guillemets, article)
|
|
||||||
|
|
||||||
f = PersistentTemporaryFile()
|
|
||||||
f.write(article)
|
|
||||||
articles.append(
|
|
||||||
{'title': art.ttr.string, 'url': "file:///" + f.name})
|
|
||||||
sections.append((sec['nom'], articles))
|
|
||||||
except AttributeError:
|
|
||||||
self.log(
|
|
||||||
"Vos identifiants sont incorrects, ou votre abonnement LeMonde.fr ne vous permet pas de télécharger le journal.")
|
|
||||||
return sections
|
|
||||||
|
|
||||||
def preprocess_html(self, soup):
    """Tidy an article page: trim caption markers and fix lazy images.

    Removes the last child of each id="lgd" caption element, then copies
    each image's ``data-src`` attribute into ``src`` so pictures render
    in the generated e-book. Returns the soup.
    """
    for caption in soup.findAll(id="lgd"):
        # Discard the caption's trailing node (extra marker/credit).
        caption.contents[-1].extract()
    for image in soup.findAll('img', attrs={'data-src': True}):
        # Promote the lazy-load URL so the downloader sees it.
        image['src'] = image['data-src']
    return soup
|
|
||||||
|
def parse_index(self):
    """Build the recipe's section list.

    Iterates ``self.lm_sections`` ('slug:Title' strings), scrapes each
    section page via :meth:`parse_section`, and returns a list of
    ``(section_title, articles)`` tuples, omitting empty sections.
    """
    ans = []
    for entry in self.lm_sections:
        # 'slug:Title' -> slug for the URL, Title for display.
        slug, section_title = entry.partition(':')[::2]
        self.log('Processing section', section_title, '...')
        articles = list(self.parse_section('http://www.lemonde.fr/%s/' % slug))
        if articles:
            ans.append((section_title, articles))
    return ans
||||||
|
def parse_section(self, url):
    """Yield article dicts (title/url/description) from one section page.

    Looks for <article> elements inside the 'grid_12 alpha' container;
    the headline link is taken from an <h2> (falling back to <h3>), and
    the first <p> provides the description.
    """
    soup = self.index_to_soup(url)
    container = soup.find(attrs={'class': lambda x: x and 'grid_12 alpha' in x})
    for article in container.findAll('article'):
        heading = article.find('h2')
        if heading is None:
            heading = article.find('h3')
        if heading is None:
            continue
        a = heading.find('a', href=True)
        if a is None:
            # The link may wrap the heading instead of sitting inside it.
            a = heading.findParents('a', href=True)
            if not a:
                continue
            a = a[0]
        url = a['href']
        if url.startswith('/'):
            # Site-relative link: make it absolute.
            url = 'http://www.lemonde.fr' + url
        title = self.tag_to_string(a)
        desc = ''
        p = article.find('p')
        if p is not None:
            desc = self.tag_to_string(p)
        self.log('\tFound article', title, 'at', url)
        yield {'title': title, 'url': url, 'description': desc}
||||||
|
Loading…
x
Reference in New Issue
Block a user