mirror of
				https://github.com/kovidgoyal/calibre.git
				synced 2025-11-03 19:17:02 -05:00 
			
		
		
		
	Fix Mediapart
This commit is contained in:
		
							parent
							
								
									6df4b8ff39
								
							
						
					
					
						commit
						c7f706348c
					
				@ -1,16 +1,17 @@
 | 
			
		||||
__license__   = 'GPL v3'
 | 
			
		||||
__copyright__ = '2009, Mathieu Godlewski <mathieu at godlewski.fr>; 2010, Louis Gesbert <meta at antislash dot info>'
 | 
			
		||||
__copyright__ = '2009, Mathieu Godlewski <mathieu at godlewski.fr>; 2010, 2011, Louis Gesbert <meta at antislash dot info>'
 | 
			
		||||
'''
 | 
			
		||||
Mediapart
 | 
			
		||||
'''
 | 
			
		||||
 | 
			
		||||
from calibre.ebooks.BeautifulSoup import Tag
 | 
			
		||||
import re
 | 
			
		||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup
 | 
			
		||||
from calibre.web.feeds.news import BasicNewsRecipe
 | 
			
		||||
 | 
			
		||||
class Mediapart(BasicNewsRecipe):
 | 
			
		||||
    title          = 'Mediapart'
 | 
			
		||||
    __author__ = 'Mathieu Godlewski'
 | 
			
		||||
    description = 'Global news in french from online newspapers'
 | 
			
		||||
    __author__ = 'Mathieu Godlewski, Louis Gesbert'
 | 
			
		||||
    description = 'Global news in french from news site Mediapart'
 | 
			
		||||
    oldest_article = 7
 | 
			
		||||
    language = 'fr'
 | 
			
		||||
    needs_subscription = True
 | 
			
		||||
@ -18,52 +19,30 @@ class Mediapart(BasicNewsRecipe):
 | 
			
		||||
    max_articles_per_feed = 50
 | 
			
		||||
    no_stylesheets = True
 | 
			
		||||
 | 
			
		||||
    cover_url = 'http://www.mediapart.fr/sites/all/themes/mediapart/mediapart/images/annonce.jpg'
 | 
			
		||||
    cover_url = 'http://static.mediapart.fr/files/pave_mediapart.jpg'
 | 
			
		||||
 | 
			
		||||
    feeds =  [
 | 
			
		||||
        ('Les articles', 'http://www.mediapart.fr/articles/feed'),
 | 
			
		||||
    ]
 | 
			
		||||
 | 
			
		||||
# -- The print version has poor quality on this website; it is better to do the conversion ourselves
 | 
			
		||||
#
 | 
			
		||||
#     preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE|re.DOTALL), i[1]) for i in
 | 
			
		||||
#         [
 | 
			
		||||
#             (r'<div class="print-title">([^>]+)</div>', lambda match : '<h2>'+match.group(1)+'</h2>'),
 | 
			
		||||
#             (r'<span class=\'auteur_staff\'>[^>]+<a title=\'[^\']*\'[^>]*>([^<]*)</a>[^<]*</span>',
 | 
			
		||||
#              lambda match : '<i>'+match.group(1)+'</i>'),
 | 
			
		||||
#             (r'\'', lambda match: '’'),
 | 
			
		||||
#         ]
 | 
			
		||||
#      ]
 | 
			
		||||
#
 | 
			
		||||
#     remove_tags    = [ dict(name='div', attrs={'class':'print-source_url'}),
 | 
			
		||||
#                        dict(name='div', attrs={'class':'print-links'}),
 | 
			
		||||
#                        dict(name='img', attrs={'src':'entete_article.png'}),
 | 
			
		||||
#                        dict(name='br') ]
 | 
			
		||||
#
 | 
			
		||||
#     def print_version(self, url):
 | 
			
		||||
#         raw = self.browser.open(url).read()
 | 
			
		||||
#         soup = BeautifulSoup(raw.decode('utf8', 'replace'))
 | 
			
		||||
#         div = soup.find('div', {'id':re.compile('node-\d+')})
 | 
			
		||||
#         if div is None:
 | 
			
		||||
#             return None
 | 
			
		||||
#         article_id = string.replace(div['id'], 'node-', '')
 | 
			
		||||
#         if article_id is None:
 | 
			
		||||
#             return None
 | 
			
		||||
#         return 'http://www.mediapart.fr/print/'+article_id
 | 
			
		||||
# -- print-version
 | 
			
		||||
 | 
			
		||||
# -- Non-print version [dict(name='div', attrs={'class':'advert'})]
 | 
			
		||||
 | 
			
		||||
    keep_only_tags = [
 | 
			
		||||
        dict(name='h1', attrs={'class':'title'}),
 | 
			
		||||
        dict(name='div', attrs={'class':'page_papier_detail'}),
 | 
			
		||||
    preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE|re.DOTALL), i[1]) for i in
 | 
			
		||||
        [
 | 
			
		||||
            (r'<div class="print-title">([^>]+)</div>', lambda match : '<h2>'+match.group(1)+'</h2>'),
 | 
			
		||||
            (r'\'', lambda match: '’')
 | 
			
		||||
        ]
 | 
			
		||||
    ]
 | 
			
		||||
 | 
			
		||||
    def preprocess_html(self, soup):
        '''
        Wrap every <div class="titre"> element of the parsed page in a new
        <h3> tag.

        The soup is modified in place and the same object is returned.
        '''
        title_divs = soup.findAll('div', {'class':'titre'})
        for title_div in title_divs:
            heading = Tag(soup, 'h3')
            # Put the heading where the div was, then re-attach the div
            # as the heading's first child.
            title_div.replaceWith(heading)
            heading.insert(0, title_div)
        return soup
 | 
			
		||||
    remove_tags    = [ dict(name='div', attrs={'class':'print-source_url'}) ]
 | 
			
		||||
 | 
			
		||||
    def print_version(self, url):
        '''
        Return the address of the print view for the article at ``url``.

        The article page is downloaded with the recipe's browser and searched
        for the anchor whose title is 'Imprimer'; that anchor's href is
        returned. None is returned when the page contains no such anchor.
        '''
        page = self.browser.open(url).read()
        # Bytes that are not valid UTF-8 are replaced instead of raising.
        parsed = BeautifulSoup(page.decode('utf8', 'replace'))
        print_anchor = parsed.find('a', {'title':'Imprimer'})
        return None if print_anchor is None else print_anchor['href']
 | 
			
		||||
 | 
			
		||||
# -- Handle login
 | 
			
		||||
 | 
			
		||||
@ -76,4 +55,3 @@ class Mediapart(BasicNewsRecipe):
 | 
			
		||||
            br['pass'] = self.password
 | 
			
		||||
            br.submit()
 | 
			
		||||
        return br
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user