diff --git a/recipes/mediapart.recipe b/recipes/mediapart.recipe
deleted file mode 100644
index a8729500ef..0000000000
--- a/recipes/mediapart.recipe
+++ /dev/null
@@ -1,161 +0,0 @@
-# -*- mode:python -*-
-from __future__ import unicode_literals
-
-__license__ = 'GPL v3'
-__copyright__ = '2009, Mathieu Godlewski ; 2010-2012, Louis Gesbert '
-'''
-Mediapart
-'''
-
-__author__ = '2009, Mathieu Godlewski ; 2010-2012, Louis Gesbert '
-
-import re
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
-from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.web.feeds import feeds_from_index
-from datetime import date,timedelta
-
-class Mediapart(BasicNewsRecipe):
-    title = 'Mediapart'
-    __author__ = 'Mathieu Godlewski, Louis Gesbert'
-    description = 'Global news in french from news site Mediapart'
-    publication_type = 'newspaper'
-    language = 'fr'
-    needs_subscription = True
-    oldest_article = 2
-
-    use_embedded_content = False
-    no_stylesheets = True
-
-    cover_url = 'https://static.mediapart.fr/files/M%20Philips/logo-mediapart.png'
-
-# --
-
-    oldest_article_date = date.today() - timedelta(days=oldest_article)
-
-# -- get the index (the feed at 'http://www.mediapart.fr/articles/feed' only has
-# the 10 last elements :/)
-
-    feeds = [
-        ('La Une', 'http://www.mediapart.fr/articles/feed'),
-    ]
-
-    def parse_feeds(self):
-        feeds = super(Mediapart, self).parse_feeds()
-        feeds += feeds_from_index(self.my_parse_index(feeds))
-        return feeds
-
-    def my_parse_index(self, la_une):
-        articles = []
-
-        breves = []
-        liens = []
-        confidentiels = []
-
-        soup = self.index_to_soup('https://www.mediapart.fr/journal/fil-dactualites')
-        page = soup.find('div', {'id':'pageFirstContent'})
-        fils = page.find('div', {'class':re.compile(r'\bcontent-journal\b')})
-
-        for article in fils.findAll('div'):
-            try:
-                title = article.find('h2',recursive=False)
-                if title is None or title['class'] == 'title-specific':
-                    continue
-
-                # print "found fil ",title
-                article_type = article.find('a', {'href': re.compile(r'.*\/type-darticles\/.*')}).renderContents()
-                # print "kind: ",article_type
-
-                for s in title('span'):
-                    s.replaceWith(s.renderContents() + "\n")
-                url = title.find('a', href=True)['href']
-
-                article_date = self.parse_french_date(article.find("span", "article-date").renderContents())
-
-                if article_date < self.oldest_article_date:
-                    # print "too old"
-                    continue
-
-                authors = article.findAll('a',{'class':re.compile(r'\bjournalist\b')})
-                authors = [self.tag_to_string(a) for a in authors]
-
-                description = article.find('div', {'class': lambda c: c != 'taxonomy-teaser'}, recursive=False).findAll('p')
-
-                # print "fil ",title," by ",authors," : ",description
-
-                summary = {
-                    'title': self.tag_to_string(title).strip(),
-                    'author': ', '.join(authors),
-                    'url': url,
-                    'date': u'' + article_date.strftime("%A %d %b %Y"),
-                    'description': '\n'.join([self.tag_to_string(d) for d in description]),
-                }
-                {
-                    "Brève": breves,
-                    "Lien": liens,
-                    "Confidentiel": confidentiels,
-                }.get(article_type).append(summary)
-            except:
-                pass
-
-        # print 'La Une: ', len(la_une), ' articles'
-        # for a in la_une: print a["title"]
-        # print 'Brèves: ', len(breves), ' articles'
-        # print 'Revue web: ', len(liens), ' articles'
-        # print 'Confidentiel: ', len(confidentiels), ' articles'
-
-        articles += [('Brèves', breves)] if breves else []
-        articles += [('Revue du Web', liens)] if liens else []
-        articles += [('Confidentiel', confidentiels)] if confidentiels else []
-        return articles
-
-# -- print-version
-
-    conversion_options = {'smarten_punctuation' : True}
-
-    remove_tags = [dict(name='div', attrs={'class':'print-source_url'})]
-
-    # non-locale specific date parse (strptime("%d %b %Y",s) would work with french locale)
-    def parse_french_date(self, date_str):
-        date_arr = date_str.lower().split()
-        return date(day=int(date_arr[0]),
-                    year=int(date_arr[2]),
-                    month=[None, 'janvier', 'février', 'mars', 'avril', 'mai', 'juin', 'juillet',
-                           'août', 'septembre', 'octobre', 'novembre', 'décembre'].index(date_arr[1]))
-
-    def print_version(self, url):
-        raw = self.browser.open(url).read()
-        soup = BeautifulSoup(raw.decode('utf8', 'replace'))
-
-        # Filter old articles
-        article_date = self.parse_french_date(self.tag_to_string(soup.find('span', 'article-date')))
-
-        if article_date < self.oldest_article_date:
-            return None
-
-        tools = soup.find('div', {'class':'menu-tools'})
-        link = tools.find('a', {'href': re.compile(r'\/print\/.*')})
-        if link is None:
-            print 'Error: print link not found'
-            return None
-        return 'https://mediapart.fr/' + link['href']
-
-# -- Handle login
-    def get_browser(self):
-        br = BasicNewsRecipe.get_browser(self)
-        if self.username is not None and self.password is not None:
-            br.open('https://www.mediapart.fr/login')
-            br.select_form(nr=1)
-            br['name'] = self.username
-            br['password'] = self.password
-            br.submit()
-        return br
-
-    # This is a workaround for articles with scribd content that include
-    # <body> tags _within_ the body
-    preprocess_regexps = [
-        (re.compile(r'(<body.*?>)(.*)</body>', re.IGNORECASE|re.DOTALL),
-         lambda match:
-         match.group(1) + re.sub(
-             re.compile(r'</?body.*?>', re.IGNORECASE|re.DOTALL),'', match.group(2)) + '</body>')
-    ]
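
For reference, the locale-independent date parsing that the deleted `parse_french_date` method implements boils down to a French month-name lookup, avoiding any dependence on the process locale. A minimal standalone sketch in Python 3 (the `FRENCH_MONTHS` constant name is mine, not from the recipe):

```python
from datetime import date

# French month names; index 0 is a placeholder so that janvier == 1.
FRENCH_MONTHS = [None, 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
                 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre']

def parse_french_date(date_str):
    # '12 juillet 2012' -> datetime.date(2012, 7, 12), no locale switching needed
    day, month, year = date_str.lower().split()
    return date(int(year), FRENCH_MONTHS.index(month), int(day))

assert parse_french_date('12 Juillet 2012') == date(2012, 7, 12)
```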
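Similarly, the scribd workaround in `preprocess_regexps` keeps the outermost `<body>...</body>` pair and deletes any body tags nested inside it. A self-contained sketch of the same regex technique (the helper and pattern names are mine):

```python
import re

# Capture the opening <body> tag and everything up to the final </body>.
BODY_RE = re.compile(r'(<body.*?>)(.*)</body>', re.IGNORECASE | re.DOTALL)
# Match any opening or closing body tag, for cleaning the inner content.
INNER_BODY_RE = re.compile(r'</?body.*?>', re.IGNORECASE | re.DOTALL)

def strip_nested_body(html):
    # Re-emit the outer pair with all nested body tags removed from the middle.
    return BODY_RE.sub(
        lambda m: m.group(1) + INNER_BODY_RE.sub('', m.group(2)) + '</body>',
        html)

print(strip_nested_body('<body><p>a</p><body><p>b</p></body></body>'))
# -> <body><p>a</p><p>b</p></body>
```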