#!/usr/bin/env python
# vim:fileencoding=utf-8
from __future__ import unicode_literals, division, absolute_import, print_function

'''
capital.de
'''

import re

from calibre.web.feeds.news import BasicNewsRecipe


class AdvancedUserRecipe1305470859(BasicNewsRecipe):
    """Recipe that builds an e-book from the Capital.de RSS feed."""

    title = 'Capital.de'
    __author__ = 'schuster'
    description = 'RSS-Feed von Capital.de'
    publisher = 'Gruner+Jahr GmbH & Co KG'
    language = 'de'

    oldest_article = 14
    max_articles_per_feed = 35
    no_stylesheets = True
    remove_javascript = True
    use_embedded_content = False

    conversion_options = {'smarten_punctuation': True,
                          'publisher': publisher}

    # Page that get_cover_url() scrapes for the cover image.
    cover_source = 'http://shop.capital.de/abos/capital/'
    masthead_url = 'http://www.capital.de/files/capital/layout/logo.png'

    feeds = [
        ('Capital.de', 'http://www.capital.de/partner-feeds/rss.xml')
    ]

    keep_only_tags = [
        dict(name='div', attrs={
             'class': 'grid_8 alpha omega layout_full block'})
    ]

    remove_tags = [
        dict(name='div', attrs={'class': 'article_header'}),
        dict(name='br', attrs={'class': 'clear'})
    ]

    remove_attributes = ['height', 'width']

    extra_css = 'h1 {font-size: 1.6em; text-align: left} \
                 h2 {font-size: 1em; text-align: left} \
                 .copyright {font-size: 0.6em} \
                 .caption {font-size: 0.6em}'

    def get_cover_url(self):
        """Look up the cover image URL on the ``cover_source`` page.

        The URL is the ``src`` of the first ``<img>`` inside the first
        ``<span>`` whose class matches ``coverimage``. On success it is
        also stored in ``self.cover_url``.

        Returns:
            The image URL, or ``None`` when the page does not contain the
            expected span/img pair (e.g. after a site redesign).
        """
        soup = self.index_to_soup(self.cover_source)
        img_span = soup.find('span', {'class': re.compile('coverimage')})
        # Guard both lookups: find() returns None when nothing matches, and
        # dereferencing that would raise instead of skipping the cover.
        img = img_span.find('img', src=True) if img_span is not None else None
        if img is None:
            self.log.warn('No cover image found at', self.cover_source)
            return None
        self.cover_url = img['src']
        return self.cover_url

    def preprocess_html(self, soup):
        """Clean up a downloaded article before conversion.

        Aborts articles tagged as picture galleries ("BILDERSTRECKE"), then
        removes the tag list, inline styles, hrefs that do not contain
        ``http``, and the rulers/images inside ``ce_text block`` divs that
        contain a ruler.

        Args:
            soup: Parsed article HTML.

        Returns:
            The same soup object, modified in place.
        """
        # remove all articles without relevant content
        for li in soup.findAll('li', {'class': 'tag-chain-item'}):
            if 'BILDERSTRECKE' in self.tag_to_string(li).upper():
                self.abort_article()

        # remove list of tags
        tag_list = soup.find('ul', {'class': 'tag-chain'})
        if tag_list:
            tag_list.extract()

        # remove all style attributes
        for item in soup.findAll(style=True):
            del item['style']

        # remove all local hyperlinks
        for a in soup.findAll('a', {'href': True}):
            if a['href'] and 'http' not in a['href']:
                del a['href']

        # remove picture(s) of author(s)
        for div in soup.findAll('div', {'class': 'ce_text block'}):
            rulers = div.findAll('hr')
            # Only divs that contain a ruler are treated as author boxes.
            if rulers:
                for hr in rulers:
                    hr.extract()
                for img in div.findAll('img'):
                    img.extract()

        return soup