From 6b967c1f064be5caf892dfc9fc88cc9058197936 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sun, 1 May 2016 14:28:22 +0530
Subject: [PATCH] Update Capital.de

---
 recipes/capital_de.recipe | 114 ++++++++++++++++++++++----------------
 1 file changed, 67 insertions(+), 47 deletions(-)

diff --git a/recipes/capital_de.recipe b/recipes/capital_de.recipe
index 6826049bc9..3fe4b4cc66 100644
--- a/recipes/capital_de.recipe
+++ b/recipes/capital_de.recipe
@@ -1,61 +1,81 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+from __future__ import unicode_literals, division, absolute_import, print_function
+
+'''
+capital.de
+'''
+
+import re
 from calibre.web.feeds.news import BasicNewsRecipe
 
+
 class AdvancedUserRecipe1305470859(BasicNewsRecipe):
-    title = u'Capital.de'
-    language = 'de'
-    __author__ = 'schuster'
-    oldest_article =7
+    title = 'Capital.de'
+    __author__ = 'schuster'
+    description = 'RSS-Feed von Capital.de'
+    publisher = 'Gruner+Jahr GmbH & Co KG'
+    language = 'de'
+
+    oldest_article = 14
     max_articles_per_feed = 35
 
     no_stylesheets = True
     remove_javascript = True
     use_embedded_content = False
-    masthead_url = 'http://www.wirtschaftsmedien-shop.de/media/stores/wirtschaftsmedien/capital/teaser_large_abo.jpg'
-    cover_url = 'http://d1kb9jvg6ylufe.cloudfront.net/WebsiteCMS/de/unternehmen/linktipps/mainColumn/08/image/DE_Capital_bis20mm_SW.jpg'
-    extra_css = '''
-    h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
-    h4{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
-    img {min-width:300px; max-width:600px; min-height:300px; max-height:800px}
-    p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
-    body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
-    '''
-    def print_version(self, url):
-        return url.replace ('nv=rss#utm_source=rss2&utm_medium=rss_feed&utm_campaign=/', 'mode=print')
-    remove_tags_bevor = [dict(name='td', attrs={'class':'textcell'})]
-    remove_tags_after = [dict(name='div', attrs={'class':'artikelsplit'})]
+    conversion_options = {'smarten_punctuation' : True,
+                          'publisher' : publisher}
 
-    feeds = [ (u'Wirtschaftsmagazin', u'http://www.capital.de/rss/'),
-              (u'Unternehmen', u'http://www.capital.de/rss/unternehmen'),
-              (u'Finanz & Geldanlage', u'http://www.capital.de/rss/finanzen/geldanlage')]
+    cover_source = 'http://shop.capital.de/abos/capital/'
+    masthead_url = 'http://www.capital.de/files/capital/layout/logo.png'
 
-    def append_page(self, soup, appendtag, position):
-        pager = soup.find('div',attrs={'class':'artikelsplit'})
-        if pager:
-            nexturl = self.INDEX + pager.a['href']
-            soup2 = self.index_to_soup(nexturl)
-            texttag = soup2.find('div', attrs={'class':'printable'})
-            for it in texttag.findAll(style=True):
-                del it['style']
-            newpos = len(texttag.contents)
-            self.append_page(soup2,texttag,newpos)
-            texttag.extract()
-            appendtag.insert(position,texttag)
+    feeds = [
+        ('Capital.de', 'http://www.capital.de/partner-feeds/rss.xml')
+    ]
+    keep_only_tags = [
+        dict(name='div', attrs={'class':'grid_8 alpha omega layout_full block'})
+    ]
+
+    remove_tags = [
+        dict(name='div', attrs={'class':'article_header'}),
+        dict(name='br', attrs={'class':'clear'})
+    ]
+
+    remove_attributes = ['height', 'width']
+
+    extra_css = 'h1 {font-size: 1.6em; text-align: left} \
+                 h2 {font-size: 1em; text-align: left} \
+                 .copyright {font-size: 0.6em} \
+                 .caption {font-size: 0.6em}'
+
+    def get_cover_url(self):
+        soup = self.index_to_soup(self.cover_source)
+        img_span = soup.find('span', {'class':re.compile('coverimage')})
+        self.cover_url = img_span.find('img', src=True)['src']
+        return self.cover_url
 
     def preprocess_html(self, soup):
+        # remove all articles without relevant content
+        tags = soup.findAll('li', {'class':'tag-chain-item'})
+        for li in tags:
+            if 'BILDERSTRECKE' in self.tag_to_string(li).upper():
+                self.abort_article()
+        # remove list of tags
+        tags = soup.find('ul', {'class':'tag-chain'})
+        if tags:
+            tags.extract()
+        # remove all style attributes
         for item in soup.findAll(style=True):
             del item['style']
-        for item in soup.findAll('div', attrs={'class':'artikelsplit'}):
-            item.extract()
-        self.append_page(soup, soup.body, 3)
-        pager = soup.find('div',attrs={'class':'artikelsplit'})
-        if pager:
-            pager.extract()
-        return self.adeify_images(soup)
-
-
-
-    remove_tags = [dict(attrs={'class':['navSeitenAlle', 'kommentieren', 'teaserheader', 'teasercontent', 'info', 'zwischenhead', 'artikelsplit']}),
-                   dict(id=['topNav', 'mainNav', 'subNav', 'socialmedia', 'footerRahmen', 'gatrixx_marktinformationen', 'pager', 'weitere']),
-                   dict(span=['ratingtext', 'Gesamtranking', 'h3','']),
-                   dict(rel=['canonical'])]
-
+        # remove all local hyperlinks
+        for a in soup.findAll('a', {'href':True}):
+            if a['href'] and 'http' not in a['href']:
+                del a['href']
+        # remove picture(s) of author(s)
+        for div in soup.findAll('div', {'class':'ce_text block'}):
+            if div.find('hr'):
+                for hr in div.findAll('hr'):
+                    hr.extract()
+                for img in div.findAll('img'):
+                    img.extract()
+        return soup
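
Note: a quick way to sanity-check the updated recipe locally is calibre's recipe test mode, assuming a working calibre installation (the output filename below is arbitrary):

    ebook-convert recipes/capital_de.recipe capital_de.epub --test -vv

The --test flag limits the download to a couple of articles per feed and -vv prints verbose logs, which is enough to confirm that the new partner-feeds URL, the keep_only_tags selector and get_cover_url() still resolve against capital.de.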