diff --git a/recipes/bild_de.recipe b/recipes/bild_de.recipe index 7434b1d47b..7709b67bfb 100644 --- a/recipes/bild_de.recipe +++ b/recipes/bild_de.recipe @@ -1,74 +1,85 @@ -# -*- coding: utf-8 -*- +#!/usr/bin/env python2 +# vim:fileencoding=utf-8 +from __future__ import unicode_literals, division, absolute_import, print_function + +''' +bild.de +''' + +import re from calibre.web.feeds.recipes import BasicNewsRecipe + class AdvancedUserRecipe1303841067(BasicNewsRecipe): - - title = u'Bild.de' + title = 'Bild.de' __author__ = 'schuster' - oldest_article = 1 - max_articles_per_feed = 100 - no_stylesheets = True - use_embedded_content = False - language = 'de' - remove_javascript = True + description = 'RSS-Feeds von Bild.de' + language = 'de' + + oldest_article = 1 + max_articles_per_feed = 100 + no_stylesheets = True + remove_javascript = True + use_embedded_content = False + remove_empty_feeds = True -# get cover from myspace - cover_url = 'http://a3.l3-images.myspacecdn.com/images02/56/0232f842170b4d349779f8379c27e073/l.jpg' masthead_url = 'http://a3.l3-images.myspacecdn.com/images02/56/0232f842170b4d349779f8379c27e073/l.jpg' -# set what to fetch on the site - remove_tags_before = dict(name = 'h2', attrs={'id':'cover'}) - remove_tags_after = dict(name ='div', attrs={'class':'back'}) + # By default, no local news feeds will be fetched. To change this, + # just uncomment the lines for the regions you are interested in. 
+ feeds = [ + ('Politik', 'http://www.bild.de/rss-feeds/rss-16725492,feed=politik.bild.html'), + ('Unterhaltung', 'http://www.bild.de/rss-feeds/rss-16725492,feed=unterhaltung.bild.html'), + ('Sport', 'http://www.bild.de/rss-feeds/rss-16725492,feed=sport.bild.html'), + ('Lifestyle', 'http://www.bild.de/rss-feeds/rss-16725492,feed=lifestyle.bild.html'), + ('Ratgeber', 'http://www.bild.de/rss-feeds/rss-16725492,feed=ratgeber.bild.html'), + ('Auto', 'http://www.bild.de/rss-feeds/rss-16725492,feed=auto.bild.html'), + ('Digital', 'http://www.bild.de/rss-feeds/rss-16725492,feed=digital.bild.html'), + ('Spiele', 'http://www.bild.de/rss-feeds/rss-16725492,feed=spiele.bild.html'), + ('Leserreporter', 'http://www.bild.de/rss-feeds/rss-16725492,feed=leserreporter.bild.html'), +# ('Berlin', 'http://www.bild.de/rss-feeds/rss-16725492,feed=Newsticker.bild.html'), +# ('Bremen', 'http://www.bild.de/rss-feeds/rss-16725492,feed=bremen.bild.html'), +# ('Chemnitz', 'http://www.bild.de/rssfeeds/rss3/rss3-20745882,feed=ressort-regio-chemnitz.bild.html'), +# ('Dresden', 'http://www.bild.de/rss-feeds/rss-16725492,feed=dresden.bild.html'), +# ('Düsseldorf', 'http://www.bild.de/rss-feeds/rss-16725492,feed=duesseldorf.bild.html'), +# ('Frankfurt/Main', 'http://www.bild.de/rss-feeds/rss-16725492,feed=regio-frankfurt.bild.html'), +# ('Hamburg', 'http://www.bild.de/rss-feeds/rss-16725492,feed=hamburg.bild.html'), +# ('Hannover', 'http://www.bild.de/rss-feeds/rss-16725492,feed=regio-hannover.bild.html'), +# ('Köln', 'http://www.bild.de/rss-feeds/rss-16725492,feed=regio-koeln.bild.html'), +# ('Leipzig', 'http://www.bild.de/rss-feeds/rss-16725492,feed=leipzig.bild.html'), +# ('München', 'http://www.bild.de/rss-feeds/rss-16725492,feed=muenchen.bild.html'), +# ('Ruhrgebiet', 'http://www.bild.de/rss-feeds/rss-16725492,feed=ruhrgebiet.bild.html'), +# ('Saarland', 'http://www.bild.de/rssfeeds/rss3/rss3-20745882,feed=regional-saarland.bild.html'), +# ('Stuttgart', 
'http://www.bild.de/rss-feeds/rss-16725492,feed=regio-stuttgart.bild.html') + ] + keep_only_tags = [ + dict(name='article') + ] -# remove things on the site that we don't want - remove_tags = [dict(name='div', attrs={'class':'credit'}), - dict(name='div', attrs={'class':'index'}), - dict(name='div', attrs={'id':'zstart31'}), - dict(name='div', attrs={'class':'hentry'}), - dict(name='div', attrs={'class':'back'}), - dict(name='div', attrs={'class':'pagination'}), - dict(name='div', attrs={'class':'header'}), - dict(name='div', attrs={'class':'element floatL'}), - dict(name='div', attrs={'class':'stWrap'}) -] + remove_tags = [ + dict(name=['aside', 'iframe']), + dict(attrs={'class':['socialbar', 'social-sharing flank', 'vel', 'back']}), + dict(name='img', attrs={'alt':'logo'}), + dict(name='div', attrs={'class':re.compile('infoEl')}), + dict(name='span', attrs={'class':re.compile('loupe')}) + ] + + remove_tags_after = [ + dict(name='div', attrs={'itemprop':re.compile('articleBody')}) + ] -# thanx to kiklop74 for code (see sticky thread -> Recipes - Re-usable code) -# this one removes a lot of direct-link's def preprocess_html(self, soup): - for alink in soup.findAll('a'): - if alink.string is not None: - tstr = alink.string - alink.replaceWith(tstr) + # skip articles without relevant content + if not soup.find('article'): + self.abort_article() + # remove all style attributes + for item in soup.findAll(attrs={'style':True}): + del item['style'] + # remove <br> within headlines + for h1 in soup.findAll('h1'): + for br in h1.findAll('br'): + br.replaceWith(' ') + # remove all links + for a in soup.findAll('a'): + a.replaceWith(a.renderContents()) return soup - -# remove the ad's - filter_regexps = [r'.\.smartadserver\.com'] - def skip_ad_pages(self, soup): - return None - -#get the real url behind .feedsportal.com and fetch the artikels - def get_article_url(self, article): - return article.get('id', article.get('guid', None)) - -#list of the rss source from www.bild.de - feeds = [(u'Überblick', u'http://rss.bild.de/bild.xml'), - (u'News', u'http://rss.bild.de/bild-news.xml'), - (u'Politik', u'http://rss.bild.de/bild-politik.xml'), - (u'Unterhaltung', u'http://rss.bild.de/bild-unterhaltung.xml'), - (u'Sport', u'http://rss.bild.de/bild-sport.xml'), - (u'Lifestyle', u'http://rss.bild.de/bild-lifestyle.xml'), - (u'Ratgeber', u'http://rss.bild.de/bild-ratgeber.xml'), - (u'Reg. - Berlin', u'http://rss.bild.de/bild-berlin.xml'), - (u'Reg. - Bremen', u'http://rss.bild.de/bild-bremen.xml'), - (u'Reg. - Dresden', u'http://rss.bild.de/bild-dresden.xml'), - (u'Reg. - Düsseldorf', u'http://rss.bild.de/bild-duesseldorf.xml'), - (u'Reg. - Frankfurt-Main', u'http://rss.bild.de/bild-frankfurt-main.xml'), - (u'Reg. - Hamburg', u'http://rss.bild.de/bild-hamburg.xml'), - (u'Reg. - Hannover', u'http://rss.bild.de/bild-hannover.xml'), - (u'Reg. - Köln', u'http://rss.bild.de/bild-koeln.xml'), - (u'Reg. - Leipzig', u'http://rss.bild.de/bild-leipzig.xml'), - (u'Reg. - München', u'http://rss.bild.de/bild-muenchen.xml'), - (u'Reg. - Ruhrgebiet', u'http://rss.bild.de/bild-ruhrgebiet.xml'), - (u'Reg. - Stuttgart', u'http://rss.bild.de/bild-stuttgart.xml') -] -