mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
88 lines
4.1 KiB
Python
88 lines
4.1 KiB
Python
#!/usr/bin/env python
|
|
# vim:fileencoding=utf-8
|
|
from __future__ import unicode_literals, division, absolute_import, print_function
|
|
|
|
'''
|
|
bild.de
|
|
'''
|
|
|
|
import re
|
|
from calibre.web.feeds.recipes import BasicNewsRecipe
|
|
|
|
|
|
class AdvancedUserRecipe1303841067(BasicNewsRecipe):
    """News recipe that downloads articles from the RSS feeds of bild.de.

    The class only configures ``BasicNewsRecipe`` through class attributes
    (feed list, tag filters) and post-processes every downloaded article in
    ``preprocess_html``.
    """

    title = 'Bild.de'
    __author__ = 'schuster'
    description = 'RSS-Feeds von Bild.de'
    language = 'de'

    oldest_article = 1
    max_articles_per_feed = 100
    no_stylesheets = True
    remove_javascript = True
    use_embedded_content = False
    remove_empty_feeds = True

    masthead_url = 'http://a3.l3-images.myspacecdn.com/images02/56/0232f842170b4d349779f8379c27e073/l.jpg'

    # By default, no local news feeds will be fetched. To change this,
    # just uncomment the lines for the regions you are interested in.
    feeds = [
        ('Politik', 'http://www.bild.de/rss-feeds/rss-16725492,feed=politik.bild.html'),
        ('Unterhaltung', 'http://www.bild.de/rss-feeds/rss-16725492,feed=unterhaltung.bild.html'),
        ('Sport', 'http://www.bild.de/rss-feeds/rss-16725492,feed=sport.bild.html'),
        ('Lifestyle', 'http://www.bild.de/rss-feeds/rss-16725492,feed=lifestyle.bild.html'),
        ('Ratgeber', 'http://www.bild.de/rss-feeds/rss-16725492,feed=ratgeber.bild.html'),
        ('Auto', 'http://www.bild.de/rss-feeds/rss-16725492,feed=auto.bild.html'),
        ('Digital', 'http://www.bild.de/rss-feeds/rss-16725492,feed=digital.bild.html'),
        ('Spiele', 'http://www.bild.de/rss-feeds/rss-16725492,feed=spiele.bild.html'),
        ('Leserreporter', 'http://www.bild.de/rss-feeds/rss-16725492,feed=leserreporter.bild.html'),
        # ('Berlin', 'http://www.bild.de/rss-feeds/rss-16725492,feed=Newsticker.bild.html'),
        # ('Bremen', 'http://www.bild.de/rss-feeds/rss-16725492,feed=bremen.bild.html'),
        # ('Chemnitz', 'http://www.bild.de/rssfeeds/rss3/rss3-20745882,feed=ressort-regio-chemnitz.bild.html'),
        # ('Dresden', 'http://www.bild.de/rss-feeds/rss-16725492,feed=dresden.bild.html'),
        # ('Düsseldorf', 'http://www.bild.de/rss-feeds/rss-16725492,feed=duesseldorf.bild.html'),
        # ('Frankfurt/Main', 'http://www.bild.de/rss-feeds/rss-16725492,feed=regio-frankfurt.bild.html'),
        # ('Hamburg', 'http://www.bild.de/rss-feeds/rss-16725492,feed=hamburg.bild.html'),
        # ('Hannover', 'http://www.bild.de/rss-feeds/rss-16725492,feed=regio-hannover.bild.html'),
        # ('Köln', 'http://www.bild.de/rss-feeds/rss-16725492,feed=regio-koeln.bild.html'),
        # ('Leipzig', 'http://www.bild.de/rss-feeds/rss-16725492,feed=leipzig.bild.html'),
        # ('München', 'http://www.bild.de/rss-feeds/rss-16725492,feed=muenchen.bild.html'),
        # ('Ruhrgebiet', 'http://www.bild.de/rss-feeds/rss-16725492,feed=ruhrgebiet.bild.html'),
        # ('Saarland', 'http://www.bild.de/rssfeeds/rss3/rss3-20745882,feed=regional-saarland.bild.html'),
        # ('Stuttgart', 'http://www.bild.de/rss-feeds/rss-16725492,feed=regio-stuttgart.bild.html')
    ]

    # Only the <article> element of each page is kept.
    keep_only_tags = [
        dict(name='article')
    ]

    # Page furniture that is dropped from the kept <article>.
    remove_tags = [
        dict(name=['aside', 'iframe']),
        dict(
            attrs={'class': ['socialbar', 'social-sharing flank', 'vel', 'back']}),
        dict(name='img', attrs={'alt': 'logo'}),
        dict(name='div', attrs={'class': re.compile(r'infoEl')}),
        dict(name='span', attrs={'class': re.compile(r'loupe')})
    ]

    remove_tags_after = [
        dict(name='div', attrs={'itemprop': re.compile(r'articleBody')})
    ]

    def preprocess_html(self, soup):
        """Clean up one downloaded article page and return the soup.

        Steps, in order:

        * abort the article when the page has no ``<article>`` element;
        * delete every inline ``style`` attribute;
        * replace ``<br>`` inside ``<h1>`` headlines with a single space;
        * strip all ``<a>`` wrappers while keeping whatever they contain.

        :param soup: parsed HTML of the article page (modified in place).
        :return: the same ``soup`` object.
        """
        # skip articles without relevant content
        if not soup.find('article'):
            self.abort_article()

        # remove all style attributes
        for styled in soup.findAll(attrs={'style': True}):
            del styled['style']

        # remove <br> within headlines
        for headline in soup.findAll('h1'):
            for line_break in headline.findAll('br'):
                line_break.replaceWith(' ')

        # remove all links, keeping their children in the tree.
        # Re-inserting the serialised contents as a string would create a
        # plain text node under Beautiful Soup 4: nested tags would show up
        # as literal text and entities such as '&amp;' would be escaped a
        # second time on output. unwrap() avoids both problems.
        for link in soup.findAll('a'):
            link.unwrap()

        return soup
|