mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Update bild.de
This commit is contained in:
parent
d8f2808c6f
commit
1b7c86dbaf
@ -1,74 +1,85 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=utf-8
|
||||
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||
|
||||
'''
|
||||
bild.de
|
||||
'''
|
||||
|
||||
import re
|
||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||
|
||||
class AdvancedUserRecipe1303841067(BasicNewsRecipe):
|
||||
|
||||
title = u'Bild.de'
|
||||
title = 'Bild.de'
|
||||
__author__ = 'schuster'
|
||||
oldest_article = 1
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets = True
|
||||
use_embedded_content = False
|
||||
language = 'de'
|
||||
remove_javascript = True
|
||||
description = 'RSS-Feeds von Bild.de'
|
||||
language = 'de'
|
||||
|
||||
oldest_article = 1
|
||||
max_articles_per_feed = 100
|
||||
no_stylesheets = True
|
||||
remove_javascript = True
|
||||
use_embedded_content = False
|
||||
remove_empty_feeds = True
|
||||
|
||||
# get cover from myspace
|
||||
cover_url = 'http://a3.l3-images.myspacecdn.com/images02/56/0232f842170b4d349779f8379c27e073/l.jpg'
|
||||
masthead_url = 'http://a3.l3-images.myspacecdn.com/images02/56/0232f842170b4d349779f8379c27e073/l.jpg'
|
||||
|
||||
# set what to fetch on the site
|
||||
remove_tags_before = dict(name = 'h2', attrs={'id':'cover'})
|
||||
remove_tags_after = dict(name ='div', attrs={'class':'back'})
|
||||
# By default, no local news feeds will be fetched. To change this,
|
||||
# just uncomment the lines for the regions you are interested in.
|
||||
feeds = [
|
||||
('Politik', 'http://www.bild.de/rss-feeds/rss-16725492,feed=politik.bild.html'),
|
||||
('Unterhaltung', 'http://www.bild.de/rss-feeds/rss-16725492,feed=unterhaltung.bild.html'),
|
||||
('Sport', 'http://www.bild.de/rss-feeds/rss-16725492,feed=sport.bild.html'),
|
||||
('Lifestyle', 'http://www.bild.de/rss-feeds/rss-16725492,feed=lifestyle.bild.html'),
|
||||
('Ratgeber', 'http://www.bild.de/rss-feeds/rss-16725492,feed=ratgeber.bild.html'),
|
||||
('Auto', 'http://www.bild.de/rss-feeds/rss-16725492,feed=auto.bild.html'),
|
||||
('Digital', 'http://www.bild.de/rss-feeds/rss-16725492,feed=digital.bild.html'),
|
||||
('Spiele', 'http://www.bild.de/rss-feeds/rss-16725492,feed=spiele.bild.html'),
|
||||
('Leserreporter', 'http://www.bild.de/rss-feeds/rss-16725492,feed=leserreporter.bild.html'),
|
||||
# ('Berlin', 'http://www.bild.de/rss-feeds/rss-16725492,feed=Newsticker.bild.html'),
|
||||
# ('Bremen', 'http://www.bild.de/rss-feeds/rss-16725492,feed=bremen.bild.html'),
|
||||
# ('Chemnitz', 'http://www.bild.de/rssfeeds/rss3/rss3-20745882,feed=ressort-regio-chemnitz.bild.html'),
|
||||
# ('Dresden', 'http://www.bild.de/rss-feeds/rss-16725492,feed=dresden.bild.html'),
|
||||
# ('Düsseldorf', 'http://www.bild.de/rss-feeds/rss-16725492,feed=duesseldorf.bild.html'),
|
||||
# ('Frankfurt/Main', 'http://www.bild.de/rss-feeds/rss-16725492,feed=regio-frankfurt.bild.html'),
|
||||
# ('Hamburg', 'http://www.bild.de/rss-feeds/rss-16725492,feed=hamburg.bild.html'),
|
||||
# ('Hannover', 'http://www.bild.de/rss-feeds/rss-16725492,feed=regio-hannover.bild.html'),
|
||||
# ('Köln', 'http://www.bild.de/rss-feeds/rss-16725492,feed=regio-koeln.bild.html'),
|
||||
# ('Leipzig', 'http://www.bild.de/rss-feeds/rss-16725492,feed=leipzig.bild.html'),
|
||||
# ('München', 'http://www.bild.de/rss-feeds/rss-16725492,feed=muenchen.bild.html'),
|
||||
# ('Ruhrgebiet', 'http://www.bild.de/rss-feeds/rss-16725492,feed=ruhrgebiet.bild.html'),
|
||||
# ('Saarland', 'http://www.bild.de/rssfeeds/rss3/rss3-20745882,feed=regional-saarland.bild.html'),
|
||||
# ('Stuttgart', 'http://www.bild.de/rss-feeds/rss-16725492,feed=regio-stuttgart.bild.html')
|
||||
]
|
||||
|
||||
keep_only_tags = [
|
||||
dict(name='article')
|
||||
]
|
||||
|
||||
# remove things on the site that we don't want
|
||||
remove_tags = [dict(name='div', attrs={'class':'credit'}),
|
||||
dict(name='div', attrs={'class':'index'}),
|
||||
dict(name='div', attrs={'id':'zstart31'}),
|
||||
dict(name='div', attrs={'class':'hentry'}),
|
||||
dict(name='div', attrs={'class':'back'}),
|
||||
dict(name='div', attrs={'class':'pagination'}),
|
||||
dict(name='div', attrs={'class':'header'}),
|
||||
dict(name='div', attrs={'class':'element floatL'}),
|
||||
dict(name='div', attrs={'class':'stWrap'})
|
||||
]
|
||||
remove_tags = [
|
||||
dict(name=['aside', 'iframe']),
|
||||
dict(attrs={'class':['socialbar', 'social-sharing flank', 'vel', 'back']}),
|
||||
dict(name='img', attrs={'alt':'logo'}),
|
||||
dict(name='div', attrs={'class':re.compile('infoEl')}),
|
||||
dict(name='span', attrs={'class':re.compile('loupe')})
|
||||
]
|
||||
|
||||
remove_tags_after = [
|
||||
dict(name='div', attrs={'itemprop':re.compile('articleBody')})
|
||||
]
|
||||
|
||||
# thanx to kiklop74 for code (see sticky thread -> Recipes - Re-usable code)
|
||||
# this one removes a lot of direct-link's
|
||||
def preprocess_html(self, soup):
|
||||
for alink in soup.findAll('a'):
|
||||
if alink.string is not None:
|
||||
tstr = alink.string
|
||||
alink.replaceWith(tstr)
|
||||
# skip articles without relevant content
|
||||
if not soup.find('article'):
|
||||
self.abort_article()
|
||||
# remove all style attributes
|
||||
for item in soup.findAll(attrs={'style':True}):
|
||||
del item['style']
|
||||
# remove <br> within headlines
|
||||
for h1 in soup.findAll('h1'):
|
||||
for br in h1.findAll('br'):
|
||||
br.replaceWith(' ')
|
||||
# remove all links
|
||||
for a in soup.findAll('a'):
|
||||
a.replaceWith(a.renderContents())
|
||||
return soup
|
||||
|
||||
# remove the ad's
|
||||
filter_regexps = [r'.\.smartadserver\.com']
|
||||
def skip_ad_pages(self, soup):
|
||||
return None
|
||||
|
||||
#get the real url behind .feedsportal.com and fetch the artikels
|
||||
def get_article_url(self, article):
|
||||
return article.get('id', article.get('guid', None))
|
||||
|
||||
#list of the rss source from www.bild.de
|
||||
feeds = [(u'Überblick', u'http://rss.bild.de/bild.xml'),
|
||||
(u'News', u'http://rss.bild.de/bild-news.xml'),
|
||||
(u'Politik', u'http://rss.bild.de/bild-politik.xml'),
|
||||
(u'Unterhaltung', u'http://rss.bild.de/bild-unterhaltung.xml'),
|
||||
(u'Sport', u'http://rss.bild.de/bild-sport.xml'),
|
||||
(u'Lifestyle', u'http://rss.bild.de/bild-lifestyle.xml'),
|
||||
(u'Ratgeber', u'http://rss.bild.de/bild-ratgeber.xml'),
|
||||
(u'Reg. - Berlin', u'http://rss.bild.de/bild-berlin.xml'),
|
||||
(u'Reg. - Bremen', u'http://rss.bild.de/bild-bremen.xml'),
|
||||
(u'Reg. - Dresden', u'http://rss.bild.de/bild-dresden.xml'),
|
||||
(u'Reg. - Düsseldorf', u'http://rss.bild.de/bild-duesseldorf.xml'),
|
||||
(u'Reg. - Frankfurt-Main', u'http://rss.bild.de/bild-frankfurt-main.xml'),
|
||||
(u'Reg. - Hamburg', u'http://rss.bild.de/bild-hamburg.xml'),
|
||||
(u'Reg. - Hannover', u'http://rss.bild.de/bild-hannover.xml'),
|
||||
(u'Reg. - Köln', u'http://rss.bild.de/bild-koeln.xml'),
|
||||
(u'Reg. - Leipzig', u'http://rss.bild.de/bild-leipzig.xml'),
|
||||
(u'Reg. - München', u'http://rss.bild.de/bild-muenchen.xml'),
|
||||
(u'Reg. - Ruhrgebiet', u'http://rss.bild.de/bild-ruhrgebiet.xml'),
|
||||
(u'Reg. - Stuttgart', u'http://rss.bild.de/bild-stuttgart.xml')
|
||||
]
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user