__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic'
'''
elsevier.nl
'''

from calibre.web.feeds.news import BasicNewsRecipe


class ElsevierNL(BasicNewsRecipe):
    title = 'Elsevier.nl'
    __author__ = 'Darko Miletic'
    description = 'News from Holland'
    publisher = 'elsevier.nl'
    category = 'news, politics, Holland'
    oldest_article = 2
    max_articles_per_feed = 200
    no_stylesheets = True
    encoding = 'utf-8'
    use_embedded_content = False
    language = 'nl'
    country = 'NL'
    remove_empty_feeds = True
    masthead_url = 'http://www.elsevier.nl/static/elsevier/stdimg/logo.gif'
    extra_css = ' body{font-family: Arial,Helvetica,sans-serif } img{margin-bottom: 0.4em} '

    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language
    }

    keep_only_tags = [dict(attrs={'id': 'artikel_container'})]
    remove_tags_before = dict(attrs={'id': 'breadcrumb_container'})
    remove_tags_after = dict(attrs={'class': 'author_link'})
    remove_tags = [
        dict(attrs={'id': 'breadcrumb_container'}),
        dict(name='div', attrs={'class': 'pullout_vak'})
    ]
    remove_attributes = ['width', 'height']

    feeds = [
        (u'Laatste nieuws', u'http://www.elsevier.nl/web/RSS/Homepage-RSS.htm?output=xml'),
        (u'Nederland', u'http://www.elsevier.nl/web/RSS/Nederland-RSS.htm?output=xml'),
        (u'Politiek', u'http://www.elsevier.nl/web/RSS/Politiek-RSS.htm?output=xml'),
        (u'Europese Unie', u'http://www.elsevier.nl/web/RSS/Europese-Unie-RSS.htm?output=xml'),
        (u'Buitenland', u'http://www.elsevier.nl/web/RSS/Buitenland-RSS.htm?output=xml'),
        (u'Economie', u'http://www.elsevier.nl/web/RSS/Economie-RSS.htm?output=xml'),
        (u'Wetenschap', u'http://www.elsevier.nl/web/RSS/Wetenschap-RSS.htm?output=xml'),
        (u'Cultuur & Televisie', u'http://www.elsevier.nl/web/RSS/Cultuur-Televisie-RSS.htm?output=xml'),
        (u'Society', u'http://www.elsevier.nl/web/RSS/Society-RSS.htm?output=xml'),
        (u'Internet & Gadgets', u'http://www.elsevier.nl/web/RSS/Internet-Gadgets-RSS.htm?output=xml'),
        (u'Commentaren', u'http://www.elsevier.nl/web/RSS/Commentaren-RSS.htm?output=xml')
    ]

    def print_version(self, url):
        # Request the printer-friendly version of each article
        return url + '?print=true'

    def get_article_url(self, article):
        # The guid field carries the canonical article URL; strip any query string.
        # partition() returns the full URL unchanged when no '?' is present.
        url = article.get('guid', None)
        if url is None:
            return None
        return url.partition('?')[0]

    def preprocess_html(self, soup):
        # Drop inline style attributes so extra_css controls the layout
        for item in soup.findAll(style=True):
            del item['style']
        return soup
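
# A minimal way to try the recipe from the command line, assuming calibre's CLI
# tools are installed and this file is saved as elsevier_nl.recipe (the filename
# is only an example, not prescribed by the recipe itself):
#
#   ebook-convert elsevier_nl.recipe elsevier_nl.epub --test
#
# The --test switch downloads only a couple of articles per feed, which keeps the
# feedback loop short while adjusting keep_only_tags and remove_tags.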