calibre/recipes/capital_de.recipe
Kovid Goyal 29cd8d64ea
Change shebangs to python from python2
Also remove a few other miscellaneous references to python2
2020-08-22 18:47:51 +05:30

84 lines
2.6 KiB
Python

#!/usr/bin/env python
# vim:fileencoding=utf-8
from __future__ import unicode_literals, division, absolute_import, print_function
'''
capital.de
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1305470859(BasicNewsRecipe):
    """Recipe that builds an e-book from the Capital.de RSS feed.

    The class configures feed, filtering and styling attributes and
    overrides two hooks: ``get_cover_url`` (scrapes the cover image from
    the publisher's shop page) and ``preprocess_html`` (cleans up each
    downloaded article).
    """

    title = 'Capital.de'
    __author__ = 'schuster'
    description = 'RSS-Feed von Capital.de'
    publisher = 'Gruner+Jahr GmbH & Co KG'
    language = 'de'

    oldest_article = 14
    max_articles_per_feed = 35
    no_stylesheets = True
    remove_javascript = True
    use_embedded_content = False
    conversion_options = {
        'smarten_punctuation': True,
        'publisher': publisher,
    }

    # Page that get_cover_url() parses to locate the cover image.
    cover_source = 'http://shop.capital.de/abos/capital/'
    masthead_url = 'http://www.capital.de/files/capital/layout/logo.png'

    feeds = [
        ('Capital.de', 'http://www.capital.de/partner-feeds/rss.xml'),
    ]

    # Only the main article container is kept ...
    keep_only_tags = [
        dict(name='div', attrs={
            'class': 'grid_8 alpha omega layout_full block'}),
    ]
    # ... and within it the article header and clearing <br> are dropped.
    remove_tags = [
        dict(name='div', attrs={'class': 'article_header'}),
        dict(name='br', attrs={'class': 'clear'}),
    ]
    remove_attributes = ['height', 'width']

    # Adjacent literals are concatenated; each rule ends with one space so
    # the resulting string is a single line of space-separated CSS rules.
    extra_css = (
        'h1 {font-size: 1.6em; text-align: left} '
        'h2 {font-size: 1em; text-align: left} '
        '.copyright {font-size: 0.6em} '
        '.caption {font-size: 0.6em}'
    )

    def get_cover_url(self):
        """Return the cover image URL scraped from ``cover_source``.

        Looks for a ``<span>`` whose class matches ``coverimage`` and takes
        the ``src`` of the first ``<img>`` inside it. The URL is also stored
        on ``self.cover_url``.

        Returns:
            The image URL, or ``None`` when the expected markup is not
            present on the page (a warning is logged in that case).
        """
        soup = self.index_to_soup(self.cover_source)
        img_span = soup.find('span', {'class': re.compile('coverimage')})
        # Guard both lookups: if the shop page layout changes, either one
        # can come back as None, and indexing it would raise.
        img = None
        if img_span is not None:
            img = img_span.find('img', src=True)
        if img is None:
            self.log.warn(
                'Capital.de: no cover image found at', self.cover_source)
            return None
        self.cover_url = img['src']
        return self.cover_url

    def preprocess_html(self, soup):
        """Clean up a downloaded article and return the modified soup.

        Steps, in order:
          1. Call ``self.abort_article()`` if any ``li.tag-chain-item``
             contains the text ``BILDERSTRECKE`` (picture gallery).
          2. Remove the ``ul.tag-chain`` tag list.
          3. Strip every ``style`` attribute.
          4. Drop the ``href`` of links whose target does not contain
             ``http`` (treated as site-local links).
          5. In ``div`` elements with class ``ce_text block`` that contain
             an ``<hr>``, remove all ``<hr>`` and ``<img>`` tags (author
             pictures).
        """
        # remove all articles without relevant content
        chain_items = soup.findAll('li', {'class': 'tag-chain-item'})
        if any('BILDERSTRECKE' in self.tag_to_string(li).upper()
               for li in chain_items):
            self.abort_article()

        # remove list of tags
        tag_list = soup.find('ul', {'class': 'tag-chain'})
        if tag_list:
            tag_list.extract()

        # remove all style attributes
        for item in soup.findAll(style=True):
            del item['style']

        # remove all local hyperlinks
        for a in soup.findAll('a', {'href': True}):
            if a['href'] and 'http' not in a['href']:
                del a['href']

        # remove picture(s) of author(s)
        for div in soup.findAll('div', {'class': 'ce_text block'}):
            if not div.find('hr'):
                continue
            for hr in div.findAll('hr'):
                hr.extract()
            for img in div.findAll('img'):
                img.extract()

        return soup