from calibre.web.feeds.news import BasicNewsRecipe


class AdvancedUserRecipe1305470859(BasicNewsRecipe):
    """Calibre recipe that downloads articles from the Capital.de RSS feeds.

    Article URLs are rewritten to the site's print view, inline styles are
    stripped, and navigation/teaser/pager elements are removed.
    """

    title = u'Capital.de'
    language = 'de'
    __author__ = 'schuster'
    oldest_article = 7
    max_articles_per_feed = 35
    no_stylesheets = True
    remove_javascript = True
    use_embedded_content = False
    masthead_url = 'http://www.wirtschaftsmedien-shop.de/media/stores/wirtschaftsmedien/capital/teaser_large_abo.jpg'
    cover_url = 'http://d1kb9jvg6ylufe.cloudfront.net/WebsiteCMS/de/unternehmen/linktipps/mainColumn/08/image/DE_Capital_bis20mm_SW.jpg'

    # Prefix for the pager links followed by append_page(). It was referenced
    # there but never defined; the value is the host shared by all feed URLs.
    # NOTE(review): assumes pager hrefs are site-relative ("/...") - confirm.
    INDEX = 'http://www.capital.de'

    extra_css = (
        ' h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}'
        ' h4{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}'
        ' img {min-width:300px; max-width:600px; min-height:300px; max-height:800px}'
        ' p{font-family:Arial,Helvetica,sans-serif;font-size:small;}'
        ' body{font-family:Helvetica,Arial,sans-serif;font-size:small;}'
        ' '
    )

    # Was spelled "remove_tags_bevor", which calibre does not recognise, so
    # the rule was silently ignored.
    remove_tags_before = [dict(name='td', attrs={'class': 'textcell'})]
    remove_tags_after = [dict(name='div', attrs={'class': 'artikelsplit'})]

    feeds = [
        (u'Wirtschaftsmagazin', u'http://www.capital.de/rss/'),
        (u'Unternehmen', u'http://www.capital.de/rss/unternehmen'),
        (u'Finanz & Geldanlage', u'http://www.capital.de/rss/finanzen/geldanlage'),
    ]

    remove_tags = [
        dict(attrs={'class': ['navSeitenAlle', 'kommentieren', 'teaserheader',
                              'teasercontent', 'info', 'zwischenhead',
                              'artikelsplit']}),
        dict(id=['topNav', 'mainNav', 'subNav', 'socialmedia', 'footerRahmen',
                 'gatrixx_marktinformationen', 'pager', 'weitere']),
        # NOTE(review): this filters on an HTML attribute literally named
        # "span"; it may have been meant as name='span' with class values.
        # Left unchanged because the intent cannot be confirmed from here.
        dict(span=['ratingtext', 'Gesamtranking', 'h3', '']),
        dict(rel=['canonical']),
    ]

    def print_version(self, url):
        """Return *url* with the RSS tracking suffix replaced by ``mode=print``."""
        return url.replace(
            'nv=rss#utm_source=rss2&utm_medium=rss_feed&utm_campaign=/',
            'mode=print')

    def append_page(self, soup, appendtag, position):
        """Fetch the page linked from the pager and splice its text in.

        Looks for a ``div.artikelsplit`` pager in *soup*; if it has a link,
        the linked page is downloaded, its ``div.printable`` element is
        stripped of inline styles, any further pages are appended to it
        recursively, and the element is inserted into *appendtag* at
        *position*. Does nothing when the pager, its link, or the
        ``printable`` element is missing.
        """
        pager = soup.find('div', attrs={'class': 'artikelsplit'})
        if pager is None or pager.a is None:
            return
        href = pager.a.get('href')
        if not href:
            return

        next_soup = self.index_to_soup(self.INDEX + href)
        texttag = next_soup.find('div', attrs={'class': 'printable'})
        if texttag is None:
            # Page layout not as expected: stop instead of raising.
            return

        for styled in texttag.findAll(style=True):
            del styled['style']

        # Follow-up pages go at the end of this page's text container.
        self.append_page(next_soup, texttag, len(texttag.contents))
        texttag.extract()
        appendtag.insert(position, texttag)

    def preprocess_html(self, soup):
        """Strip inline styles and pager elements, then normalise images.

        Returns the result of ``self.adeify_images(soup)``.
        """
        for item in soup.findAll(style=True):
            del item['style']
        # NOTE(review): every pager div is removed here, before append_page()
        # runs, so append_page() finds no pager on the top-level page.
        # Confirm whether multi-page stitching is meant to be active.
        for item in soup.findAll('div', attrs={'class': 'artikelsplit'}):
            item.extract()
        self.append_page(soup, soup.body, 3)
        pager = soup.find('div', attrs={'class': 'artikelsplit'})
        if pager:
            pager.extract()
        return self.adeify_images(soup)