# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import unicode_literals

from calibre.web.feeds.recipes import BasicNewsRecipe


class ceskaPoziceRecipe(BasicNewsRecipe):
    """Recipe that downloads articles from ceskapozice.cz.

    Articles listed in more than one feed are fetched only once, and
    articles split over several pages are stitched into a single document.
    """

    __author__ = 'bubak'
    title = u'Česká pozice'
    description = 'Česká pozice'
    oldest_article = 2
    max_articles_per_feed = 20

    feeds = [
        (u'Všechny články', u'http://www.ceskapozice.cz/rss.xml'),
        (u'Domov', u'http://www.ceskapozice.cz/taxonomy/term/16/feed'),
        (u'Chrono', u'http://www.ceskapozice.cz/chrono/feed'),
        (u'Evropa', u'http://www.ceskapozice.cz/taxonomy/term/17/feed'),
    ]

    language = 'cs'
    cover_url = 'http://www.ceskapozice.cz/sites/default/files/cpozice_logo.png'
    remove_javascript = True
    no_stylesheets = True
    # Prefix prepended to the "next page" href found in the pager.
    domain = u'http://www.ceskapozice.cz'
    use_embedded_content = False

    # The two original <div class=...> rules listed the ad region twice;
    # they are merged here into one equivalent rule.
    remove_tags = [
        dict(name='div', attrs={
            'class': ['block-ad', 'region region-content-ad', 'next-page']}),
        dict(name='ul', attrs={'class': 'links'}),
        dict(name='div', attrs={'id': ['comments', 'back-to-top']}),
        dict(name='cite'),
    ]
    keep_only_tags = [dict(name='div', attrs={'id': 'content'})]

    # URLs already accepted, used to drop articles repeated across feeds.
    # NOTE: this is a class-level dict, so it is shared by every instance
    # of the recipe living in the same process.
    visited_urls = {}

    def get_article_url(self, article):
        """Return the article URL, or None if it should be skipped.

        An article is skipped when the base implementation yields no URL
        or when the same URL has already been accepted from another feed.
        """
        url = BasicNewsRecipe.get_article_url(self, article)
        if url is None:
            # Nothing to de-duplicate or log; the original code would have
            # raised TypeError on the string concatenation below.
            return None
        if url in self.visited_urls:
            self.log.debug('Ignoring duplicate: ' + url)
            return None
        self.visited_urls[url] = True
        self.log.debug('Accepting: ' + url)
        return url

    def preprocess_html(self, soup):
        """Merge the follow-up pages of a multi-page article into *soup*."""
        self.append_page(soup, soup.body, 3)
        return soup

    def _next_page_url(self, pager):
        """Return the URL behind the pager's "next" link, or None."""
        nextbutton = pager.find('li', attrs={'class': 'pager-next'})
        if nextbutton is None:
            return None
        link = nextbutton.a
        href = link.get('href') if link is not None else None
        if not href:
            return None
        return self.domain + href

    def append_page(self, soup, appendtag, position):
        """Fetch the next page of an article and insert its body.

        soup      -- parsed page whose pager is inspected (and removed)
        appendtag -- tag that receives the next page's main body
        position  -- index within *appendtag* at which the body is inserted

        Recurses so that every following page ends up nested at the end of
        the page preceding it.
        """
        pager = soup.find('div', attrs={'class': 'paging-bottom'})
        if pager is None:
            return

        nexturl = self._next_page_url(pager)
        if nexturl is not None:
            soup2 = self.index_to_soup(nexturl)
            texttag = soup2.find('div', attrs={'class': 'main-body'})
            if texttag is None:
                # Unexpected page layout: skip this page instead of crashing.
                self.log.debug('No main-body found in: ' + nexturl)
            else:
                # Strip ads and <cite> tags before merging the page in.
                for it in texttag.findAll(
                        'div', attrs={'class': 'region region-content-ad'}):
                    it.extract()
                for it in texttag.findAll('cite'):
                    it.extract()
                newpos = len(texttag.contents)
                self.append_page(soup2, texttag, newpos)
                texttag.extract()
                appendtag.insert(position, texttag)

        # The pager is navigation only; drop it from the final document.
        pager.extract()