# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai from __future__ import unicode_literals from calibre.web.feeds.recipes import BasicNewsRecipe import re class bleskRecipe(BasicNewsRecipe): __author__ = 'bubak' title = u'Blesk' publisher = u'' description = 'blesk.cz' oldest_article = 1 max_articles_per_feed = 20 use_embedded_content = False feeds = [ (u'Zprávy', u'http://www.blesk.cz/rss/7'), (u'Blesk', u'http://www.blesk.cz/rss/1'), (u'Sex a tabu', u'http://www.blesk.cz/rss/2'), (u'Celebrity', u'http://www.blesk.cz/rss/5'), (u'Cestování', u'http://www.blesk.cz/rss/12') ] language = 'cs' cover_url = 'http://img.blesk.cz/images/blesk/blesk-logo.png' remove_javascript = True no_stylesheets = True extra_css = """ """ remove_attributes = [] remove_tags_before = dict(name='div', attrs={'id': ['boxContent']}) remove_tags_after = dict(name='div', attrs={'class': ['artAuthors']}) remove_tags = [dict(name='div', attrs={'class': ['link_clanek']}), dict(name='div', attrs={'id': ['partHeader']}), dict(name='div', attrs={'id': ['top_bottom_box', 'lista_top']})] preprocess_regexps = [(re.compile(r'
')] keep_only_tags = [dict(name='div', attrs={'class': 'articleContent'})] visited_urls = {} def get_article_url(self, article): url = BasicNewsRecipe.get_article_url(self, article) if url in self.visited_urls: self.log.debug('Ignoring duplicate: ' + url) return None else: self.visited_urls[url] = True self.log.debug('Accepting: ' + url) return url