import re

from calibre.web.feeds.news import BasicNewsRecipe


class AdvancedUserRecipe1359406781(BasicNewsRecipe):
    """calibre news recipe for the magazine Private Eye.

    Articles are taken from the RSS feed listed in ``feeds``; the cover
    image is looked up on the site's current-issue page.
    """

    title = u'Private Eye'
    publication_type = 'magazine'
    description = (u'Private Eye is a fortnightly British satirical and '
                   u'current affairs magazine, edited by Ian Hislop')
    oldest_article = 13
    max_articles_per_feed = 100
    remove_empty_feeds = True
    remove_javascript = True
    no_stylesheets = True
    ignore_duplicate_articles = {'title'}
    language = 'en_GB'
    encoding = 'cp1252'
    __author__ = u'MartynPritchard@yahoo.com'
    __copyright__ = '2014, Martyn Pritchard '

    def get_cover_url(self):
        """Return the URL of the current issue's cover image, or None.

        Fetches the current-issue page and returns the first ``<img>``
        whose ``src`` ends with ``big.jpg``, prefixed with the site root.
        Returns None when no such image is present.
        """
        soup = self.index_to_soup(
            'http://www.private-eye.co.uk/current_issue.php')
        for img in soup.findAll('img'):
            # Not every <img> carries a src attribute; indexing with
            # img['src'] would raise KeyError on those.
            src = img.get('src') or ''
            if src.endswith('big.jpg'):
                return 'http://www.private-eye.co.uk/' + src
        return None

    remove_tags_before = {'class': "article"}

    # This attribute used to be assigned three times in a row, so only the
    # final value was ever in effect. That effective value is kept here; the
    # overridden selectors were {'id': "nav-box-sections-mobile"} and
    # {'class': "gap-biggest"}.
    remove_tags_after = {'id': "subscribe-here"}

    # Previously three separate assignments, which meant only the last entry
    # (sub-nav-bar) was actually applied. Listing them together strips all
    # three elements.
    remove_tags = [
        dict(name='td', attrs={'class': 'sub_dave'}),
        dict(name='div', attrs={'class': 'footer-block'}),
        dict(name='div', attrs={'class': 'sub-nav-bar'}),
    ]

    preprocess_regexps = [
        # Turn relative "../grfx" references into absolute URLs. The dots are
        # escaped so that only a literal "../" matches; unescaped, the
        # pattern also matched e.g. "uk/grfx" inside URLs that were already
        # absolute and mangled them.
        (re.compile(r'\.\./grfx', re.DOTALL | re.IGNORECASE),
         lambda match: 'http://www.private-eye.co.uk/grfx'),
        # With DOTALL, ".*" runs to the end of the document, so each of the
        # following drops the named heading and everything after it.
        (re.compile(r'More From This Issue.*', re.DOTALL | re.IGNORECASE),
         lambda match: ''),
        (re.compile(r'More top stories in the latest issue:.*',
                    re.DOTALL | re.IGNORECASE),
         lambda match: ''),
        (re.compile(r'Also Available Online.*', re.DOTALL | re.IGNORECASE),
         lambda match: ''),
    ]

    feeds = [
        (u'Private Eye',
         u'https://dl.dropboxusercontent.com/u/10483931/PrivateEyeStat.xml'),
    ]