# -*- coding: utf-8 -*-
from calibre.web.feeds.news import BasicNewsRecipe


class TheForce(BasicNewsRecipe):
    """Calibre news recipe for the 'The Force' RSS feed."""

    title = u'The Force'
    language = 'en'
    __author__ = 'Krittika Goyal'
    oldest_article = 1  # days
    max_articles_per_feed = 25
    encoding = 'cp1252'

    remove_stylesheets = True
    conversion_options = {'linearize_tables': True}
    remove_tags_after = dict(name='div', attrs={'class': 'KonaBody'})
    keep_only_tags = dict(
        name='td', attrs={'background': '/images/span/tile_story_bgtile.gif'})
    remove_tags = [
        dict(name='iframe'),
    ]

    feeds = [
        ('The Force', 'http://www.theforce.net/outnews/tfnrdf.xml'),
    ]

    # Text identifying the <i> element from which the page is truncated.
    _PROMO_MARKER = 'Remember to join the Star Wars Insider Facebook'

    def _remove_tag_and_following(self, tag):
        """Detach every node found by ``tag.findAllNext()``, then ``tag``."""
        for following in tag.findAllNext():
            following.extract()
        tag.extract()

    def preprocess_html(self, soup):
        """Trim trailing page content and rewrite linked images.

        Three passes are applied to ``soup``, which is modified in place:

        1. The first ``<i>`` whose text contains the promo marker is removed
           together with everything after it.
        2. If an element with class ``articleoption`` exists and has a
           ``<table>`` ancestor, that table and everything after it are
           removed.
        3. Each ``<img src=...>`` nested inside an ``<a href=...>`` is
           dropped and the anchor itself is turned into an ``<img>`` whose
           ``src`` is taken from the anchor's href (see the inline comment).

        Returns the same ``soup`` object.
        """
        for italic in soup.findAll(name='i'):
            if self._PROMO_MARKER in self.tag_to_string(italic):
                self._remove_tag_and_following(italic)
                break  # only the first matching <i> is processed

        option = soup.find(attrs={'class': 'articleoption'})
        if option is not None:
            table = option.findParent('table')
            if table is not None:
                self._remove_tag_and_following(table)

        for img in soup.findAll('img', src=True):
            link = img.findParent('a', href=True)
            if link is None:
                continue
            # Take whatever follows the last '?' in the href, then whatever
            # follows the first '=' in that; empty when there is no '='.
            url = link.get('href').split('?')[-1].partition('=')[-1]
            if not url:
                continue
            # Replace the anchor-wrapped image with a single <img> that
            # points at the extracted URL. The original <img> is detached,
            # so it needs no further updates.
            img.extract()
            link.name = 'img'
            link['src'] = url
            del link['href']
        return soup