# -*- coding: utf-8 -*-
"""Calibre news recipe for The Force (theforce.net)."""
from calibre.web.feeds.news import BasicNewsRecipe


def _remove_tag_and_following(tag):
    """Detach every element that follows *tag* in the document, then *tag*."""
    for following in tag.findAllNext():
        following.extract()
    tag.extract()


class TheForce(BasicNewsRecipe):
    """Recipe that downloads articles from the theforce.net news feed."""

    title = u'The Force'
    language = 'en'
    __author__ = 'Krittika Goyal'
    oldest_article = 1  # days
    max_articles_per_feed = 25
    encoding = 'cp1252'

    remove_stylesheets = True
    conversion_options = {'linearize_tables': True}

    # Tag selectors are given as lists of specs, the same shape as
    # ``remove_tags`` below.
    remove_tags_after = [
        dict(name='div', attrs={'class': 'KonaBody'}),
    ]
    keep_only_tags = [
        dict(
            name='td',
            attrs={'background': '/images/span/tile_story_bgtile.gif'},
        ),
    ]
    remove_tags = [
        dict(name='iframe'),
    ]

    feeds = [
        ('The Force', 'http://www.theforce.net/outnews/tfnrdf.xml'),
    ]

    def preprocess_html(self, soup):
        """Strip trailing page furniture and rewrite linked images.

        Three passes are made over *soup*, in this order:

        1. The first ``<i>`` element whose text contains the Star Wars
           Insider Facebook reminder is removed together with everything
           that follows it.
        2. If an element with class ``articleoption`` exists and has an
           enclosing ``<table>``, that table is removed together with
           everything that follows it.
        3. Every ``<img>`` with a ``src`` that sits inside an ``<a href>``
           whose href yields a non-empty value (see below) is dropped, and
           the anchor itself is turned into an ``<img>`` pointing at that
           value.

        :param soup: parsed article page.
        :return: the same *soup* object, modified in place.
        """
        for italic in soup.findAll(name='i'):
            text = self.tag_to_string(italic)
            if 'Remember to join the Star Wars Insider Facebook' in text:
                _remove_tag_and_following(italic)
                break  # only the first matching <i> is used as a cut point

        option = soup.find(attrs={'class': 'articleoption'})
        if option is not None:
            table = option.findParent('table')
            if table is not None:
                _remove_tag_and_following(table)

        for img in soup.findAll('img', src=True):
            link = img.findParent('a', href=True)
            if link is None:
                continue
            # Take what follows the last '?' in the href, then what follows
            # the first '=' in that piece. Presumably the href carries the
            # target image URL as a query-parameter value -- TODO confirm
            # against a live page.
            url = link.get('href').split('?')[-1].partition('=')[-1]
            if not url:
                continue
            # Drop the original image and reuse the anchor element as the
            # new <img>; any other children of the anchor stay inside it.
            img.extract()
            link.name = 'img'
            link['src'] = url
            del link['href']
        return soup