From b4cff43ee25fa2180d43d9ed6f2665d31613fee1 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sun, 10 Jan 2010 21:44:48 -0700
Subject: [PATCH] New recipe for The Force by Krittika Goyal

---
 resources/recipes/starwars.recipe | 81 +++++++++++++++++++++++++++++++
 src/calibre/web/fetch/simple.py   |  2 +-
 2 files changed, 82 insertions(+), 1 deletion(-)
 create mode 100644 resources/recipes/starwars.recipe

diff --git a/resources/recipes/starwars.recipe b/resources/recipes/starwars.recipe
new file mode 100644
index 0000000000..bb04e1ff6b
--- /dev/null
+++ b/resources/recipes/starwars.recipe
@@ -0,0 +1,81 @@
+# -*- coding: utf-8 -*-
+from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
+
+
+class TheForce(BasicNewsRecipe):
+    """Recipe that downloads the news feed of theforce.net."""
+
+    title = u'The Force'
+    language = 'en'
+    __author__ = 'Krittika Goyal'
+    oldest_article = 1  # days
+    max_articles_per_feed = 25
+    encoding = 'cp1252'
+
+    remove_stylesheets = True
+    #remove_javascripts = True
+    conversion_options = {'linearize_tables': True}
+    remove_tags_after = dict(name='div', attrs={'class': 'KonaBody'})
+    # keep_only_tags is a list of tag specifications.
+    keep_only_tags = [
+        dict(name='td',
+             attrs={'background': '/images/span/tile_story_bgtile.gif'}),
+    ]
+    #keep_only_tags = dict(name='div', attrs={'class':'KonaBody'})
+    remove_tags = [
+        dict(name='iframe'),
+        #dict(name='div', attrs={'class':['pt-box-title', 'pt-box-content', 'blog-entry-footer', 'item-list', 'article-sub-meta']}),
+        #dict(name='div', attrs={'id':['block-td_search_160', 'block-cam_search_160']}),
+        #dict(name='table', attrs={'cellspacing':'0'}),
+        #dict(name='ul', attrs={'class':'articleTools'}),
+    ]
+
+    feeds = [
+        ('The Force', 'http://www.theforce.net/outnews/tfnrdf.xml'),
+    ]
+
+    @staticmethod
+    def _drop_from(tag):
+        """Detach ``tag`` and every tag that follows it in the document."""
+        for later in tag.findAllNext():
+            later.extract()
+        tag.extract()
+
+    def preprocess_html(self, soup):
+        """Cut trailing page furniture and inline linked images.
+
+        The tree is edited in place: everything from the Facebook promo
+        blurb onwards and everything from the table enclosing the
+        ``articleoption`` element onwards is removed, and an ``<img>``
+        wrapped in a link is replaced by an image whose ``src`` is taken
+        from the link's query string.
+
+        :param soup: parsed page, searched with ``find``/``findAll``
+        :return: the same ``soup`` object
+        """
+        marker = 'Remember to join the Star Wars Insider Facebook'
+        for tag in soup.findAll(name='i'):
+            if marker in self.tag_to_string(tag):
+                self._drop_from(tag)
+                break
+
+        tag = soup.find(attrs={'class': 'articleoption'})
+        if tag is not None:
+            tag = tag.findParent('table')
+            if tag is not None:
+                self._drop_from(tag)
+
+        for img in soup.findAll('img', src=True):
+            a = img.findParent('a', href=True)
+            if a is None:
+                continue
+            # Value after the first '=' in the text following the last '?'.
+            url = a.get('href').split('?')[-1].partition('=')[-1]
+            if url:
+                # Drop the inner image and turn the link itself into one.
+                img.extract()
+                a.name = 'img'
+                a['src'] = url
+                del a['href']
+        return soup
diff --git a/src/calibre/web/fetch/simple.py b/src/calibre/web/fetch/simple.py
index 7101facbf9..620850a762 100644
--- a/src/calibre/web/fetch/simple.py
+++ b/src/calibre/web/fetch/simple.py
@@ -158,7 +158,7 @@ class RecursiveFetcher(object):
                 pass
 
         def remove_beyond(tag, next):
-            while tag is not None and tag.name != 'body':
+            while tag is not None and getattr(tag, 'name', None) != 'body':
                 after = getattr(tag, next)
                 while after is not None:
                     ns = getattr(tag, next)