From b4cff43ee25fa2180d43d9ed6f2665d31613fee1 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sun, 10 Jan 2010 21:44:48 -0700
Subject: [PATCH] New recipe for The Force by Krittika Goyal

---
 resources/recipes/starwars.recipe | 57 +++++++++++++++++++++++++++++++
 src/calibre/web/fetch/simple.py   |  2 +-
 2 files changed, 58 insertions(+), 1 deletion(-)
 create mode 100644 resources/recipes/starwars.recipe

diff --git a/resources/recipes/starwars.recipe b/resources/recipes/starwars.recipe
new file mode 100644
index 0000000000..bb04e1ff6b
--- /dev/null
+++ b/resources/recipes/starwars.recipe
@@ -0,0 +1,57 @@
+# -*- coding: utf-8 -*-
+from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
+
+class TheForce(BasicNewsRecipe):
+    '''Recipe that fetches news from the theforce.net RDF feed.'''
+
+    title          = u'The Force'
+    language       = 'en'
+    __author__     = 'Krittika Goyal'
+    oldest_article = 1 #days
+    max_articles_per_feed = 25
+    encoding = 'cp1252'
+
+    remove_stylesheets = True
+    conversion_options = { 'linearize_tables' : True }
+    remove_tags_after= dict(name='div', attrs={'class':'KonaBody'})
+    keep_only_tags = dict(name='td', attrs={'background':'/images/span/tile_story_bgtile.gif'})
+    remove_tags = [dict(name='iframe')]
+
+    feeds = [('The Force', 'http://www.theforce.net/outnews/tfnrdf.xml')]
+
+    def preprocess_html(self, soup):
+        '''Strip trailing page furniture and inline linked images.
+
+        Modifies ``soup`` in place and returns the same object.
+        '''
+        # Drop the Facebook plug and everything that follows it in the page.
+        for tag in soup.findAll(name='i'):
+            if 'Remember to join the Star Wars Insider Facebook' in self.tag_to_string(tag):
+                for x in tag.findAllNext():
+                    x.extract()
+                tag.extract()
+                break
+        # Drop the table around the 'articleoption' element and what follows.
+        tag = soup.find(attrs={'class':'articleoption'})
+        if tag is not None:
+            tag = tag.findParent('table')
+            # findParent gives None without an enclosing table; guard extract too.
+            if tag is not None:
+                for x in tag.findAllNext():
+                    x.extract()
+                tag.extract()
+
+        # Turn each link that wraps an image into an img tag whose src is the
+        # text after '=' in the link's query (presumably the full-size image).
+        for img in soup.findAll('img', src=True):
+            a = img.findParent('a', href=True)
+            if a is None:
+                continue
+            url = a.get('href').split('?')[-1].partition('=')[-1]
+            if url:
+                img.extract()
+                a.name = 'img'
+                a['src'] = url
+                del a['href']
+        return soup
diff --git a/src/calibre/web/fetch/simple.py b/src/calibre/web/fetch/simple.py
index 7101facbf9..620850a762 100644
--- a/src/calibre/web/fetch/simple.py
+++ b/src/calibre/web/fetch/simple.py
@@ -158,7 +158,7 @@ class RecursiveFetcher(object):
                 pass
 
         def remove_beyond(tag, next):
-            while tag is not None and tag.name != 'body':
+            while tag is not None and getattr(tag, 'name', None) != 'body':
                 after = getattr(tag, next)
                 while after is not None:
                     ns = getattr(tag, next)