New recipe for The Force by Krittika Goyal

2025-07-09 03:04:10 -04:00 · 2010-01-10 21:44:48 -07:00 · 2010-01-10 21:44:48 -07:00 · b4cff43ee2
commit b4cff43ee2
parent e37f0747db
2 changed files with 58 additions and 1 deletions
--- a/resources/recipes/starwars.recipe
+++ b/resources/recipes/starwars.recipe
@@ -0,0 +1,57 @@
+# -*- coding: utf-8 -*-
+from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
+
+class TheForce(BasicNewsRecipe):
+    """News recipe for The Force, fed from theforce.net's tfnrdf.xml."""
+
+    title          = u'The Force'
+    language       = 'en'
+    __author__     = 'Krittika Goyal'
+    oldest_article = 1 #days
+    max_articles_per_feed = 25
+    encoding = 'cp1252'
+
+    remove_stylesheets = True
+    conversion_options = { 'linearize_tables' : True }
+    remove_tags_after= dict(name='div', attrs={'class':'KonaBody'})
+    keep_only_tags = dict(name='td', attrs={'background':'/images/span/tile_story_bgtile.gif'})
+    remove_tags = [
+       dict(name='iframe'),
+    ]
+
+    feeds          = [
+        ('The Force', 'http://www.theforce.net/outnews/tfnrdf.xml'),
+    ]
+
+    def preprocess_html(self, soup):
+        """Strip trailing boilerplate and inline linked images; returns soup."""
+        # Cut the Facebook promo line and every tag that follows it.
+        for tag in soup.findAll(name='i'):
+            if 'Remember to join the Star Wars Insider Facebook' in self.tag_to_string(tag):
+                for x in tag.findAllNext():
+                    x.extract()
+                tag.extract()
+                break
+
+        # Cut the table holding the 'articleoption' element and all after it.
+        tag = soup.find(attrs={'class':'articleoption'})
+        if tag is not None:
+            tag = tag.findParent('table')
+            if tag is not None:
+                for x in tag.findAllNext():
+                    x.extract()
+                # Must stay inside the guard: findParent() may return None.
+                tag.extract()
+
+        # Turn <a href="...?key=URL"><img></a> into a single <img src="URL">.
+        for img in soup.findAll('img', src=True):
+            a = img.findParent('a', href=True)
+            if a is None: continue
+            url = a.get('href').split('?')[-1].partition('=')[-1]
+            if url:
+                img.extract()
+                a.name = 'img'
+                a['src'] = url
+                del a['href']
+        return soup
--- a/src/calibre/web/fetch/simple.py
+++ b/src/calibre/web/fetch/simple.py
@@ -158,7 +158,7 @@ class RecursiveFetcher(object):
                pass

        def remove_beyond(tag, next):
-            while tag is not None and tag.name != 'body':
+            while tag is not None and getattr(tag, 'name', None) != 'body':
                after = getattr(tag, next)
                while after is not None:
                    ns = getattr(tag, next)