New recipe for The Force by Krittika Goyal

2025-07-09 03:04:10 -04:00 · 2010-01-10 21:44:48 -07:00 · 2010-01-10 21:44:48 -07:00 · b4cff43ee2
commit b4cff43ee2
parent e37f0747db
2 changed files with 58 additions and 1 deletions
--- a/resources/recipes/starwars.recipe
+++ b/resources/recipes/starwars.recipe
@ -0,0 +1,57 @@
 # -*- coding: utf-8 -*-
 from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
 class TheForce(BasicNewsRecipe):
    """Recipe for the TheForce.net news feed.

    The class-level attributes configure the BasicNewsRecipe machinery;
    preprocess_html() additionally trims trailing page boilerplate and
    replaces linked thumbnails with the image the link points to.
    """
    title          = u'The Force'
    language       = 'en'
    __author__     = 'Krittika Goyal'
    oldest_article = 1 #days
    max_articles_per_feed = 25
    encoding = 'cp1252'
    remove_stylesheets = True
    conversion_options = { 'linearize_tables' : True }
    # Article content selectors: keep the story table cell (identified by
    # its background image) and stop at the 'KonaBody' div.
    remove_tags_after= dict(name='div', attrs={'class':'KonaBody'})
    keep_only_tags = dict(name='td', attrs={'background':'/images/span/tile_story_bgtile.gif'})
    remove_tags = [
        dict(name='iframe'),
    ]
    feeds          = [
        ('The Force',
         'http://www.theforce.net/outnews/tfnrdf.xml'),
    ]
    def preprocess_html(self, soup):
        """Clean up a downloaded article page and return the same soup.

        Three passes are made over ``soup`` (a BeautifulSoup tree), all of
        which modify it in place:

        1. The first <i> element containing the Facebook promotion text is
           removed together with every element that follows it.
        2. The <table> enclosing the first element with class
           'articleoption' is removed together with every element that
           follows it. Nothing is removed when no such table exists.
        3. Every <img> wrapped in an <a href=...> whose href yields a
           non-empty value after the first '=' of its last '?'-separated
           part is dropped, and the <a> itself is turned into an <img>
           pointing at that value.
        """
        # Pass 1: cut the Facebook promo and everything after it.
        for tag in soup.findAll(name='i'):
            if 'Remember to join the Star Wars Insider Facebook' in self.tag_to_string(tag):
                for x in tag.findAllNext():
                    x.extract()
                tag.extract()
                break
        # Pass 2: cut the article-options table and everything after it.
        option = soup.find(attrs={'class':'articleoption'})
        if option is not None:
            table = option.findParent('table')
            # findParent() returns None when there is no enclosing table;
            # only touch the tree when one was actually found.
            if table is not None:
                for x in table.findAllNext():
                    x.extract()
                table.extract()
        # Pass 3: swap linked thumbnails for the image named in the link.
        for img in soup.findAll('img', src=True):
            a = img.findParent('a', href=True)
            if a is None:
                continue
            # Take whatever follows the first '=' in the last '?'-separated
            # part of the href; empty when the href has no '=' there.
            url = a.get('href').split('?')[-1].partition('=')[-1]
            if url:
                # Drop the thumbnail and reuse the anchor node as the image.
                img.extract()
                a.name = 'img'
                a['src'] = url
                del a['href']
        return soup
--- a/src/calibre/web/fetch/simple.py
+++ b/src/calibre/web/fetch/simple.py
@ -158,7 +158,7 @@ class RecursiveFetcher(object):
                pass
        def remove_beyond(tag, next):
-            while tag is not None and tag.name != 'body':
+            while tag is not None and getattr(tag, 'name', None) != 'body':
                after = getattr(tag, next)
                while after is not None:
                    ns = getattr(tag, next)