New recipe for The Force by Krittika Goyal

This commit is contained in:
Kovid Goyal 2010-01-10 21:44:48 -07:00
parent e37f0747db
commit b4cff43ee2
2 changed files with 58 additions and 1 deletions

View File

@ -0,0 +1,57 @@
# -*- coding: utf-8 -*-
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
class TheForce(BasicNewsRecipe):
    """Calibre news recipe for the 'The Force' feed on theforce.net.

    Articles are taken from a single RDF feed. Each downloaded page is
    trimmed to the story cell, and ``preprocess_html`` strips trailing
    boilerplate and rewrites linked thumbnails as plain images.
    """

    title = u'The Force'
    language = 'en'
    __author__ = 'Krittika Goyal'
    oldest_article = 1  # days
    max_articles_per_feed = 25
    encoding = 'cp1252'
    remove_stylesheets = True
    conversion_options = {'linearize_tables': True}

    # Tag specs are given as lists of dicts, the form the recipe API
    # documents for these attributes.
    remove_tags_after = [dict(name='div', attrs={'class': 'KonaBody'})]
    keep_only_tags = [
        dict(name='td',
             attrs={'background': '/images/span/tile_story_bgtile.gif'}),
    ]
    remove_tags = [
        dict(name='iframe'),
    ]

    feeds = [
        ('The Force', 'http://www.theforce.net/outnews/tfnrdf.xml'),
    ]

    @staticmethod
    def _cut_from(tag):
        """Detach ``tag`` and every element after it in document order."""
        # findAllNext() returns a materialised result list, so extracting
        # nodes while looping over it is safe.
        for following in tag.findAllNext():
            following.extract()
        tag.extract()

    def preprocess_html(self, soup):
        """Clean up one article page and return the modified soup.

        Three passes are made over ``soup``:

        1. The first ``<i>`` element containing the "Star Wars Insider
           Facebook" reminder is removed together with everything after it.
        2. If an element with class ``articleoption`` sits inside a
           ``<table>``, that table and everything after it are removed.
        3. Every ``<img>`` wrapped in an ``<a href=...>`` whose link yields
           a non-empty URL is replaced by an ``<img>`` pointing at that URL.

        :param soup: parsed article page (BeautifulSoup tree).
        :return: the same ``soup`` object, modified in place.
        """
        marker = 'Remember to join the Star Wars Insider Facebook'
        for italic in soup.findAll(name='i'):
            if marker in self.tag_to_string(italic):
                self._cut_from(italic)
                break

        option = soup.find(attrs={'class': 'articleoption'})
        if option is not None:
            table = option.findParent('table')
            if table is not None:
                self._cut_from(table)

        for img in soup.findAll('img', src=True):
            link = img.findParent('a', href=True)
            if link is None:
                continue
            # The target URL is whatever follows the first '=' in the last
            # '?'-separated piece of the href; empty when there is no '='.
            # NOTE(review): presumably the link points at a viewer page that
            # takes the full-size image URL as a query value -- confirm.
            url = link['href'].split('?')[-1].partition('=')[-1]
            if not url:
                continue
            # Drop the thumbnail and turn the enclosing anchor itself into
            # the image element, so it keeps the anchor's place in the tree.
            img.extract()
            link.name = 'img'
            link['src'] = url
            del link['href']
        return soup

View File

@ -158,7 +158,7 @@ class RecursiveFetcher(object):
pass pass
def remove_beyond(tag, next): def remove_beyond(tag, next):
while tag is not None and tag.name != 'body': while tag is not None and getattr(tag, 'name', None) != 'body':
after = getattr(tag, next) after = getattr(tag, next)
while after is not None: while after is not None:
ns = getattr(tag, next) ns = getattr(tag, next)