mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
New recipe for The Force by Krittika Goyal
This commit is contained in:
parent
e37f0747db
commit
b4cff43ee2
57
resources/recipes/starwars.recipe
Normal file
57
resources/recipes/starwars.recipe
Normal file
@ -0,0 +1,57 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
||||
|
||||
class TheForce(BasicNewsRecipe):
    """Calibre news recipe for theforce.net.

    Articles come from the single RDF feed listed in ``feeds``. The class
    attributes below are read by ``BasicNewsRecipe``; ``preprocess_html``
    then trims trailing page content and rewrites linked images.
    """

    title = u'The Force'
    language = 'en'
    __author__ = 'Krittika Goyal'
    oldest_article = 1  # days
    max_articles_per_feed = 25
    encoding = 'cp1252'

    remove_stylesheets = True
    conversion_options = {'linearize_tables': True}

    # Tag specs: a <div class="KonaBody"> marks the cut-off point, and only
    # the <td> carrying the story background tile is kept.
    remove_tags_after = dict(name='div', attrs={'class': 'KonaBody'})
    keep_only_tags = dict(
        name='td',
        attrs={'background': '/images/span/tile_story_bgtile.gif'},
    )
    remove_tags = [
        dict(name='iframe'),
    ]

    feeds = [
        ('The Force', 'http://www.theforce.net/outnews/tfnrdf.xml'),
    ]

    @staticmethod
    def _truncate_at(tag):
        """Detach *tag* and every element found after it by ``findAllNext``."""
        for following in tag.findAllNext():
            following.extract()
        tag.extract()

    def preprocess_html(self, soup):
        """Clean up a downloaded article page and return the same soup.

        Three passes are made over *soup*:

        1. The first ``<i>`` whose text contains the Facebook reminder is
           removed together with everything after it.
        2. If an element with class ``articleoption`` sits inside a
           ``<table>``, that table and everything after it are removed.
        3. Every ``<img>`` wrapped in an ``<a href=...>`` whose href yields a
           non-empty value after the first ``=`` of its last ``?`` segment
           is dropped, and the anchor itself is turned into an ``<img>``
           pointing at that value.
        """
        for italic in soup.findAll(name='i'):
            if 'Remember to join the Star Wars Insider Facebook' in self.tag_to_string(italic):
                self._truncate_at(italic)
                break

        option = soup.find(attrs={'class': 'articleoption'})
        if option is not None:
            table = option.findParent('table')
            if table is not None:
                self._truncate_at(table)

        for img in soup.findAll('img', src=True):
            link = img.findParent('a', href=True)
            if link is None:
                continue
            # Take the text after the first '=' in the href's last
            # '?'-separated segment; empty when there is no such value.
            url = link.get('href').split('?')[-1].partition('=')[-1]
            if url:
                # Replace the inner image by converting the anchor itself
                # into an <img>. The inner node is detached, so nothing
                # further is written to it.
                img.extract()
                link.name = 'img'
                link['src'] = url
                del link['href']
        return soup
|
@ -158,7 +158,7 @@ class RecursiveFetcher(object):
|
||||
pass
|
||||
|
||||
def remove_beyond(tag, next):
|
||||
while tag is not None and tag.name != 'body':
|
||||
while tag is not None and getattr(tag, 'name', None) != 'body':
|
||||
after = getattr(tag, next)
|
||||
while after is not None:
|
||||
ns = getattr(tag, next)
|
||||
|
Loading…
x
Reference in New Issue
Block a user