mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
New recipe for The Force by Krittika Goyal
This commit is contained in:
parent
e37f0747db
commit
b4cff43ee2
57
resources/recipes/starwars.recipe
Normal file
57
resources/recipes/starwars.recipe
Normal file
@ -0,0 +1,57 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
||||||
|
|
||||||
|
class TheForce(BasicNewsRecipe):
    """News recipe for The Force (theforce.net), built on BasicNewsRecipe.

    Articles are taken from the single RDF feed listed in ``feeds``. The
    class attributes configure the download and clean-up, and
    ``preprocess_html`` trims each fetched page further.
    """

    title = u'The Force'
    language = 'en'
    __author__ = 'Krittika Goyal'
    oldest_article = 1  # days
    max_articles_per_feed = 25
    encoding = 'cp1252'

    remove_stylesheets = True
    conversion_options = {'linearize_tables': True}

    # Tag specifications are written as lists of dicts, which is the form
    # the BasicNewsRecipe documentation uses for these attributes.
    remove_tags_after = [dict(name='div', attrs={'class': 'KonaBody'})]
    keep_only_tags = [
        dict(name='td',
             attrs={'background': '/images/span/tile_story_bgtile.gif'}),
    ]
    remove_tags = [
        dict(name='iframe'),
    ]

    feeds = [
        ('The Force', 'http://www.theforce.net/outnews/tfnrdf.xml'),
    ]

    def _extract_tag_and_following(self, tag):
        """Detach every element found by ``tag.findAllNext()``, then *tag*."""
        for following in tag.findAllNext():
            following.extract()
        tag.extract()

    def preprocess_html(self, soup):
        """Trim trailing page furniture and rewrite linked images.

        Three passes are made over *soup*, which is modified in place:

        1. The first ``<i>`` whose text contains the Star Wars Insider
           Facebook reminder is removed together with everything after it.
        2. The ``<table>`` enclosing the first element with class
           ``articleoption`` is removed together with everything after it.
        3. Each ``<img>`` wrapped in an ``<a href=...>`` is dropped and the
           link itself is turned into an ``<img>`` whose ``src`` is taken
           from the link's query string.

        :param soup: parsed page (BeautifulSoup tree) for one article.
        :return: the same *soup* object after modification.
        """
        # Pass 1: cut the page at the promotional footer, if present.
        for italic in soup.findAll(name='i'):
            if 'Remember to join the Star Wars Insider Facebook' in self.tag_to_string(italic):
                self._extract_tag_and_following(italic)
                break

        # Pass 2: cut the page at the table holding the article options.
        option = soup.find(attrs={'class': 'articleoption'})
        if option is not None:
            table = option.findParent('table')
            if table is not None:
                self._extract_tag_and_following(table)

        # Pass 3: replace image links with the image they point at.
        for img in soup.findAll('img', src=True):
            link = img.findParent('a', href=True)
            if link is None:
                continue
            # Text after the last '?' in the href, then after the first '='
            # in that. Presumably the link's query parameter carries the
            # target image URL -- verify against the site markup.
            url = link.get('href').split('?')[-1].partition('=')[-1]
            if not url:
                continue
            # The inner <img> is discarded; the surrounding link becomes the
            # image element. (No attribute is set on the discarded tag, as it
            # is no longer part of the tree.)
            img.extract()
            link.name = 'img'
            link['src'] = url
            del link['href']
        return soup
|
@ -158,7 +158,7 @@ class RecursiveFetcher(object):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
def remove_beyond(tag, next):
|
def remove_beyond(tag, next):
|
||||||
while tag is not None and tag.name != 'body':
|
while tag is not None and getattr(tag, 'name', None) != 'body':
|
||||||
after = getattr(tag, next)
|
after = getattr(tag, next)
|
||||||
while after is not None:
|
while after is not None:
|
||||||
ns = getattr(tag, next)
|
ns = getattr(tag, next)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user