Improved recipe for The Guardian

This commit is contained in:
Kovid Goyal 2009-08-19 10:24:43 -06:00
parent cf3582d450
commit cbee0d32bf

View File

@ -8,17 +8,16 @@ www.guardian.co.uk
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class Guardian(BasicNewsRecipe):
title = u'The Guardian'
__author__ = 'Seabound'
__author__ = 'Seabound and Sujata Raman'
language = _('English')
oldest_article = 7
max_articles_per_feed = 20
remove_javascript = True
timefmt = ' [%a, %d %b %Y]'
keep_only_tags = [
dict(name='div', attrs={'id':["content","article_header","main-article-info",]}),
@ -30,20 +29,20 @@ class Guardian(BasicNewsRecipe):
dict(name='ul', attrs={'id':["content-actions"]}),
]
use_embedded_content = False
no_stylesheets = True
extra_css = '''
.article-attributes{font-size: x-small; font-family:Arial,Helvetica,sans-serif;}
.h1{font-size: large ;font-family:georgia,serif; font-weight:bold;}
.stand-first-alone{color:#666666; font-size:small; font-family:Arial,Helvetica,sans-serif;}
.caption{color:#666666; font-size:x-small; font-family:Arial,Helvetica,sans-serif;}
#article-wrapper{font-size:small; font-family:Arial,Helvetica,sans-serif;}
#article-wrapper{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;}
.main-article-info{font-family:Arial,Helvetica,sans-serif;}
#full-contents{font-size:small; font-family:Arial,Helvetica,sans-serif;}
#match-stats-summary{font-size:small; font-family:Arial,Helvetica,sans-serif;}
#full-contents{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;}
#match-stats-summary{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;}
'''
feeds = [
('Front Page', 'http://www.guardian.co.uk/rss'),
@ -57,21 +56,30 @@ class Guardian(BasicNewsRecipe):
('Comment','http://www.guardian.co.uk/commentisfree/rss'),
]
def get_article_url(self, article):
    """Return the URL to download for *article*, or None to reject it.

    The URL is read from the feed entry's ``guid`` field. None is returned
    when the entry has no ``guid`` or when the URL contains one of the
    markers listed below (video, flyer, quiz, gallery, "ivebeenthere",
    "pickthescore" and audio-slideshow pages).

    NOTE(review): returning None is presumably how the recipe tells the
    framework to skip an entry -- confirm against BasicNewsRecipe.
    """
    url = article.get('guid', None)
    if url is None:
        # Without this guard the substring tests below raise TypeError
        # for feed entries that carry no guid.
        return None
    unwanted_markers = (
        '/video/', '/flyer/', '/quiz/', '/gallery/',
        'ivebeenthere', 'pickthescore', 'audioslideshow',
    )
    if any(marker in url for marker in unwanted_markers):
        return None
    return url
def preprocess_html(self, soup):
    """Strip inline presentation attributes and flatten lists in *soup*.

    Every tag that has a ``style`` attribute loses it, every tag that has
    a ``face`` attribute loses it, and every ``ul``/``li`` tag is renamed
    to ``div``. The soup is modified in place and also returned.
    """
    # Drop inline styling from any tag that carries it.
    for item in soup.findAll(style=True):
        del item['style']
    # Drop ``face`` attributes from any tag that carries one.
    for item in soup.findAll(face=True):
        del item['face']
    # Turn list containers and list items into plain divs.
    for tag in soup.findAll(name=['ul', 'li']):
        tag.name = 'div'
    return soup