From cbee0d32bf9f6ace041a5b5c5cf3e4bcebe5c9a2 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 19 Aug 2009 10:24:43 -0600 Subject: [PATCH] Improved recipe for The Guardian --- .../web/feeds/recipes/recipe_guardian.py | 46 +++++++++++-------- 1 file changed, 27 insertions(+), 19 deletions(-) diff --git a/src/calibre/web/feeds/recipes/recipe_guardian.py b/src/calibre/web/feeds/recipes/recipe_guardian.py index f8543c7d59..58e1c3e706 100644 --- a/src/calibre/web/feeds/recipes/recipe_guardian.py +++ b/src/calibre/web/feeds/recipes/recipe_guardian.py @@ -8,17 +8,16 @@ www.guardian.co.uk ''' from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag class Guardian(BasicNewsRecipe): title = u'The Guardian' - __author__ = 'Seabound' + __author__ = 'Seabound and Sujata Raman' language = _('English') oldest_article = 7 max_articles_per_feed = 20 remove_javascript = True - + timefmt = ' [%a, %d %b %Y]' keep_only_tags = [ dict(name='div', attrs={'id':["content","article_header","main-article-info",]}), @@ -30,20 +29,20 @@ class Guardian(BasicNewsRecipe): dict(name='ul', attrs={'id':["content-actions"]}), ] use_embedded_content = False - + no_stylesheets = True extra_css = ''' .article-attributes{font-size: x-small; font-family:Arial,Helvetica,sans-serif;} .h1{font-size: large ;font-family:georgia,serif; font-weight:bold;} .stand-first-alone{color:#666666; font-size:small; font-family:Arial,Helvetica,sans-serif;} .caption{color:#666666; font-size:x-small; font-family:Arial,Helvetica,sans-serif;} - #article-wrapper{font-size:small; font-family:Arial,Helvetica,sans-serif;} + #article-wrapper{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;} .main-article-info{font-family:Arial,Helvetica,sans-serif;} - #full-contents{font-size:small; font-family:Arial,Helvetica,sans-serif;} - #match-stats-summary{font-size:small; font-family:Arial,Helvetica,sans-serif;} + #full-contents{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;} + #match-stats-summary{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;} ''' - - + + feeds = [ ('Front Page', 'http://www.guardian.co.uk/rss'), @@ -57,21 +56,30 @@ class Guardian(BasicNewsRecipe): ('Comment','http://www.guardian.co.uk/commentisfree/rss'), ] - + def get_article_url(self, article): + url = article.get('guid', None) + if '/video/' in url or '/flyer/' in url or '/quiz/' in url or \ + '/gallery/' in url or 'ivebeenthere' in url or \ + 'pickthescore' in url or 'audioslideshow' in url : + url = None + return url + + + def preprocess_html(self, soup): - - for item in soup.findAll(style=True): + + for item in soup.findAll(style=True): del item['style'] - - for item in soup.findAll(face=True): + + for item in soup.findAll(face=True): del item['face'] for tag in soup.findAll(name=['ul','li']): tag.name = 'div' - + return soup - - - - + + + +