From 7316bcad840152475908cc584718436f27f2e730 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 25 Nov 2011 19:21:33 +0530 Subject: [PATCH] ... --- recipes/guardian.recipe | 40 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 36 insertions(+), 4 deletions(-) diff --git a/recipes/guardian.recipe b/recipes/guardian.recipe index f063934b3d..840e8302af 100644 --- a/recipes/guardian.recipe +++ b/recipes/guardian.recipe @@ -9,6 +9,7 @@ www.guardian.co.uk from calibre import strftime from calibre.web.feeds.news import BasicNewsRecipe from datetime import date +from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag class Guardian(BasicNewsRecipe): @@ -16,16 +17,19 @@ class Guardian(BasicNewsRecipe): if date.today().weekday() == 6: base_url = "http://www.guardian.co.uk/theobserver" cover_pic = 'Observer digital edition' + masthead_url = 'http://static.guim.co.uk/sys-images/Guardian/Pix/site_furniture/2010/10/19/1287478087992/The-Observer-001.gif' else: base_url = "http://www.guardian.co.uk/theguardian" cover_pic = 'Guardian digital edition' + masthead_url = 'http://static.guim.co.uk/static/f76b43f9dcfd761f0ecf7099a127b603b2922118/common/images/logos/the-guardian/titlepiece.gif' __author__ = 'Seabound and Sujata Raman' language = 'en_GB' - oldest_article = 7 - max_articles_per_feed = 100 - remove_javascript = True + oldest_article = 7 + max_articles_per_feed = 100 + remove_javascript = True + encoding = 'utf-8' # List of section titles to ignore # For example: ['Sport'] @@ -41,6 +45,16 @@ class Guardian(BasicNewsRecipe): dict(name='div', attrs={'class':["guardian-tickets promo-component",]}), dict(name='ul', attrs={'class':["pagination"]}), dict(name='ul', attrs={'id':["content-actions"]}), + # article history link + dict(name='a', attrs={'class':["rollover history-link"]}), + # "a version of this article ..." 
spiel + dict(name='div' , attrs = { 'class' : ['section']}), + # "about this article" js dialog + dict(name='div', attrs={'class':["share-top",]}), + # author picture + dict(name='img', attrs={'class':["contributor-pic-small"]}), + # embedded videos/captions + dict(name='span',attrs={'class' : ['inline embed embed-media']}), #dict(name='img'), ] use_embedded_content = False @@ -67,6 +81,13 @@ class Guardian(BasicNewsRecipe): def preprocess_html(self, soup): + # multiple html sections in soup, useful stuff in the first + html = soup.find('html') + soup2 = BeautifulSoup() + soup2.insert(0,html) + + soup = soup2 + for item in soup.findAll(style=True): del item['style'] @@ -74,7 +95,18 @@ class Guardian(BasicNewsRecipe): del item['face'] for tag in soup.findAll(name=['ul','li']): tag.name = 'div' - + + # removes number next to rating stars + items_to_remove = [] + rating_container = soup.find('div', attrs = {'class': ['rating-container']}) + if rating_container: + for item in rating_container: + if isinstance(item, Tag) and str(item.name) == 'span': + items_to_remove.append(item) + + for item in items_to_remove: + item.extract() + return soup def find_sections(self):