From 6bd12a445002c5ea6c65ccccd236e50698ae2b9c Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 7 Apr 2013 16:39:01 +0530 Subject: [PATCH] Update The Onion --- recipes/theonion.recipe | 46 +++++++++-------------------------------- 1 file changed, 10 insertions(+), 36 deletions(-) diff --git a/recipes/theonion.recipe b/recipes/theonion.recipe index b0eacbb5e0..d177e0978d 100644 --- a/recipes/theonion.recipe +++ b/recipes/theonion.recipe @@ -36,47 +36,21 @@ class TheOnion(BasicNewsRecipe): , 'publisher': publisher , 'language' : language } - - keep_only_tags = [ - dict(name='h2', attrs={'class':['section_title','title']}) - ,dict(attrs={'class':['main_image','meta','article_photo_lead','article_body']}) - ,dict(attrs={'id':['entries']}) - ] - remove_attributes=['lang','rel'] - remove_tags_after = dict(attrs={'class':['article_body','feature_content']}) + keep_only_tags = [dict(name='article', attrs={'class':'full-article'})] remove_tags = [ - dict(name=['object','link','iframe','base','meta']) - ,dict(name='div', attrs={'class':['toolbar_side','graphical_feature','toolbar_bottom']}) - ,dict(name='div', attrs={'id':['recent_slider','sidebar','pagination','related_media']}) - ] - + dict(name=['nav', 'aside', 'section', 'meta']), + {'attrs':{'class':lambda x: x and ('share-tools' in x or 'ad-zone' in x)}}, + ] feeds = [ (u'Daily' , u'http://feeds.theonion.com/theonion/daily' ) ,(u'Sports' , u'http://feeds.theonion.com/theonion/sports' ) ] - def get_article_url(self, article): - artl = BasicNewsRecipe.get_article_url(self, article) - if artl.startswith('http://www.theonion.com/audio/'): - artl = None - return artl - - def preprocess_html(self, soup): - for item in soup.findAll(style=True): - del item['style'] - for item in soup.findAll('a'): - limg = item.find('img') - if item.string is not None: - str = item.string - item.replaceWith(str) - else: - if limg: - item.name = 'div' - item.attrs = [] - if not limg.has_key('alt'): - limg['alt'] = 'image' - else: - str = self.tag_to_string(item) - item.replaceWith(str) + def preprocess_html(self, soup, *args): + for img in soup.findAll('img', attrs={'data-src':True}): + if img['data-src']: + img['src'] = img['data-src'] return soup + +