From be76d5b41e2fb298be7e2f63cf11274d2345e66d Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 3 Feb 2011 07:57:53 -0700 Subject: [PATCH] Fix #8735 (Updated recipe for The Onion) --- resources/recipes/theonion.recipe | 78 ++++++++++++++++++++++--------- 1 file changed, 57 insertions(+), 21 deletions(-) diff --git a/resources/recipes/theonion.recipe b/resources/recipes/theonion.recipe index 3be4ae4e04..b0eacbb5e0 100644 --- a/resources/recipes/theonion.recipe +++ b/resources/recipes/theonion.recipe @@ -1,7 +1,5 @@ -#!/usr/bin/env python - __license__ = 'GPL v3' -__copyright__ = '2009, Darko Miletic ' +__copyright__ = '2009-2011, Darko Miletic ' ''' theonion.com @@ -12,35 +10,73 @@ from calibre.web.feeds.news import BasicNewsRecipe class TheOnion(BasicNewsRecipe): title = 'The Onion' __author__ = 'Darko Miletic' - description = "America's finest news source" - oldest_article = 2 + description = "America's finest news source" + oldest_article = 2 max_articles_per_feed = 100 - publisher = u'Onion, Inc.' - category = u'humor, news, USA' - language = 'en' - + publisher = 'Onion, Inc.' + category = 'humor, news, USA' + language = 'en' no_stylesheets = True use_embedded_content = False encoding = 'utf-8' - remove_javascript = True - html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' - - html2lrf_options = [ - '--comment' , description - , '--category' , category - , '--publisher' , publisher - ] + publication_type = 'newsportal' + masthead_url = 'http://o.onionstatic.com/img/headers/onion_190.png' + extra_css = """ + body{font-family: Helvetica,Arial,sans-serif} + .section_title{color: gray; text-transform: uppercase} + .title{font-family: Georgia,serif} + .meta{color: gray; display: inline} + .has_caption{display: block} + .caption{font-size: x-small; color: gray; margin-bottom: 0.8em} + """ - keep_only_tags = [dict(name='div', attrs={'id':'main'})] - + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher': publisher + , 'language' : language + } + + keep_only_tags = [ + dict(name='h2', attrs={'class':['section_title','title']}) + ,dict(attrs={'class':['main_image','meta','article_photo_lead','article_body']}) + ,dict(attrs={'id':['entries']}) + ] + remove_attributes=['lang','rel'] + remove_tags_after = dict(attrs={'class':['article_body','feature_content']}) remove_tags = [ - dict(name=['object','link','iframe','base']) + dict(name=['object','link','iframe','base','meta']) ,dict(name='div', attrs={'class':['toolbar_side','graphical_feature','toolbar_bottom']}) ,dict(name='div', attrs={'id':['recent_slider','sidebar','pagination','related_media']}) ] - + feeds = [ (u'Daily' , u'http://feeds.theonion.com/theonion/daily' ) ,(u'Sports' , u'http://feeds.theonion.com/theonion/sports' ) ] + + def get_article_url(self, article): + artl = BasicNewsRecipe.get_article_url(self, article) + if artl.startswith('http://www.theonion.com/audio/'): + artl = None + return artl + + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + for item in soup.findAll('a'): + limg = item.find('img') + if item.string is not None: + str = item.string + item.replaceWith(str) + else: + if limg: + item.name = 'div' + item.attrs = [] + if not limg.has_key('alt'): + limg['alt'] = 'image' + else: + str = self.tag_to_string(item) + item.replaceWith(str) + return soup