__license__ = 'GPL v3'
__copyright__ = '2009-2011, Darko Miletic '
'''
theonion.com
'''

from calibre.web.feeds.news import BasicNewsRecipe


class TheOnion(BasicNewsRecipe):
    """Calibre recipe that downloads The Onion's Daily and Sports feeds.

    The class attributes configure the download (feeds, tag filters, CSS);
    the two overridden hooks drop audio-only entries and simplify the
    markup of each downloaded article.
    """

    title = 'The Onion'
    __author__ = 'Darko Miletic'
    description = "America's finest news source"
    oldest_article = 2
    max_articles_per_feed = 100
    publisher = 'Onion, Inc.'
    category = 'humor, news, USA'
    language = 'en'
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'utf-8'
    publication_type = 'newsportal'
    masthead_url = 'http://o.onionstatic.com/img/headers/onion_190.png'

    # Stylesheet applied to the generated book.  Written as adjacent string
    # literals so the resulting value stays identical to the original
    # single-line string.
    extra_css = (
        " body{font-family: Helvetica,Arial,sans-serif} "
        ".section_title{color: gray; text-transform: uppercase} "
        ".title{font-family: Georgia,serif} "
        ".meta{color: gray; display: inline} "
        ".has_caption{display: block} "
        ".caption{font-size: x-small; color: gray; margin-bottom: 0.8em} "
    )

    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    # Only these parts of the page are kept ...
    keep_only_tags = [
        dict(name='h2', attrs={'class': ['section_title', 'title']}),
        dict(attrs={'class': ['main_image', 'meta', 'article_photo_lead',
                              'article_body']}),
        dict(attrs={'id': ['entries']}),
    ]
    remove_attributes = ['lang', 'rel']
    # ... everything following the article body is cut ...
    remove_tags_after = dict(attrs={'class': ['article_body',
                                              'feature_content']})
    # ... and these elements are removed from what remains.
    remove_tags = [
        dict(name=['object', 'link', 'iframe', 'base', 'meta']),
        dict(name='div', attrs={'class': ['toolbar_side',
                                          'graphical_feature',
                                          'toolbar_bottom']}),
        dict(name='div', attrs={'id': ['recent_slider', 'sidebar',
                                       'pagination', 'related_media']}),
    ]

    feeds = [
        (u'Daily', u'http://feeds.theonion.com/theonion/daily'),
        (u'Sports', u'http://feeds.theonion.com/theonion/sports'),
    ]

    def get_article_url(self, article):
        """Return the URL for *article*, or None if it should be skipped.

        Entries whose URL points into the site's ``/audio/`` section are
        skipped by returning None.  A missing URL from the base
        implementation is passed through unchanged instead of raising
        AttributeError on ``None.startswith``.
        """
        url = BasicNewsRecipe.get_article_url(self, article)
        if url and url.startswith('http://www.theonion.com/audio/'):
            return None
        return url

    def preprocess_html(self, soup):
        """Strip inline styles and unwrap hyperlinks in *soup*.

        Every ``style`` attribute is deleted.  Each ``<a>`` element is then
        handled in one of three ways:

        * a link with a single string child is replaced by that string;
        * a link containing an image becomes an attribute-less ``<div>``,
          and the image gets a placeholder ``alt`` if it has none;
        * any other link is replaced by its flattened text.

        Returns the same (mutated) soup object.
        """
        for styled in soup.findAll(style=True):
            del styled['style']

        for link in soup.findAll('a'):
            image = link.find('img')
            if link.string is not None:
                link.replaceWith(link.string)
            elif image:
                link.name = 'div'
                # Reset to an empty container of the parser's own type
                # (list in BeautifulSoup 3, dict in bs4) rather than
                # hard-coding a list.
                link.attrs = type(link.attrs)()
                # Tag.get() is available in both BeautifulSoup 3 and bs4,
                # unlike the legacy has_key().
                if image.get('alt') is None:
                    image['alt'] = 'image'
            else:
                link.replaceWith(self.tag_to_string(link))
        return soup