Fix #8735 (Updated recipe for The Onion)

2025-08-30 23:00:21 -04:00 · 2011-02-03 07:57:53 -07:00 · 2011-02-03 07:57:53 -07:00 · be76d5b41e
commit be76d5b41e
parent ac40d2e66d
1 changed files with 57 additions and 21 deletions
--- a/resources/recipes/theonion.recipe
+++ b/resources/recipes/theonion.recipe
@ -1,7 +1,5 @@
-#!/usr/bin/env  python
-
 __license__   = 'GPL v3'
-__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2009-2011, Darko Miletic <darko.miletic at gmail.com>'

 '''
 theonion.com
@ -15,26 +13,39 @@ class TheOnion(BasicNewsRecipe):
    description           = "America's finest news source"
    oldest_article        = 2
    max_articles_per_feed = 100
-    publisher             = u'Onion, Inc.'
-    category              = u'humor, news, USA'    
-    language = 'en'
-
+    publisher             = 'Onion, Inc.'
+    category              = 'humor, news, USA'
+    language              = 'en'
    no_stylesheets        = True
    use_embedded_content  = False
    encoding              = 'utf-8'
-    remove_javascript     = True
-    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' 
+    publication_type      = 'newsportal'
+    masthead_url          = 'http://o.onionstatic.com/img/headers/onion_190.png'
+    extra_css             = """
+                                body{font-family: Helvetica,Arial,sans-serif}
+                                .section_title{color: gray; text-transform: uppercase}
+                                .title{font-family: Georgia,serif}
+                                .meta{color: gray; display: inline}
+                                .has_caption{display: block}
+                                .caption{font-size: x-small; color: gray; margin-bottom: 0.8em}
+                            """

-    html2lrf_options = [
-                          '--comment'       , description
-                        , '--category'      , category
-                        , '--publisher'     , publisher
-                        ]
-
-    keep_only_tags = [dict(name='div', attrs={'id':'main'})]
+    conversion_options = {
+                          'comment'  : description
+                        , 'tags'     : category
+                        , 'publisher': publisher
+                        , 'language' : language
+                        }

+    keep_only_tags = [
+                         dict(name='h2', attrs={'class':['section_title','title']})
+                        ,dict(attrs={'class':['main_image','meta','article_photo_lead','article_body']})
+                        ,dict(attrs={'id':['entries']})
+                     ]
+    remove_attributes=['lang','rel']
+    remove_tags_after = dict(attrs={'class':['article_body','feature_content']})
    remove_tags = [
-                     dict(name=['object','link','iframe','base'])
+                     dict(name=['object','link','iframe','base','meta'])
                    ,dict(name='div', attrs={'class':['toolbar_side','graphical_feature','toolbar_bottom']})
                    ,dict(name='div', attrs={'id':['recent_slider','sidebar','pagination','related_media']})
                  ]
@ -44,3 +55,28 @@ class TheOnion(BasicNewsRecipe):
              (u'Daily'  , u'http://feeds.theonion.com/theonion/daily' )
             ,(u'Sports' , u'http://feeds.theonion.com/theonion/sports' )
            ]
+
+    def get_article_url(self, article):
+        artl = BasicNewsRecipe.get_article_url(self, article)
+        if artl.startswith('http://www.theonion.com/audio/'):
+           artl = None
+        return artl
+
+    def preprocess_html(self, soup):
+        for item in soup.findAll(style=True):
+            del item['style']
+        for item in soup.findAll('a'):
+            limg = item.find('img')
+            if item.string is not None:
+               str = item.string
+               item.replaceWith(str)
+            else:
+               if limg:
+                  item.name  = 'div'
+                  item.attrs = []
+                  if not limg.has_key('alt'):
+                     limg['alt'] = 'image'
+               else:
+                   str = self.tag_to_string(item)
+                   item.replaceWith(str)
+        return soup