Fix cracked.com

2025-07-09 03:04:10 -04:00 · 2011-07-04 07:58:38 -06:00 · 2011-07-04 07:58:38 -06:00 · a2c6d7c214
commit a2c6d7c214
parent 3e4517b3c4
1 changed files with 39 additions and 59 deletions
--- a/recipes/cracked_com.recipe
+++ b/recipes/cracked_com.recipe
@@ -1,83 +1,63 @@
 from calibre.web.feeds.news import BasicNewsRecipe
 import re
 class Cracked(BasicNewsRecipe):
    title                 = u'Cracked.com'
-    __author__            = u'Nudgenudge'
+    __author__            = 'UnWeave'
    language              = 'en'
-    description            = 'America''s Only Humor and Video Site, since 1958'
+    description           = "America's Only HumorSite since 1958"
    publisher             = 'Cracked'
    category              = 'comedy, lists'
-    oldest_article        = 2
+    oldest_article        = 3 #days
-    delay                 = 10
+    max_articles_per_feed = 100
    max_articles_per_feed = 2
    no_stylesheets        = True
-    encoding              = 'cp1252'
+    encoding              = 'ascii'
    remove_javascript     = True
    use_embedded_content  = False
-    INDEX                 = u'http://www.cracked.com'
+
-    extra_css             = """
+    feeds = [ (u'Articles', u'http://feeds.feedburner.com/CrackedRSS/') ]
                                .pageheader_type{font-size: x-large; font-weight: bold; color: #828D74}
                                .pageheader_title{font-size: xx-large; color: #394128}
                                .pageheader_byline{font-size: small; font-weight: bold; color: #394128}
                                .score_bg {display: inline; width: 100%; margin-bottom: 2em}
                                .score_column_1{ padding-left: 10px; font-size: small; width: 50%}
                                .score_column_2{ padding-left: 10px; font-size: small; width: 50%}
                                .score_column_3{ padding-left: 10px; font-size: small; width: 50%}
                                .score_header{font-size: large; color: #50544A}
                                .bodytext{display: block}
                                body{font-family: Helvetica,Arial,sans-serif}
                            """
    conversion_options = {
                          'comment'   : description
                        , 'tags'      : category
                        , 'publisher' : publisher
                        , 'language'  : language
                        , 'linearize_tables' : True
                        }
-    keep_only_tags    =  [
+    remove_tags_before = dict(id='PrimaryContent')
                        dict(name='div', attrs={'class':['Column1']})
                        ]
-    feeds = [(u'Articles', u'http://feeds.feedburner.com/CrackedRSS')]
+    remove_tags_after = dict(name='div', attrs={'class':'shareBar'})
-    def get_article_url(self, article):
+    remove_tags = [ dict(name='div', attrs={'class':['social',
-        return article.get('guid',  None)
+                                                     'FacebookLike',
                                                     'shareBar'
                                                     ]}),
-    def cleanup_page(self, soup):
+                    dict(name='div', attrs={'id':['inline-share-buttons',
-        for item in soup.findAll(style=True):
+                                                  ]}),
            del item['style']
        for alink in soup.findAll('a'):
            if alink.string is not None:
                tstr = alink.string
                alink.replaceWith(tstr)
        for div_to_remove in soup.findAll('div', attrs={'id':['googlead_1','fb-like-article','comments_section']}):
            div_to_remove.extract()
        for div_to_remove in soup.findAll('div', attrs={'class':['share_buttons_col_1','GenericModule1']}):
            div_to_remove.extract()
        for div_to_remove in soup.findAll('div', attrs={'class':re.compile("prev_next")}):
            div_to_remove.extract()
        for ul_to_remove in soup.findAll('ul', attrs={'class':['Nav6']}):
            ul_to_remove.extract()
        for image in soup.findAll('img', attrs={'alt': 'article image'}):
            image.extract()
-    def append_page(self, soup, appendtag, position):
+                    dict(name='span', attrs={'class':['views',
-        pager = soup.find('a',attrs={'class':'next_arrow_active'})
+                                                      'KonaFilter'
-        if pager:
+                                                      ]}),
-            nexturl = self.INDEX + pager['href']
+                    #dict(name='img'),
-            soup2 = self.index_to_soup(nexturl)
+                    ]
-            texttag = soup2.find('div', attrs={'class':re.compile("userStyled")})
+
-            newpos = len(texttag.contents)
+    def appendPage(self, soup, appendTag, position):
-            self.append_page(soup2,texttag,newpos)
+        # Check whether the article spans multiple pages
-            texttag.extract()
+        pageNav = soup.find('nav', attrs={'class':'PaginationContent'})
-            self.cleanup_page(appendtag)
+        if pageNav:
-            appendtag.insert(position,texttag)
+            # Check that this is not the last page (a "next" link is present)
-        else:
+            nextPage = pageNav.find('a', attrs={'class':'next'})
-            self.cleanup_page(appendtag)
+            if nextPage:
                nextPageURL = nextPage['href']
                nextPageSoup = self.index_to_soup(nextPageURL)
                # The 8th <section> tag (index 7) contains the article content
                nextPageContent = nextPageSoup.findAll('section')[7]
                newPosition = len(nextPageContent.contents)
                self.appendPage(nextPageSoup,nextPageContent,newPosition)
                nextPageContent.extract()
                pageNav.extract()
                appendTag.insert(position,nextPageContent)
    def preprocess_html(self, soup):
-        self.append_page(soup, soup.body, 3)
+        self.appendPage(soup, soup.body, 3)
-        return self.adeify_images(soup)
+        return soup