# calibre/recipes/cracked_com.recipe

from calibre.web.feeds.news import BasicNewsRecipe

class Cracked(BasicNewsRecipe):
    """calibre news recipe for Cracked.com.

    Pulls articles from the site's FeedBurner feed, trims each downloaded
    page around the ``PrimaryContent`` element and, for articles that are
    split across several pages, stitches the follow-up pages into the first
    one (see ``appendPage``).
    """

    title                 = u'Cracked.com'
    __author__            = 'UnWeave'
    language              = 'en'
    description           = "America's Only HumorSite since 1958"
    publisher             = 'Cracked'
    category              = 'comedy, lists'
    oldest_article        = 3  # days
    max_articles_per_feed = 100
    no_stylesheets        = True
    encoding              = 'ascii'
    remove_javascript     = True
    use_embedded_content  = False

    feeds = [(u'Articles', u'http://feeds.feedburner.com/CrackedRSS/')]

    # Metadata handed to the conversion pipeline; reuses the values above so
    # they only have to be maintained in one place.
    conversion_options = {
        'comment':   description,
        'tags':      category,
        'publisher': publisher,
        'language':  language,
    }

    # Keep only what lies between the main content container and the share
    # bar that follows the article.
    remove_tags_before = dict(id='PrimaryContent')
    remove_tags_after = dict(name='div', attrs={'class': 'shareBar'})

    # Social widgets and counters that remain inside the kept region.
    remove_tags = [
        dict(name='div', attrs={'class': ['social',
                                          'FacebookLike',
                                          'shareBar']}),
        dict(name='div', attrs={'id': ['inline-share-buttons']}),
        dict(name='span', attrs={'class': ['views',
                                           'KonaFilter']}),
    ]

    # Zero-based index of the <section> tag that holds the article content
    # on a follow-up page (i.e. the 8th <section>). This is tied to the
    # site's markup and is the first thing to check if pagination breaks.
    article_section_index = 7

    def appendPage(self, soup, appendTag, position):
        """Pull the remaining pages of a multi-page article into ``appendTag``.

        Looks in ``soup`` for the pagination ``<nav>``; when it links to a
        next page, that page is downloaded, its own follow-up pages are
        appended to it recursively, and its article section is then inserted
        into ``appendTag`` at index ``position``. The pagination ``<nav>`` is
        removed from ``soup`` once the next page has been merged.

        The method returns without modifying anything when there is no
        pagination, no next-page link, no usable ``href`` on that link, or
        when the downloaded page does not contain the expected article
        section.

        :param soup: parsed page to inspect for a pagination block.
        :param appendTag: tag that receives the next page's content.
        :param position: index in ``appendTag`` at which to insert it.
        """
        # Check if article has multiple pages
        pageNav = soup.find('nav', attrs={'class': 'PaginationContent'})
        if pageNav is None:
            return

        # Check not at last page
        nextPage = pageNav.find('a', attrs={'class': 'next'})
        if nextPage is None:
            return

        # A "next" anchor without a target cannot be followed; treat it the
        # same as being on the last page instead of raising KeyError.
        nextPageURL = nextPage.get('href')
        if not nextPageURL:
            return

        nextPageSoup = self.index_to_soup(nextPageURL)

        # The article body is located purely by position, so make sure the
        # page really has that many sections before indexing into the list.
        sections = nextPageSoup.findAll('section')
        if len(sections) <= self.article_section_index:
            self.log.warn(
                'Cracked: expected at least %d <section> tags on %s, found '
                '%d; skipping remaining pages'
                % (self.article_section_index + 1, nextPageURL, len(sections)))
            return
        nextPageContent = sections[self.article_section_index]

        # Merge any further pages into this one first, so that the content
        # inserted below already contains the rest of the article.
        newPosition = len(nextPageContent.contents)
        self.appendPage(nextPageSoup, nextPageContent, newPosition)

        nextPageContent.extract()
        pageNav.extract()
        appendTag.insert(position, nextPageContent)

    def preprocess_html(self, soup):
        """Merge follow-up pages into the article body and return ``soup``.

        :param soup: parsed HTML of the downloaded article page.
        :return: the same ``soup`` object, modified in place when further
            pages were found.
        """
        body = soup.body
        # Without a <body> there is nothing to append to; hand the document
        # back untouched rather than failing inside appendPage.
        if body is not None:
            self.appendPage(soup, body, 3)
        return soup