# calibre/recipes/cracked_com.recipe

from calibre.web.feeds.news import BasicNewsRecipe


class Cracked(BasicNewsRecipe):
    """Recipe that downloads recent Cracked.com articles from the site's RSS feed.

    Multi-page articles are stitched together by following the "next"
    pagination link of each page (see ``is_link_wanted``).
    """

    # --- Publication metadata -------------------------------------------
    title = u'Cracked.com'
    __author__ = 'UnWeave'
    language = 'en'
    description = "America's Only HumorSite since 1958"
    publisher = 'Cracked'
    category = 'comedy, lists'

    # --- Download behaviour ---------------------------------------------
    oldest_article = 3  # days
    max_articles_per_feed = 100
    no_stylesheets = True
    encoding = 'utf-8'
    remove_javascript = True
    use_embedded_content = False
    # Depth of link following; needed so that the "next page" links accepted
    # by is_link_wanted() are fetched for multi-page articles.
    recursions = 11
    remove_attributes = ['size', 'style']

    feeds = [(u'Articles', u'http://feeds.feedburner.com/CrackedRSS/')]

    conversion_options = {
        'comment': description, 'tags': category, 'publisher': publisher, 'language': language
    }

    # Keep only the main <article> container (regular articles and blog posts).
    keep_only_tags = [dict(name='article', attrs={'class': 'module article dropShadowBottomCurved'}),
                      dict(name='article', attrs={'class': 'module blog dropShadowBottomCurved'})]

    # Drop social-sharing widgets and the "quick fix" module.
    remove_tags = [
        dict(name='section', attrs={
             'class': ['socialTools', 'quickFixModule']}),
        dict(
            attrs={'class': ['socialShareAfterContent', 'socialShareModule']}),
    ]

    def is_link_wanted(self, url, a):
        """Return True only for the "next page" link of the article pagination.

        :param url: The URL of the candidate link (unused here).
        :param a: The ``<a>`` tag that holds the link.
        :return: True if ``a`` carries the ``next`` class and sits inside a
            ``<nav class="PaginationContent">`` element, otherwise False.
        """
        # Beautiful Soup 4 returns ``class`` as a list of tokens, older
        # versions as a plain string, and the attribute may be absent
        # altogether; normalise all three cases to a list of class names.
        css_classes = a.get('class') or []
        if not isinstance(css_classes, (list, tuple)):
            css_classes = css_classes.split()
        if 'next' not in css_classes:
            return False
        pagination = a.findParent('nav', attrs={'class': 'PaginationContent'})
        return pagination is not None

    def preprocess_html(self, soup):
        """Point every image at the URL stored in its ``data-img`` attribute.

        :param soup: Parsed page.
        :return: The same ``soup`` object, modified in place.
        """
        for img in soup.findAll('img', attrs={'data-img': True}):
            img['src'] = img['data-img']
        return soup

    def postprocess_html(self, soup, first_fetch):
        """Strip pagination controls and, on follow-up pages, repeated headers.

        :param soup: Parsed page.
        :param first_fetch: Falsy for pages other than the first one fetched
            for an article; on those pages the ``<h1>`` headings and the
            ``meta`` blocks are removed.
        :return: The same ``soup`` object, modified in place.
        """
        for div in soup.findAll(attrs={'class': 'PaginationContent'}):
            div.extract()
        if not first_fetch:
            for h1 in soup.findAll('h1'):
                h1.extract()
            for div in soup.findAll(attrs={'class': 'meta'}):
                div.extract()
        return soup