from calibre.web.feeds.news import BasicNewsRecipe


class Cracked(BasicNewsRecipe):
    """Recipe that builds an e-book from the Cracked.com article feed.

    Articles come from a single RSS feed.  Each downloaded page is trimmed
    to its content containers (``keep_only_tags``), stripped of social,
    advertising and recommendation widgets (``remove_tags``), and has its
    lazily loaded images resolved to real ``src`` URLs.  Multi-page
    articles are followed through their "next" pagination links.
    """

    title = u'Cracked.com'
    __author__ = 'UnWeave'
    language = 'en'
    description = "America's Only HumorSite since 1958"
    publisher = 'Cracked'
    category = 'comedy, lists'
    oldest_article = 3  # days
    max_articles_per_feed = 100
    no_stylesheets = True
    encoding = 'utf-8'
    remove_javascript = True
    use_embedded_content = False
    # Link-following depth; only links accepted by is_link_wanted() (the
    # pagination "next" links) are meant to be followed.
    recursions = 11
    remove_attributes = ['size', 'style']
    feeds = [(u'Articles', u'http://feeds.feedburner.com/CrackedRSS/')]

    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    # Containers that hold the article header and body.
    keep_only_tags = [
        dict(name='div', attrs={'class': [
            'content-content',
            'contentWrapper',
            'content-header',
        ]}),
        dict(name='article', attrs={'class': [
            'module article dropShadowBottomCurved',
            'module blog dropShadowBottomCurved',
        ]}),
    ]

    # Sharing widgets, ads, breadcrumbs, comments and "related" modules.
    remove_tags = [
        dict(name='section', attrs={'class': [
            'socialTools',
            'quickFixModule',
            'continue-reading',
        ]}),
        dict(attrs={'class': [
            'socialShareAfterContent',
            'socialShareModule',
            'continue-reading',
            'social-share-bottom list-inline',
        ]}),
        dict(name='div', attrs={'id': [
            'relatedArticle',
            'content-card-top',
            'recommendedForYourPleasure',
            'navbar',
            'flashbackModuleWrap',
            'moreRecommendedArticles',
        ]}),
        dict(name='div', attrs={'class': [
            'comments-wrap',
            'container continue-reading',
            'row breadcrumbs-wrapper',
            'btn-social-favorites col',
            'hidden-social col',
            'ajax-loader comments-loader-bottom',
            'flashback-module-new',
            'card-md-list card-sm-list card-xs-list',
            'popular-module card-md-list card-sm-list card-xs-list',
            'col-md-12 list-title',
            'content-cards d-flex flex-wrap',
            'google-plus btn btn-social',
            'twitter btn btn-social',
            'facebook btn btn-social',
            'row social-share-top-wrapper',
        ]}),
        dict(name='h4', attrs={'class': ['mobile-ad-label']}),
        dict(name='ul', attrs={'id': ['breadcrumbs', 'socialShare']}),
        dict(name='ul', attrs={'class': ['list-unstyled offcanvas-sections']}),
        dict(name='div', attrs={'class': [
            'bannerAd hidden-sm hidden-md hidden-lg introAd',
        ]}),
    ]

    def is_link_wanted(self, url, a):
        """Return True only for an article's "next page" pagination link.

        :param url: Target URL of the link (not used by this check).
        :param a: The anchor tag the link was found in.
        :return: True when ``a`` carries the ``next`` class and sits inside
            a ``<nav class="PaginationContent">`` element, else False.
        """
        # The class attribute may be absent, a plain string, or a list of
        # tokens depending on the parser, so normalise before testing.
        # Comparing it directly to 'next' fails for the list form and a
        # bare a['class'] raises KeyError when the attribute is missing.
        classes = a.get('class') or ()
        if hasattr(classes, 'split'):
            classes = classes.split()
        if 'next' not in classes:
            return False
        pager = a.findParent('nav', attrs={'class': 'PaginationContent'})
        return pager is not None

    def preprocess_html(self, soup):
        """Point every lazily loaded image at its real source URL.

        :param soup: Parsed page, modified in place.
        :return: The same ``soup`` object.
        """
        # Order matters: when an image carries several of these attributes
        # the last one listed wins, matching the original precedence.
        for lazy_attr in ('data-img', 'data-original', 'data-src'):
            for img in soup.findAll('img', attrs={lazy_attr: True}):
                img['src'] = img[lazy_attr]
        return soup

    def postprocess_html(self, soup, first_fetch):
        """Drop pagination controls and, on follow-up pages, the meta block.

        :param soup: Parsed page, modified in place.
        :param first_fetch: Flag supplied by the framework; per the calibre
            API it is true for the first page of an article.
        :return: The same ``soup`` object.
        """
        for pager in soup.findAll(attrs={'class': 'PaginationContent'}):
            pager.extract()
        if not first_fetch:
            # Elements with class "meta" are kept only on the first page.
            for meta in soup.findAll(attrs={'class': 'meta'}):
                meta.extract()
        return soup