from calibre.web.feeds.news import BasicNewsRecipe


class Cracked(BasicNewsRecipe):
    """Calibre news recipe for the Cracked.com article feed.

    Multi-page articles are stitched into a single document: each
    follow-up page is fetched and its article section is inserted into
    the page that linked to it (see ``appendPage``).
    """

    title = u'Cracked.com'
    __author__ = 'UnWeave'
    language = 'en'
    description = "America's Only HumorSite since 1958"
    publisher = 'Cracked'
    category = 'comedy, lists'
    oldest_article = 3  # days
    max_articles_per_feed = 100
    no_stylesheets = True
    encoding = 'ascii'
    remove_javascript = True
    use_embedded_content = False

    feeds = [(u'Articles', u'http://feeds.feedburner.com/CrackedRSS/')]

    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    # Keep only the markup between the primary content container and the
    # share bar that follows the article.
    remove_tags_before = dict(id='PrimaryContent')
    remove_tags_after = dict(name='div', attrs={'class': 'shareBar'})

    remove_tags = [
        dict(name='div', attrs={'class': ['social', 'FacebookLike', 'shareBar']}),
        dict(name='div', attrs={'id': ['inline-share-buttons']}),
        dict(name='span', attrs={'class': ['views', 'KonaFilter']}),
        # Add dict(name='img') here to strip images as well.
    ]

    # Zero-based index of the 'section' element that holds the article
    # content on a follow-up page (i.e. the 8th 'section' tag).
    _ARTICLE_SECTION_INDEX = 7

    def appendPage(self, soup, appendTag, position):
        """Fetch the pages that follow ``soup`` and splice them into ``appendTag``.

        Looks for the pagination ``nav`` in ``soup``; if it has a "next"
        link, that page is downloaded, its article section is located,
        any further pages are recursively appended to that section, and
        the section is then inserted into ``appendTag`` at child index
        ``position``. The pagination ``nav`` is removed from ``soup`` once
        the next page has been merged.

        If there is no pagination, no "next" link, the link has no
        ``href``, or the next page does not have the expected number of
        ``section`` elements, ``soup`` is left unchanged.

        :param soup: parsed page to inspect for a pagination block.
        :param appendTag: tag that receives the next page's content.
        :param position: child index in ``appendTag`` at which to insert.
        """
        pageNav = soup.find('nav', attrs={'class': 'PaginationContent'})
        if pageNav is None:
            # Single-page article.
            return

        nextPage = pageNav.find('a', attrs={'class': 'next'})
        if nextPage is None:
            # Already on the last page.
            return

        nextPageURL = nextPage.get('href')
        if not nextPageURL:
            self.log.warn('Cracked: "next" link has no href; '
                          'skipping remaining pages')
            return

        nextPageSoup = self.index_to_soup(nextPageURL)

        sections = nextPageSoup.findAll('section')
        if len(sections) <= self._ARTICLE_SECTION_INDEX:
            # Unexpected layout: keep what has been collected so far
            # instead of failing on an out-of-range index.
            self.log.warn('Cracked: unexpected page layout at %s; '
                          'skipping remaining pages' % nextPageURL)
            return
        nextPageContent = sections[self._ARTICLE_SECTION_INDEX]

        # Recurse before moving the section so that any later pages are
        # appended at the end of this page's content first.
        newPosition = len(nextPageContent.contents)
        self.appendPage(nextPageSoup, nextPageContent, newPosition)

        nextPageContent.extract()
        pageNav.extract()
        appendTag.insert(position, nextPageContent)

    def preprocess_html(self, soup):
        """Merge any follow-up pages into ``soup`` and return it.

        The merged content is inserted at child index 3 of ``soup.body``.
        NOTE(review): index 3 is presumably tuned to the site's page
        layout -- confirm against a current article page.
        """
        self.appendPage(soup, soup.body, 3)
        return soup