from calibre.web.feeds.news import BasicNewsRecipe


class TheBayCitizen(BasicNewsRecipe):
    """Recipe for The Bay Citizen (baycitizen.org).

    Articles come from the site's single stories feed.  Stories that are
    split over several pages are merged into one document by following the
    "next page" link and splicing each following page's body into the
    first page (see ``append_page``).
    """

    title = 'The Bay Citizen'
    language = 'en'
    __author__ = 'noah'
    description = 'The Bay Citizen'
    publisher = 'The Bay Citizen'
    # Site root; next-page hrefs are appended to it to build the full URL.
    INDEX = u'http://www.baycitizen.org'
    category = 'news'
    oldest_article = 2
    max_articles_per_feed = 20
    no_stylesheets = True
    masthead_url = 'http://media.baycitizen.org/images/layout/logo1.png'

    feeds = [('Main Feed', 'http://www.baycitizen.org/feeds/stories/')]

    keep_only_tags = [dict(name='div', attrs={'class': 'story'})]
    remove_tags = [
        dict(name='div', attrs={'class': 'socialBar'}),
        dict(name='div', attrs={'id': 'text-resize'}),
        dict(name='div', attrs={'class': 'story relatedContent'}),
        dict(name='div', attrs={'id': 'comment_status_loading'}),
    ]

    def append_page(self, soup, appendtag, position):
        """Fetch the pages that follow ``soup`` and splice them in.

        Looks for an ``<a class="stry-next">`` link in ``soup``.  If one
        exists, the linked page is downloaded, its ``<div class="body">``
        is stripped of inline ``style`` attributes, any further pages are
        appended to that div recursively, and the div is finally inserted
        into ``appendtag`` at index ``position``.

        :param soup: parsed page in which to look for the next-page link.
        :param appendtag: tag that receives the next page's body div.
        :param position: index in ``appendtag`` at which to insert it.
        """
        pager = soup.find('a', attrs={'class': 'stry-next'})
        if pager is None:
            # Last (or only) page: nothing to append.
            return

        nexturl = self.INDEX + pager['href']
        soup2 = self.index_to_soup(nexturl)
        texttag = soup2.find('div', attrs={'class': 'body'})
        if texttag is None:
            # The follow-up page has no body div (unexpected markup).  Stop
            # here and keep what was collected so far instead of failing
            # with an AttributeError on None.
            return

        for styled in texttag.findAll(style=True):
            del styled['style']

        # Later pages go at the end of this page's body, so that the whole
        # chain is already nested inside ``texttag`` when it is inserted.
        newpos = len(texttag.contents)
        self.append_page(soup2, texttag, newpos)

        # Detach from the downloaded document before moving it over.
        texttag.extract()
        appendtag.insert(position, texttag)

    def preprocess_html(self, soup):
        """Clean up an article page and merge its continuation pages.

        Removes every inline ``style`` attribute, appends the following
        pages of the story into ``soup.body``, then drops the pagination
        widgets (``id="story-pagination"``) and the
        ``<em class="cont-from-prev">`` markers.

        :param soup: parsed article page.
        :return: the same ``soup`` object, modified in place.
        """
        for item in soup.findAll(style=True):
            del item['style']

        self.append_page(soup, soup.body, 3)

        # Pagination leftovers are removed only after merging, so that the
        # ones brought in by the appended pages are removed as well.
        for trash in soup.findAll(id='story-pagination'):
            trash.extract()
        for trash in soup.findAll('em', 'cont-from-prev'):
            trash.extract()

        return soup