# Provenance: added as resources/recipes/bay_citizen.recipe in commit
# 0f5b96c51b8dc0d9515d1177ed5cb094edd0806a ("The Bay Citizen by noah",
# Kovid Goyal, Fri, 11 Mar 2011).
from calibre.web.feeds.news import BasicNewsRecipe


class TheBayCitizen(BasicNewsRecipe):
    """calibre news recipe for The Bay Citizen.

    Articles are taken from the site's story feed.  Stories that are split
    across several pages are stitched back together in ``preprocess_html``
    by following each page's ``stry-next`` link.
    """

    # Recipe metadata and fetch settings (attribute meanings are defined by
    # calibre's BasicNewsRecipe API).
    title = 'The Bay Citizen'
    language = 'en'
    __author__ = 'noah'
    description = 'The Bay Citizen'
    publisher = 'The Bay Citizen'
    # Site root; prepended to the site-relative "next page" links.
    INDEX = u'http://www.baycitizen.org'
    category = 'news'
    oldest_article = 2
    max_articles_per_feed = 20
    no_stylesheets = True
    masthead_url = 'http://media.baycitizen.org/images/layout/logo1.png'
    feeds = [('Main Feed', 'http://www.baycitizen.org/feeds/stories/')]

    # Keep only the story container, then drop the page furniture inside it.
    keep_only_tags = [dict(name='div', attrs={'class': 'story'})]
    remove_tags = [
        dict(name='div', attrs={'class': 'socialBar'}),
        dict(name='div', attrs={'id': 'text-resize'}),
        dict(name='div', attrs={'class': 'story relatedContent'}),
        dict(name='div', attrs={'id': 'comment_status_loading'}),
    ]

    def append_page(self, soup, appendtag, position):
        """Fetch the page after ``soup`` and splice its body into ``appendtag``.

        Looks for an ``<a class="stry-next">`` link in ``soup``.  If one is
        present, the linked page is downloaded, its ``<div class="body">`` is
        stripped of inline styles, any further pages are appended to the end
        of that div recursively, and the div is inserted into ``appendtag``
        at child index ``position``.

        Does nothing when there is no next link, when the link carries no
        ``href``, or when the fetched page has no ``<div class="body">``.

        :param soup: parsed page to search for the next-page link.
        :param appendtag: tag that receives the next page's body.
        :param position: child index in ``appendtag`` at which to insert.
        :return: None
        """
        pager = soup.find('a', attrs={'class': 'stry-next'})
        if not pager:
            return

        href = pager.get('href')
        if not href:
            # A next-link without a target cannot be followed.
            return

        # Links are normally site-relative; leave absolute ones untouched so
        # the site root is not prepended twice.
        if href.startswith(('http://', 'https://')):
            nexturl = href
        else:
            nexturl = self.INDEX + href

        soup2 = self.index_to_soup(nexturl)
        texttag = soup2.find('div', attrs={'class': 'body'})
        if texttag is None:
            # Unexpected page layout: keep what we already have rather than
            # failing the whole article.
            return

        for styled in texttag.findAll(style=True):
            del styled['style']

        # Recurse first so later pages land at the end of this page's body,
        # then move the assembled body into the caller's tree.
        self.append_page(soup2, texttag, len(texttag.contents))
        texttag.extract()
        appendtag.insert(position, texttag)

    def preprocess_html(self, soup):
        """Clean up a downloaded article and merge its continuation pages.

        Removes inline ``style`` attributes, appends any following pages of
        the story into ``<body>`` (at child index 3), and then removes the
        pagination widgets (``id="story-pagination"``) and the
        ``<em class="cont-from-prev">`` markers from the merged document.

        :param soup: parsed article HTML.
        :return: the same ``soup`` object, modified in place.
        """
        for item in soup.findAll(style=True):
            del item['style']

        self.append_page(soup, soup.body, 3)

        # Plain loops: extract() is called only for its side effect.
        for trash in soup.findAll(id='story-pagination'):
            trash.extract()
        for trash in soup.findAll('em', 'cont-from-prev'):
            trash.extract()
        return soup