# calibre/recipes/bay_citizen.recipe

from calibre.web.feeds.news import BasicNewsRecipe


class TheBayCitizen(BasicNewsRecipe):
    """News recipe for The Bay Citizen.

    Articles are taken from the site's story feed.  Stories that are split
    across several pages are stitched back together in ``preprocess_html``
    by following the "next page" link and splicing each follow-up page's
    body into the first page.
    """

    # Descriptive metadata for the generated publication.
    title = 'The Bay Citizen'
    language = 'en'
    __author__ = 'noah'
    description = 'The Bay Citizen'
    publisher = 'The Bay Citizen'
    # Site root; the "next page" hrefs are appended to this in append_page.
    INDEX = u'http://www.baycitizen.org'
    category = 'news'
    # Feed limits and rendering options read by the BasicNewsRecipe base
    # class (see the calibre recipe API for their exact semantics).
    oldest_article = 2
    max_articles_per_feed = 20
    no_stylesheets = True
    masthead_url = 'http://media.baycitizen.org/images/layout/logo1.png'
    feeds = [('Main Feed', 'http://www.baycitizen.org/feeds/stories/')]
    # Keep only the story container and drop the non-article widgets in it.
    keep_only_tags = [dict(name='div', attrs={'class': 'story'})]
    remove_tags = [
        dict(name='div', attrs={'class': 'socialBar'}),
        dict(name='div', attrs={'id': 'text-resize'}),
        dict(name='div', attrs={'class': 'story relatedContent'}),
        dict(name='div', attrs={'id': 'comment_status_loading'}),
    ]

    def append_page(self, soup, appendtag, position):
        """Fetch the next page of a story and insert its body into appendtag.

        Looks in ``soup`` for an ``<a class="stry-next">`` link.  If one is
        found, the linked page is downloaded, its ``<div class="body">`` is
        stripped of inline ``style`` attributes, any further pages are
        appended to that div recursively, and the div is then inserted into
        ``appendtag`` at index ``position``.

        Does nothing when there is no next-page link, when the link has no
        ``href``, or when the fetched page has no ``div.body``.
        """
        pager = soup.find('a', attrs={'class': 'stry-next'})
        if not pager:
            return

        # A pager anchor without an href gives us nothing to follow; the
        # subscript form pager['href'] would raise KeyError here.
        href = pager.get('href')
        if not href:
            return

        nexturl = self.INDEX + href
        soup2 = self.index_to_soup(nexturl)
        texttag = soup2.find('div', attrs={'class': 'body'})
        if texttag is None:
            # The follow-up page has no story body (layout change, error
            # page, ...).  Keep what we already have instead of crashing
            # with AttributeError on None.
            return

        for it in texttag.findAll(style=True):
            del it['style']

        # Later pages go at the end of this page's body, so the recursion
        # must run before the body is detached from soup2 and moved.
        newpos = len(texttag.contents)
        self.append_page(soup2, texttag, newpos)
        texttag.extract()
        appendtag.insert(position, texttag)

    def preprocess_html(self, soup):
        """Clean up a downloaded story page and merge in its later pages.

        Removes inline ``style`` attributes, pulls in the remaining pages of
        the story via ``append_page`` (inserted into ``soup.body`` at index
        3), then removes the pagination block and the "continued from
        previous page" markers.  Returns the modified ``soup``.
        """
        for item in soup.findAll(style=True):
            del item['style']

        self.append_page(soup, soup.body, 3)

        # Pagination widgets and continuation markers are meaningless once
        # all pages have been merged into one document.
        for trash in soup.findAll(id='story-pagination'):
            trash.extract()
        for trash in soup.findAll('em', 'cont-from-prev'):
            trash.extract()
        return soup