calibre/recipes/ceska_pozice.recipe

# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import unicode_literals
from calibre.web.feeds.recipes import BasicNewsRecipe


class ceskaPoziceRecipe(BasicNewsRecipe):
    __author__ = 'bubak'
    title = u'Česká pozice'
    description = 'Česká pozice'
    oldest_article = 2
    max_articles_per_feed = 20

    feeds = [
        (u'Všechny články', u'http://www.ceskapozice.cz/rss.xml'),
        (u'Domov', u'http://www.ceskapozice.cz/taxonomy/term/16/feed'),
        (u'Chrono', u'http://www.ceskapozice.cz/chrono/feed'),
        (u'Evropa', u'http://www.ceskapozice.cz/taxonomy/term/17/feed')
    ]

    language = 'cs'
    cover_url = 'http://www.ceskapozice.cz/sites/default/files/cpozice_logo.png'
    remove_javascript = True
    no_stylesheets = True
    domain = u'http://www.ceskapozice.cz'
    use_embedded_content = False

    # Strip ads, comment sections and pagination chrome from article pages.
    remove_tags = [dict(name='div', attrs={'class': ['block-ad', 'region region-content-ad']}),
                   dict(name='ul', attrs={'class': 'links'}),
                   dict(name='div', attrs={'id': ['comments', 'back-to-top']}),
                   dict(name='div', attrs={'class': ['next-page', 'region region-content-ad']}),
                   dict(name='cite')]
    keep_only_tags = [dict(name='div', attrs={'id': 'content'})]

    # URLs already accepted; the 'Všechny články' feed overlaps the category feeds.
    visited_urls = {}

    def get_article_url(self, article):
        # Skip any URL that has already been returned by an earlier feed.
        url = BasicNewsRecipe.get_article_url(self, article)
        if url in self.visited_urls:
            self.log.debug('Ignoring duplicate: ' + url)
            return None
        else:
            self.visited_urls[url] = True
            self.log.debug('Accepting: ' + url)
            return url

    def preprocess_html(self, soup):
        # Articles can be split across several pages; pull the rest into this one.
        self.append_page(soup, soup.body, 3)
        return soup

    def append_page(self, soup, appendtag, position):
        pager = soup.find('div', attrs={'class': 'paging-bottom'})
        if pager:
            nextbutton = pager.find('li', attrs={'class': 'pager-next'})
            if nextbutton:
                nexturl = self.domain + nextbutton.a['href']
                soup2 = self.index_to_soup(nexturl)
                texttag = soup2.find('div', attrs={'class': 'main-body'})
                # Drop ad regions and <cite> blocks from the fetched page.
                for it in texttag.findAll('div', attrs={'class': 'region region-content-ad'}):
                    it.extract()
                for it in texttag.findAll('cite'):
                    it.extract()
                # Recurse to collect any further pages, then graft the body in
                # and remove the pager so it does not appear in the output.
                newpos = len(texttag.contents)
                self.append_page(soup2, texttag, newpos)
                texttag.extract()
                appendtag.insert(position, texttag)
                pager.extract()