# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai

from __future__ import unicode_literals

from calibre.web.feeds.recipes import BasicNewsRecipe


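# Recipe for the Czech news and commentary site Česká pozice (ceskapozice.cz).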
class ceskaPoziceRecipe(BasicNewsRecipe):
    __author__ = 'bubak'
    title = u'Česká pozice'
    description = 'Česká pozice'
    oldest_article = 2
    max_articles_per_feed = 20

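    # The combined feed plus selected section feeds.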
    feeds = [
        (u'Všechny články', u'http://www.ceskapozice.cz/rss.xml'),
        (u'Domov', u'http://www.ceskapozice.cz/taxonomy/term/16/feed'),
        (u'Chrono', u'http://www.ceskapozice.cz/chrono/feed'),
        (u'Evropa', u'http://www.ceskapozice.cz/taxonomy/term/17/feed')
    ]

    language = 'cs'
    cover_url = 'http://www.ceskapozice.cz/sites/default/files/cpozice_logo.png'
    remove_javascript = True
    no_stylesheets = True
    domain = u'http://www.ceskapozice.cz'
    use_embedded_content = False

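    # Strip ad blocks, link lists, comments, navigation helpers and <cite>
    # elements from the downloaded article body.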
    remove_tags = [dict(name='div', attrs={'class': ['block-ad', 'region region-content-ad']}),
                   dict(name='ul', attrs={'class': 'links'}),
                   dict(name='div', attrs={'id': ['comments', 'back-to-top']}),
                   dict(name='div', attrs={'class': ['next-page', 'region region-content-ad']}),
                   dict(name='cite')]

    keep_only_tags = [dict(name='div', attrs={'id': 'content'})]

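    # URLs already accepted in this run; used to drop articles that appear
    # in more than one feed.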
    visited_urls = {}

    def get_article_url(self, article):
        url = BasicNewsRecipe.get_article_url(self, article)
        if url in self.visited_urls:
            self.log.debug('Ignoring duplicate: ' + url)
            return None
        else:
            self.visited_urls[url] = True
            self.log.debug('Accepting: ' + url)
            return url

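    # Multi-page articles: pull the remaining pages into the first page
    # before conversion.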
    def preprocess_html(self, soup):
        self.append_page(soup, soup.body, 3)
        return soup

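    # Follow the 'next page' link in the bottom pager, clean the fetched page,
    # recurse for any further pages and insert the extra content into appendtag.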
    def append_page(self, soup, appendtag, position):
        pager = soup.find('div', attrs={'class': 'paging-bottom'})
        if pager:
            nextbutton = pager.find('li', attrs={'class': 'pager-next'})
            if nextbutton:
                nexturl = self.domain + nextbutton.a['href']
                soup2 = self.index_to_soup(nexturl)
                texttag = soup2.find('div', attrs={'class': 'main-body'})
                for it in texttag.findAll('div', attrs={'class': 'region region-content-ad'}):
                    it.extract()
                for it in texttag.findAll('cite'):
                    it.extract()
                newpos = len(texttag.contents)
                self.append_page(soup2, texttag, newpos)
                texttag.extract()
                appendtag.insert(position, texttag)
            pager.extract()