# calibre/recipes/polityka.recipe

#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2010, matek09, matek09@gmail.com'

from calibre.web.feeds.news import BasicNewsRecipe


class Polityka(BasicNewsRecipe):
    """Recipe for the weekly magazine "Polityka", read from its online archive.

    The article list is scraped from ``archiwum.polityka.pl`` by
    :meth:`parse_index`; the class attributes below configure how each
    downloaded article page is cleaned up.
    """

    title = u'Polityka'
    __author__ = 'matek09'
    description = 'Weekly magazine. Last archive issue'
    encoding = 'utf-8'
    no_stylesheets = True
    language = 'pl'
    remove_javascript = True

    # Root of the archive site; the hrefs/srcs scraped from its pages are
    # appended to this to form absolute URLs.
    BASE_URL = 'http://archiwum.polityka.pl'

    # The section heading (h2.box_nag) and the footer box (div.box_footer)
    # delimit the article content ...
    remove_tags_before = dict(name='h2', attrs={'class': 'box_nag'})
    remove_tags_after = dict(name='div', attrs={'class': 'box_footer'})

    # ... and the two delimiting tags themselves are dropped as well.
    remove_tags = [
        dict(name='h2', attrs={'class': 'box_nag'}),
        dict(name='div', attrs={'class': 'box_footer'}),
    ]

    extra_css = '''
                    h1 {font-size: x-large; font-weight: bold}
                    '''

    def parse_index(self):
        """Build the list of feeds for the issue linked last on the archive page.

        The archive front page is fetched and its last ``box_img3`` element
        supplies both the cover image (stored in ``self.cover_url``) and the
        link to the issue. The issue's listing pages are then walked one by
        one until a page without any ``div.box_list`` is reached. Every
        listed article page is downloaded once here to read its section name
        from the ``h2.box_nag`` heading (the text before the first ``/``).

        Returns:
            A list of ``(section_title, articles)`` tuples, where each
            article is a dict with the keys ``title``, ``url``, ``date``
            and ``description``. Sections are grouped per listing page.

        Raises:
            IndexError: if the archive front page has no ``box_img3``
                element.
        """
        base = self.BASE_URL
        soup = self.index_to_soup(base + '/')
        box_img3 = soup.findAll(attrs={'class': 'box_img3'})

        # The last box on the archive page is the issue this recipe fetches.
        issue_box = box_img3[-1]
        self.cover_url = base + issue_box.find('img')['src']
        page_url = base + issue_box.find('a')['href']

        page_prefix = base + '/wydanie/'
        page_no = 0
        feeds = []

        while True:
            index = self.index_to_soup(page_url)

            box_list = index.findAll('div', attrs={'class': 'box_list'})
            if not box_list:
                # A page without article boxes marks the end of the issue.
                break

            articles = {}
            for box in box_list:
                for div in box.findAll('div', attrs={'class': 'list_tresc'}):
                    link = div.a
                    if link is None or not link.get('href'):
                        # Entry without a usable link: nothing to download.
                        continue
                    url = base + link['href']
                    # The listing does not carry the section name, so it is
                    # read from the heading of the article page itself.
                    article_page = self.index_to_soup(url)
                    heading = article_page.find(
                        'h2', attrs={'class': 'box_nag'})
                    section = self.tag_to_string(
                        heading).split('/')[0].strip()
                    articles.setdefault(section, []).append({
                        'title': self.tag_to_string(link),
                        'url': url,
                        'date': '',
                        'description': ''
                    })

            for section in articles:
                feeds.append((section, articles[section]))

            # Advance to the next listing page by bumping the number that
            # directly follows '/wydanie/' in the URL.
            next_url = page_url.replace(
                page_prefix + str(page_no), page_prefix + str(page_no + 1))
            if next_url == page_url:
                # The URL does not follow the expected pattern; without this
                # check the same page would be fetched and appended forever.
                break
            page_url = next_url
            page_no += 1

        return feeds