# calibre/recipes/dzieje_pl.recipe

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Comment


class Dzieje(BasicNewsRecipe):
    """Recipe that builds its feeds by scraping dzieje.pl section pages.

    The feeds are assembled in ``parse_index`` from a fixed list of section
    URLs; each downloaded article is post-processed in ``preprocess_html``,
    which makes links absolute and stitches paginated articles together.
    """

    title = u'dzieje.pl'
    __author__ = 'fenuks'
    description = 'Dzieje.pl - najlepszy portal informacyjno-edukacyjny dotyczący historii Polski XX wieku. Archiwalne fotografie, filmy, katalog postaci, quizy i konkursy.'  # noqa
    cover_url = 'http://www.dzieje.pl/sites/default/files/dzieje_logo.png'
    category = 'history'
    language = 'pl'
    ignore_duplicate_articles = {'title', 'url'}
    extra_css = '.imagecache-default {float:left; margin-right:20px;}'
    # Site root; prefixed to every site-relative link found while scraping.
    index = 'http://dzieje.pl'
    oldest_article = 8
    max_articles_per_feed = 100
    remove_javascript = True
    no_stylesheets = True
    keep_only_tags = [
        dict(name='h1', attrs={'class': 'title'}), dict(id='content-area')]
    remove_tags = [dict(attrs={'class': 'field field-type-computed field-field-tagi'}),
                   dict(id='dogory'), dict(name='blockquote')]

    def _absolute_url(self, href):
        """Return *href* unchanged if it starts with 'http', else prefix ``self.index``."""
        if href.startswith('http'):
            return href
        return self.index + href

    def append_page(self, soup, appendtag):
        """Append the remaining pages of a paginated article to *appendtag*.

        Follows the ``li.pager-next`` link chain starting from *appendtag*,
        downloads each page, strips the image/field groups listed below from
        it and inserts its content block at the end of *appendtag*. When the
        article had a pager, the pager lists, the tag field and all HTML
        comments are removed from *appendtag* afterwards.

        :param soup: parsed article document (kept for interface
            compatibility; not used directly).
        :param appendtag: tag of the first page that receives the content of
            the following pages.
        """
        next_item = appendtag.find('li', attrs={'class': 'pager-next'})
        if not next_item:
            return

        unwanted_on_page = [
            'fieldgroup group-groupkul', 'fieldgroup group-zdjeciekult',
            'fieldgroup group-zdjecieciekaw', 'fieldgroup group-zdjecieksiazka',
            'fieldgroup group-zdjeciedu',
            'field field-type-filefield field-field-zdjecieglownawyd']
        visited = set()
        while next_item:
            link = next_item.a
            href = link.get('href') if link is not None else None
            if not href:
                break
            url = self._absolute_url(href)
            if url in visited:
                # A pager pointing back at a page we already fetched would
                # otherwise loop forever.
                break
            visited.add(url)

            soup2 = self.index_to_soup(url)
            content_area = soup2.find(id='content-area')
            pagetext = None
            if content_area is not None:
                pagetext = content_area.find(attrs={'class': 'content'})
            if pagetext is None:
                break

            for junk in pagetext.findAll(attrs={'class': unwanted_on_page}):
                junk.extract()

            # Look up the next pager link *before* moving pagetext out of
            # soup2: if the pager lives inside the content block it would no
            # longer be reachable from soup2 after the insert.
            next_item = soup2.find('li', attrs={'class': 'pager-next'})
            appendtag.insert(len(appendtag.contents), pagetext)

        # Drop pager lists and the tag field (including those brought in with
        # the appended pages), then strip HTML comments.
        for junk in appendtag.findAll(attrs={'class': ['item-list', 'field field-type-computed field-field-tagi', ]}):
            junk.extract()
        comments = appendtag.findAll(
            text=lambda text: isinstance(text, Comment))
        for comment in comments:
            comment.extract()

    def find_articles(self, url):
        """Scrape one section listing page and return its article entries.

        :param url: address of the section listing page.
        :return: list of dicts with the keys ``title``, ``url``, ``date`` and
            ``description`` (``date`` and ``description`` are always empty
            strings). An empty list is returned when the expected listing
            container is not present on the page.
        """
        articles = []
        soup = self.index_to_soup(url)

        # The teasers are expected as direct <div> children of
        # #content-area > div > div.
        container = soup.find(id='content-area')
        for _ in range(2):
            if container is None:
                break
            container = container.div
        if container is None:
            self.log.warning('dzieje.pl: no article listing found at', url)
            return articles

        for teaser in container.findAll('div', recursive=False):
            title_field = teaser.find(attrs={'class': 'views-field-title'})
            link = None
            if title_field is not None and title_field.span is not None:
                link = title_field.span.a
            if link is None or not link.get('href'):
                # Not an article teaser (or its markup changed); skip it
                # instead of aborting the whole feed.
                continue
            articles.append({'title': link.string,
                             'url': self._absolute_url(link['href']),
                             'date': '',
                             'description': ''
                             })
        return articles

    def parse_index(self):
        """Return the feed list as ``(feed title, articles)`` tuples.

        Each section page is scraped with ``find_articles``; the feeds are
        returned in the order listed below.
        """
        sections = (
            (u"Wiadomości", 'http://dzieje.pl/wiadomosci'),
            (u"Kultura i sztuka", 'http://dzieje.pl/kulturaisztuka'),
            (u"Film", 'http://dzieje.pl/kino'),
            # NOTE(review): this URL contains non-ASCII characters, unlike
            # the others - confirm it is the address the site really uses.
            (u"Rozmaitości historyczne", 'http://dzieje.pl/rozmaitości'),
            (u"Książka", 'http://dzieje.pl/ksiazka'),
            (u"Wystawa", 'http://dzieje.pl/wystawa'),
            (u"Edukacja", 'http://dzieje.pl/edukacja'),
            (u"Dzieje się", 'http://dzieje.pl/wydarzenia'),
        )
        return [(name, self.find_articles(section_url))
                for name, section_url in sections]

    def preprocess_html(self, soup):
        """Make every link absolute and merge in the article's further pages.

        :param soup: parsed article document.
        :return: the same *soup* object, modified in place.
        """
        for a in soup('a', href=True):
            a['href'] = self._absolute_url(a['href'])
        self.append_page(soup, soup.body)
        return soup