# calibre/recipes/focus_pl.recipe
# -*- coding: utf-8 -*-
from calibre.web.feeds.news import BasicNewsRecipe

class Focus_pl(BasicNewsRecipe):
    """Calibre recipe that downloads articles from the Focus.pl RSS feeds.

    Articles are fetched through ``skip_ad_pages`` (which follows the first
    link of the fetched page to its ``do-druku/1/`` variant), multi-page
    articles are stitched together by ``append_page``, and the cover is looked
    up on the ``/magazyn/`` page.
    """

    title = u'Focus.pl'
    oldest_article = 15
    max_articles_per_feed = 100
    __author__ = 'fenuks'
    language = 'pl'
    description = 'polish scientific monthly magazine'
    category = 'magazine'
    cover_url = ''
    remove_empty_feeds = True
    no_stylesheets = True
    remove_tags_before = dict(name='div', attrs={'class': 'h2 h2f'})
    remove_tags_after = dict(name='div', attrs={'class': 'clear'})

    # Prefix prepended to the hrefs found on focus.pl pages.
    BASE_URL = 'http://www.focus.pl/'

    feeds = [
        (u'Wszystkie kategorie', u'http://focus.pl.feedsportal.com/c/32992/f/532692/index.rss'),
        (u'Nauka', u'http://focus.pl.feedsportal.com/c/32992/f/532693/index.rss'),
        (u'Historia', u'http://focus.pl.feedsportal.com/c/32992/f/532694/index.rss'),
        (u'Cywilizacja', u'http://focus.pl.feedsportal.com/c/32992/f/532695/index.rss'),
        (u'Sport', u'http://focus.pl.feedsportal.com/c/32992/f/532696/index.rss'),
        (u'Technika', u'http://focus.pl.feedsportal.com/c/32992/f/532697/index.rss'),
        (u'Przyroda', u'http://focus.pl.feedsportal.com/c/32992/f/532698/index.rss'),
        (u'Technologie', u'http://focus.pl.feedsportal.com/c/32992/f/532699/index.rss'),
        (u'Warto wiedzieć', u'http://focus.pl.feedsportal.com/c/32992/f/532700/index.rss'),
    ]

    def skip_ad_pages(self, soup):
        """Follow the first link of ``soup`` to its ``do-druku/1/`` page.

        :param soup: parsed page as handed over by calibre.
        :return: the result of ``index_to_soup(..., raw=True)`` for the
            first link's href with ``do-druku/1/`` appended, or ``None`` when
            the page has no link with an ``href``.
        """
        # NOTE(review): presumably the fetched page is a feedsportal
        # interstitial whose first link is the real article, and
        # "do-druku" is its print view -- confirm against the live site.
        first_link = soup.find(name='a')
        href = first_link.get('href') if first_link is not None else None
        if not href:
            return None
        return self.index_to_soup(href + 'do-druku/1/', raw=True)

    def append_page(self, appendtag):
        """Append the text of all follow-up pages of an article to ``appendtag``.

        The first follow-up URL is taken from the first link inside the
        ``div.arrows`` element of ``appendtag``; every further one from the
        link labelled ``Następne`` inside ``div.arrows`` of the page just
        fetched. ``div.klik-nav`` elements are removed from ``appendtag`` and
        from every appended page.

        :param appendtag: tag (normally ``<body>``) that receives the extra
            pages; modified in place. ``None`` is tolerated and ignored.
        """
        if appendtag is None:
            return
        arrows = appendtag.find(name='div', attrs={'class': 'arrows'})
        if arrows is None:
            return

        first_link = arrows.a
        first_href = first_link.get('href') if first_link is not None else None
        next_url = self.BASE_URL + first_href if first_href else None

        for nav in appendtag.findAll(name='div', attrs={'class': 'klik-nav'}):
            nav.extract()

        # Remember fetched URLs so a "next" link that points back to an
        # already-downloaded page cannot keep the loop running forever.
        visited = set()
        while next_url and next_url not in visited:
            visited.add(next_url)
            page = self.index_to_soup(next_url)
            next_url = None

            page_text = page.find(name='div', attrs={'class': 'txt'})
            if page_text is None:
                # No article text on this page; nothing more to stitch.
                break

            page_arrows = page_text.find(name='div', attrs={'class': 'arrows'})
            if page_arrows is not None:
                for link in page_arrows.findAll(name='a'):
                    # .string is None for links with nested markup.
                    label = link.string
                    href = link.get('href')
                    if label and href and u'Następne' in label:
                        next_url = self.BASE_URL + href

            for nav in page_text.findAll(name='div', attrs={'class': 'klik-nav'}):
                nav.extract()

            appendtag.insert(len(appendtag.contents), page_text)

    def get_cover_url(self):
        """Look up the cover URL on the ``/magazyn/`` page.

        The href of the first link inside the ``div`` with class ``clr fl``
        is prefixed with the site URL and stored in ``self.cover_url``.

        :return: the cover URL, or ``None`` when it could not be found.
        """
        soup = self.index_to_soup('http://www.focus.pl/magazyn/')
        holder = soup.find(name='div', attrs={'class': 'clr fl'})
        link = holder.a if holder is not None else None
        href = link.get('href') if link is not None else None
        if not href:
            return None
        self.cover_url = self.BASE_URL + href
        return self.cover_url

    def preprocess_html(self, soup):
        """Merge the remaining pages of a multi-page article into ``soup``.

        :param soup: parsed article page.
        :return: the same ``soup`` object, with follow-up pages appended to
            its ``<body>``.
        """
        self.append_page(soup.body)
        return soup