# -*- coding: utf-8 -*-

from calibre.web.feeds.news import BasicNewsRecipe


class Focus_pl(BasicNewsRecipe):
    """Calibre news recipe for Focus.pl, a Polish scientific monthly magazine.

    Articles are taken from the feedsportal RSS feeds listed in ``feeds``.
    Each article is swapped for its ``do-druku/1/`` variant and, when the
    page carries pagination arrows, the following pages are appended to it.
    """

    title = u'Focus.pl'
    oldest_article = 15
    max_articles_per_feed = 100
    __author__ = 'fenuks'
    language = 'pl'
    description = 'polish scientific monthly magazine'
    category = 'magazine'
    # Fallback returned by get_cover_url() when no cover link is found.
    cover_url = ''
    remove_empty_feeds = True
    no_stylesheets = True
    remove_tags_before = dict(name='div', attrs={'class': 'h2 h2f'})
    remove_tags_after = dict(name='div', attrs={'class': 'clear'})
    feeds = [
        (u'Wszystkie kategorie', u'http://focus.pl.feedsportal.com/c/32992/f/532692/index.rss'),
        (u'Nauka', u'http://focus.pl.feedsportal.com/c/32992/f/532693/index.rss'),
        (u'Historia', u'http://focus.pl.feedsportal.com/c/32992/f/532694/index.rss'),
        (u'Cywilizacja', u'http://focus.pl.feedsportal.com/c/32992/f/532695/index.rss'),
        (u'Sport', u'http://focus.pl.feedsportal.com/c/32992/f/532696/index.rss'),
        (u'Technika', u'http://focus.pl.feedsportal.com/c/32992/f/532697/index.rss'),
        (u'Przyroda', u'http://focus.pl.feedsportal.com/c/32992/f/532698/index.rss'),
        (u'Technologie', u'http://focus.pl.feedsportal.com/c/32992/f/532699/index.rss'),
        (u'Warto wiedzieć', u'http://focus.pl.feedsportal.com/c/32992/f/532700/index.rss'),
    ]

    # Prefix prepended to the hrefs scraped from the site.
    _SITE_ROOT = 'http://www.focus.pl/'

    def skip_ad_pages(self, soup):
        """Fetch the ``do-druku/1/`` variant of the article linked from *soup*.

        The first ``<a>`` element of *soup* is used; ``do-druku/1/`` is
        appended to its ``href`` and the result is fetched with
        ``index_to_soup(..., raw=True)``.
        NOTE(review): presumably *soup* is the feedsportal interstitial page
        and the first anchor points at the real article - confirm.

        Returns:
            Whatever ``index_to_soup`` returns for that URL, or ``None`` when
            *soup* has no anchor or the anchor has no usable ``href``.
        """
        link = soup.find(name='a')
        if link is None:
            return None
        href = link.get('href')
        if not href:
            # An anchor without a target cannot be followed.
            return None
        return self.index_to_soup(href + 'do-druku/1/', raw=True)

    def append_page(self, appendtag):
        """Append the text of all follow-up pages to *appendtag* in place.

        Does nothing unless *appendtag* contains a ``div.arrows`` element.
        Otherwise every ``div.klik-nav`` is removed from *appendtag*, and
        pages are fetched one after another: each page's ``div.txt`` is
        inserted at the end of *appendtag*, and the next URL is taken from
        the anchor inside that page's ``div.arrows`` whose text contains
        ``Następne``.

        The walk stops when a page has no ``div.txt``, when no next link is
        found, or when a URL comes up a second time.
        """
        arrows = appendtag.find(name='div', attrs={'class': 'arrows'})
        if arrows is None:
            return

        first_link = arrows.a
        href = first_link.get('href') if first_link is not None else None
        nexturl = self._SITE_ROOT + href if href else None
        self._strip_click_nav(appendtag)

        # Remember fetched URLs so that cyclic pagination cannot loop forever.
        visited = set()
        while nexturl and nexturl not in visited:
            visited.add(nexturl)
            page = self.index_to_soup(nexturl)
            nexturl = None

            pagetext = page.find(name='div', attrs={'class': 'txt'})
            if pagetext is None:
                # Nothing to append and nowhere to look for a next link.
                break

            arrows = pagetext.find(name='div', attrs={'class': 'arrows'})
            if arrows is not None:
                for anchor in arrows.findAll(name='a'):
                    # .string is None when the anchor has no single text child.
                    label = anchor.string
                    href = anchor.get('href')
                    if label and href and u'Następne' in label:
                        # No break: as before, the last matching anchor wins.
                        nexturl = self._SITE_ROOT + href

            self._strip_click_nav(pagetext)
            appendtag.insert(len(appendtag.contents), pagetext)

    def _strip_click_nav(self, node):
        """Remove every ``div.klik-nav`` element found under *node*."""
        for nav in node.findAll(name='div', attrs={'class': 'klik-nav'}):
            nav.extract()

    def get_cover_url(self):
        """Look up the cover link on the ``magazyn/`` page.

        When the page has a ``div`` with class ``clr fl`` whose first anchor
        has an ``href``, ``self.cover_url`` is set to the site root plus that
        ``href``.

        Returns:
            ``self.cover_url`` - the freshly stored URL, or the previous
            value (the class default is ``''``) when no link was found.
        """
        soup = self.index_to_soup(self._SITE_ROOT + 'magazyn/')
        holder = soup.find(name='div', attrs={'class': 'clr fl'})
        link = holder.a if holder is not None else None
        href = link.get('href') if link is not None else None
        if href:
            self.cover_url = self._SITE_ROOT + href
        return self.cover_url

    def preprocess_html(self, soup):
        """Run ``append_page`` on the body of *soup* and return *soup*.

        A document without a ``<body>`` is returned unchanged.
        """
        body = soup.body
        if body is not None:
            self.append_page(body)
        return soup