calibre/recipes/focus_pl.recipe
2011-10-02 08:50:47 -06:00

67 lines
2.7 KiB
Plaintext

# -*- coding: utf-8 -*-
from calibre.web.feeds.news import BasicNewsRecipe
class Focus_pl(BasicNewsRecipe):
title = u'Focus.pl'
oldest_article = 15
max_articles_per_feed = 100
__author__ = 'fenuks'
language = 'pl'
description ='polish scientific monthly magazine'
category='magazine'
cover_url=''
remove_empty_feeds= True
no_stylesheets=True
remove_tags_before=dict(name='div', attrs={'class':'h2 h2f'})
remove_tags_after=dict(name='div', attrs={'class':'clear'})
feeds = [(u'Wszystkie kategorie', u'http://focus.pl.feedsportal.com/c/32992/f/532692/index.rss'),
(u'Nauka', u'http://focus.pl.feedsportal.com/c/32992/f/532693/index.rss'),
(u'Historia', u'http://focus.pl.feedsportal.com/c/32992/f/532694/index.rss'),
(u'Cywilizacja', u'http://focus.pl.feedsportal.com/c/32992/f/532695/index.rss'),
(u'Sport', u'http://focus.pl.feedsportal.com/c/32992/f/532696/index.rss'),
(u'Technika', u'http://focus.pl.feedsportal.com/c/32992/f/532697/index.rss'),
(u'Przyroda', u'http://focus.pl.feedsportal.com/c/32992/f/532698/index.rss'),
(u'Technologie', u'http://focus.pl.feedsportal.com/c/32992/f/532699/index.rss'),
(u'Warto wiedzieć', u'http://focus.pl.feedsportal.com/c/32992/f/532700/index.rss'),
]
def skip_ad_pages(self, soup):
tag=soup.find(name='a')
if tag:
new_soup=self.index_to_soup(tag['href']+ 'do-druku/1/', raw=True)
return new_soup
def append_page(self, appendtag):
tag=appendtag.find(name='div', attrs={'class':'arrows'})
if tag:
nexturl='http://www.focus.pl/'+tag.a['href']
for rem in appendtag.findAll(name='div', attrs={'class':'klik-nav'}):
rem.extract()
while nexturl:
soup2=self.index_to_soup(nexturl)
nexturl=None
pagetext=soup2.find(name='div', attrs={'class':'txt'})
tag=pagetext.find(name='div', attrs={'class':'arrows'})
for r in tag.findAll(name='a'):
if u'Następne' in r.string:
nexturl='http://www.focus.pl/'+r['href']
for rem in pagetext.findAll(name='div', attrs={'class':'klik-nav'}):
rem.extract()
pos = len(appendtag.contents)
appendtag.insert(pos, pagetext)
def get_cover_url(self):
soup=self.index_to_soup('http://www.focus.pl/magazyn/')
tag=soup.find(name='div', attrs={'class':'clr fl'})
if tag:
self.cover_url='http://www.focus.pl/' + tag.a['href']
return getattr(self, 'cover_url', self.cover_url)
def preprocess_html(self, soup):
self.append_page(soup.body)
return soup