# -*- coding: utf-8 -*-
from calibre.web.feeds.news import BasicNewsRecipe
import re


class Nowa_Fantastyka(BasicNewsRecipe):
    """Recipe that builds its feeds from section pages of www.fantastyka.pl.

    Articles are collected by scraping three section index pages (see
    ``parse_index``); logging in is optional and only attempted when both a
    username and a password are configured (see ``get_browser``).
    """

    title = u'Nowa Fantastyka'
    oldest_article = 7
    __author__ = 'fenuks'
    __modified_by__ = 'zaslav'
    language = 'pl'
    encoding = 'latin2'
    description = u'Strona dla miłośników fantastyki'
    category = 'fantasy'
    masthead_url = 'http://farm5.static.flickr.com/4133/4956658792_7ba7fbf562.jpg'
    # Styling '.tytul' through extra_css did not work, so the title style is
    # applied inline in preprocess_html() instead.
    max_articles_per_feed = 100
    # Site root; also used as the prefix for relative links.
    INDEX = 'http://www.fantastyka.pl/'
    no_stylesheets = True
    needs_subscription = 'optional'
    remove_tags_before = dict(attrs={'class': 'naglowek2'})
    remove_tags_after = dict(name='form', attrs={'name': 'form1'})
    remove_tags = [
        dict(attrs={'class': ['avatar2', 'belka-margin', 'naglowek2']}),
        dict(name='span', attrs={'class': 'alert-oceny'}),
        dict(name='img', attrs={'src': ['obrazki/sledz1.png',
                                        'obrazki/print.gif',
                                        'obrazki/mlnf.gif']}),
        dict(name='b', text='Dodaj komentarz'),
        dict(name='a', attrs={'href': 'http://www.fantastyka.pl/10,1727.html'}),
        dict(name='form'),
    ]

    # NOTE(review): the three patterns below were lost in this copy of the
    # file (only a dangling backslash survived, which is a syntax error).
    # They are reconstructed as "strip table layout markup", which matches
    # preprocess_html() turning <tr> into <div> -- confirm against the
    # upstream recipe.
    preprocess_regexps = [
        (re.compile(r'<table .*?>'), lambda match: ''),
        (re.compile(r'<td.*?>'), lambda match: ''),
        (re.compile(r'<center>'), lambda match: ''),
    ]

    def find_articles(self, url):
        """Return the article entries linked from one section index page.

        :param url: address of the section page to scrape.
        :return: list of dicts with ``title``, ``url``, ``date`` and
            ``description`` keys; empty when the expected container is not
            present on the page.
        """
        articles = []
        soup = self.index_to_soup(url)
        container = soup.find(attrs={'class': 'belka1-tlo-m'})
        if container is None:
            # Page layout changed or the section is empty: report no
            # articles instead of failing with AttributeError.
            return articles
        for link in container.findAll(name='a', attrs={'class': 'a-box'}):
            href = link.get('href')
            if not href:
                # An anchor without a target cannot be downloaded.
                continue
            # No date or description is extracted from the index page.
            articles.append({
                'title': link.string,
                'url': self.INDEX + href,
                'date': '',
                'description': '',
            })
        return articles

    def parse_index(self):
        """Return ``(section title, articles)`` pairs for the three sections."""
        sections = (
            (u"Opowiadania", 'http://www.fantastyka.pl/3.html'),
            (u"Publicystyka", 'http://www.fantastyka.pl/6.html'),
            (u"Hype Park", 'http://www.fantastyka.pl/9.html'),
        )
        return [(name, self.find_articles(page)) for name, page in sections]

    def get_cover_url(self):
        """Return the cover address taken from the e-kiosk.pl product page.

        The first ``<a class="img">`` link on the page is used. When no such
        link (or no ``href`` on it) is found, the previously set
        ``cover_url``, if any, is returned unchanged.
        """
        soup = self.index_to_soup('http://www.e-kiosk.pl/nowa_fantastyka')
        link = soup.find(name='a', attrs={'class': 'img'})
        href = link.get('href') if link is not None else None
        if not href:
            return getattr(self, 'cover_url', None)
        self.cover_url = 'http://www.e-kiosk.pl' + href
        return self.cover_url

    def get_browser(self):
        """Return the browser, logged in when credentials are configured.

        The login is performed by filling the ``login`` and ``pass`` fields
        of the first form on the site's front page and submitting it.
        """
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('http://www.fantastyka.pl/')
            br.select_form(nr=0)
            br['login'] = self.username
            br['pass'] = self.password
            br.submit()
        return br

    def preprocess_html(self, soup):
        """Clean up presentational markup and make relative links absolute.

        :param soup: parsed article page.
        :return: the same ``soup`` object, modified in place.
        """
        # Drop presentational attributes wherever they occur.
        for attr in ('style', 'font', 'align'):
            for item in soup.findAll(attrs={attr: True}):
                del item[attr]
        # Table rows become plain blocks.
        for row in soup.findAll(name='tr'):
            row.name = 'div'
        # Applied after the style attributes were removed above.
        title = soup.find(attrs={'class': 'tytul'})
        if title:
            title['style'] = 'font-size: 20px; font-weight: bold;'
        # href=True selects only anchors that carry the attribute; this
        # replaces Tag.has_key(), which newer BeautifulSoup no longer has.
        for a in soup.findAll('a', href=True):
            href = a['href']
            if 'http://' not in href and 'https://' not in href:
                a['href'] = self.INDEX + href
        return soup