import re

from calibre.web.feeds.news import BasicNewsRecipe


class Gildia(BasicNewsRecipe):
    """Calibre news recipe for the Gildia.pl family of sites.

    Articles are collected from the per-section RSS feeds listed in
    ``feeds``.  Feed entries that lead to a short news teaser are followed
    through to the full article by ``skip_ad_pages``, and site-relative
    links are made absolute by ``preprocess_html``.
    """

    title = u'Gildia.pl'
    __author__ = 'fenuks'
    description = u'Fantastyczny Portal Kulturalny - newsy, recenzje, galerie, wywiady. Literatura, film, gry komputerowe i planszowe, komiks, RPG, sklep. Nie lekceważ potęgi wyobraźni!'  # noqa
    category = 'culture'
    cover_url = 'http://portal.gildia.pl/images/logo-main.png'
    language = 'pl'
    oldest_article = 8
    max_articles_per_feed = 100
    remove_empty_feeds = True
    no_stylesheets = True
    use_embedded_content = False
    ignore_duplicate_articles = {'title', 'url'}
    # NOTE(review): the pattern is empty, so this substitution is a no-op as
    # written; the intended pattern may have been lost -- confirm upstream.
    preprocess_regexps = [(re.compile(u''), lambda match: '')]
    remove_tags = [
        dict(name='div', attrs={'class': [
            'backlink',
            'im_img',
            'addthis_toolbox addthis_default_style',
            'banner-bottom',
        ]}),
    ]
    keep_only_tags = [
        dict(name='div', attrs={'class': 'widetext'}),
        dict(name='article', attrs={'id': re.compile(r'post-\d+')}),
    ]
    feeds = [
        (u'Gry', u'http://www.gry.gildia.pl/rss'),
        (u'Literatura', u'http://www.literatura.gildia.pl/rss'),
        (u'Film', u'http://www.film.gildia.pl/rss'),
        (u'Horror', u'http://www.horror.gildia.pl/rss'),
        (u'Konwenty', u'http://www.konwenty.gildia.pl/rss'),
        (u'Plansz\xf3wki', u'http://www.planszowki.gildia.pl/rss'),
        (u'Manga i anime', u'http://www.manga.gildia.pl/rss'),
        (u'Star Wars', u'http://www.starwars.gildia.pl/rss'),
        (u'Techno', u'http://www.techno.gildia.pl/rss'),
        (u'Historia', u'http://www.historia.gildia.pl/rss'),
        (u'Magia', u'http://www.magia.gildia.pl/rss'),
        (u'Bitewniaki', u'http://www.bitewniaki.gildia.pl/rss'),
        (u'RPG', u'http://www.rpg.gildia.pl/rss'),
        (u'LARP', u'http://www.larp.gildia.pl/rss'),
        (u'Muzyka', u'http://www.muzyka.gildia.pl/rss'),
        (u'Nauka', u'http://www.nauka.gildia.pl/rss'),
    ]

    @staticmethod
    def _lowercase_title(soup):
        """Return the page's <title> text lower-cased, or '' if it has none."""
        if soup.title is None:
            return u''
        return soup.title.renderContents().decode('utf-8').lower()

    def skip_ad_pages(self, soup):
        """Follow a news teaser page through to the full article.

        Looks inside the ``div.news`` container.  If the page title contains
        one of the known keyword stems, the first link in that container
        whose href or text contains the same stem is fetched.  Failing that,
        a '/publicystyka/' link whose text is exactly 'Więcej...' is fetched.

        Returns the raw content of the fetched page, or None when the page
        has no ``div.news`` container or no suitable link was found.
        """
        content = soup.find('div', attrs={'class': 'news'})
        if content is None:
            return None

        words = ('recenzj', 'zapowied', 'fragmen',
                 'relacj', 'wywiad', 'nominacj')
        document_title = self._lowercase_title(soup)
        for word in words:
            if word not in document_title:
                continue
            # href=True skips anchors without an href attribute, which
            # would otherwise raise KeyError on link['href'].
            for link in content.findAll(name='a', href=True):
                if word in link['href'] or (link.string and word in link.string):
                    return self.index_to_soup(link['href'], raw=True)

        for tag in content.findAll(name='a', href=re.compile('/publicystyka/')):
            if 'Więcej...' == tag.string:
                return self.index_to_soup(tag['href'], raw=True)
        return None

    def preprocess_html(self, soup):
        """Prefix every href that does not start with 'http' with a site host.

        The host is chosen from the link itself ('/gry/' in the href) or,
        failing that, from the page title (book or comics pages go to the
        literature site); everything else goes to the main portal.

        Returns the same soup object, modified in place.
        """
        title = self._lowercase_title(soup)
        if u'książk' in title or u'komiks' in title:
            title_host = 'http://www.literatura.gildia.pl'
        else:
            title_host = 'http://www.gildia.pl'

        for a in soup('a', href=True):
            href = a['href']
            if href.startswith('http'):
                continue
            if '/gry/' in href:
                a['href'] = 'http://www.gry.gildia.pl' + href
            else:
                a['href'] = title_host + href
        return soup