# Source: calibre/recipes/gildia_pl.recipe (72 lines, 3.6 KiB, plaintext)

from calibre.web.feeds.news import BasicNewsRecipe
import re
class Gildia(BasicNewsRecipe):
    """Recipe for Gildia.pl, built from the RSS feeds of its thematic sub-sites.

    Articles are reduced to the ``widetext`` div / ``post-<n>`` article
    element, ``<sup>`` tags are stripped, teaser pages are followed through
    to the full article (``skip_ad_pages``) and site-relative links are made
    absolute (``preprocess_html``).
    """

    title = u'Gildia.pl'
    __author__ = 'fenuks'
    description = u'Fantastyczny Portal Kulturalny - newsy, recenzje, galerie, wywiady. Literatura, film, gry komputerowe i planszowe, komiks, RPG, sklep. Nie lekceważ potęgi wyobraźni!'  # noqa
    category = 'culture'
    # An earlier assignment pointing at www.film.gildia.pl was overridden by
    # this one and has been dropped; this is the value that was in effect.
    cover_url = 'http://portal.gildia.pl/images/logo-main.png'
    language = 'pl'
    oldest_article = 8
    max_articles_per_feed = 100
    remove_empty_feeds = True
    no_stylesheets = True
    use_embedded_content = False
    ignore_duplicate_articles = {'title', 'url'}
    # Drop opening and closing <sup> tags while keeping their text.
    preprocess_regexps = [(re.compile(u'</?sup>'), lambda match: '')]
    remove_tags = [
        dict(name='div', attrs={'class': [
            'backlink',
            'im_img',
            'addthis_toolbox addthis_default_style',
            'banner-bottom',
        ]}),
    ]
    keep_only_tags = [
        dict(name='div', attrs={'class': 'widetext'}),
        dict(name='article', attrs={'id': re.compile(r'post-\d+')}),
    ]
    feeds = [
        (u'Gry', u'http://www.gry.gildia.pl/rss'),
        (u'Literatura', u'http://www.literatura.gildia.pl/rss'),
        (u'Film', u'http://www.film.gildia.pl/rss'),
        (u'Horror', u'http://www.horror.gildia.pl/rss'),
        (u'Konwenty', u'http://www.konwenty.gildia.pl/rss'),
        (u'Plansz\xf3wki', u'http://www.planszowki.gildia.pl/rss'),
        (u'Manga i anime', u'http://www.manga.gildia.pl/rss'),
        (u'Star Wars', u'http://www.starwars.gildia.pl/rss'),
        (u'Techno', u'http://www.techno.gildia.pl/rss'),
        (u'Historia', u'http://www.historia.gildia.pl/rss'),
        (u'Magia', u'http://www.magia.gildia.pl/rss'),
        (u'Bitewniaki', u'http://www.bitewniaki.gildia.pl/rss'),
        (u'RPG', u'http://www.rpg.gildia.pl/rss'),
        (u'LARP', u'http://www.larp.gildia.pl/rss'),
        (u'Muzyka', u'http://www.muzyka.gildia.pl/rss'),
        (u'Nauka', u'http://www.nauka.gildia.pl/rss'),
    ]

    def skip_ad_pages(self, soup):
        """Follow a teaser page through to the full article it links to.

        Only pages containing a ``div.news`` element are treated as teasers.
        Inside that element the first matching link is followed:

        1. for each keyword stem found in the page title, a link whose
           ``href`` or text contains that stem;
        2. otherwise a ``/publicystyka/`` link labelled "Więcej...".

        Returns the result of ``self.index_to_soup(href, raw=True)`` for the
        chosen link, or ``None`` when the page is not a teaser or no link
        matches.
        """
        content = soup.find('div', attrs={'class': 'news'})
        if content is None:
            return None

        # Polish word stems (review, preview, excerpt, report, interview,
        # nomination) so that inflected forms match as well.
        words = ('recenzj', 'zapowied', 'fragmen',
                 'relacj', 'wywiad', 'nominacj')
        title_tag = soup.title
        # A page without <title> simply has no keyword to match.
        if title_tag is None:
            document_title = u''
        else:
            document_title = title_tag.renderContents().decode('utf-8').lower()

        for word in words:
            if word not in document_title:
                continue
            for link in content.findAll(name='a'):
                # Anchors without href (e.g. named anchors) cannot be followed.
                href = link.get('href')
                if not href:
                    continue
                if word in href or (link.string and word in link.string):
                    return self.index_to_soup(href, raw=True)

        # Accept the label both as the raw entity and as decoded text, since
        # whether entities are decoded depends on the parser in use.
        more_labels = ('Wi&#281;cej...', u'Wi\u0119cej...')
        for tag in content.findAll(name='a', href=re.compile('/publicystyka/')):
            if tag.string in more_labels:
                return self.index_to_soup(tag['href'], raw=True)
        return None

    def preprocess_html(self, soup):
        """Prefix every non-``http`` link with the matching Gildia host.

        The host is chosen per link: ``/gry/`` paths go to the games
        sub-site; otherwise pages whose title mentions books or comics go to
        the literature sub-site; everything else goes to the main portal.
        Returns the same ``soup`` object, modified in place.
        """
        title_tag = soup.title
        if title_tag is None:
            title = u''
        else:
            title = title_tag.renderContents().decode('utf-8').lower()
        # The title is fixed for the page, so decide the fallback host once.
        if u'książk' in title or u'komiks' in title:
            default_host = 'http://www.literatura.gildia.pl'
        else:
            default_host = 'http://www.gildia.pl'

        for a in soup('a', href=True):
            href = a['href']
            if href.startswith('http'):
                continue
            # NOTE(review): hrefs such as 'mailto:...' or '#fragment' are
            # prefixed too, as in the original - confirm this is intended.
            if '/gry/' in href:
                a['href'] = 'http://www.gry.gildia.pl' + href
            else:
                a['href'] = default_host + href
        return soup