mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
72 lines
3.6 KiB
Plaintext
72 lines
3.6 KiB
Plaintext
from calibre.web.feeds.news import BasicNewsRecipe
|
|
import re
|
|
|
|
|
|
class Gildia(BasicNewsRecipe):
    """Calibre news recipe for the Gildia.pl portal (Polish-language).

    Articles come from the RSS feeds of the portal's thematic sub-sites
    listed in ``feeds``.  ``skip_ad_pages`` swaps a page containing a
    ``div.news`` block for a page it links to when the heuristics there
    match, and ``preprocess_html`` turns non-``http`` links into absolute
    URLs on the appropriate Gildia host.
    """

    title = u'Gildia.pl'
    __author__ = 'fenuks'
    description = u'Fantastyczny Portal Kulturalny - newsy, recenzje, galerie, wywiady. Literatura, film, gry komputerowe i planszowe, komiks, RPG, sklep. Nie lekceważ potęgi wyobraźni!'  # noqa
    category = 'culture'
    # An earlier assignment of a film.gildia.pl logo was immediately
    # overwritten by this one, so only the effective value is kept.
    cover_url = 'http://portal.gildia.pl/images/logo-main.png'
    language = 'pl'
    oldest_article = 8
    max_articles_per_feed = 100
    remove_empty_feeds = True
    no_stylesheets = True
    use_embedded_content = False
    ignore_duplicate_articles = {'title', 'url'}
    # Drop <sup> and </sup> tags from the downloaded markup (their text stays).
    preprocess_regexps = [(re.compile(u'</?sup>'), lambda match: '')]
    remove_tags = [dict(name='div', attrs={'class': [
        'backlink', 'im_img', 'addthis_toolbox addthis_default_style', 'banner-bottom']})]
    keep_only_tags = [dict(name='div', attrs={'class': 'widetext'}), dict(name='article', attrs={'id': re.compile(r'post-\d+')})]
    feeds = [(u'Gry', u'http://www.gry.gildia.pl/rss'),
             (u'Literatura', u'http://www.literatura.gildia.pl/rss'),
             (u'Film', u'http://www.film.gildia.pl/rss'),
             (u'Horror', u'http://www.horror.gildia.pl/rss'),
             (u'Konwenty', u'http://www.konwenty.gildia.pl/rss'),
             (u'Plansz\xf3wki', u'http://www.planszowki.gildia.pl/rss'),
             (u'Manga i anime', u'http://www.manga.gildia.pl/rss'),
             (u'Star Wars', u'http://www.starwars.gildia.pl/rss'),
             (u'Techno', u'http://www.techno.gildia.pl/rss'),
             (u'Historia', u'http://www.historia.gildia.pl/rss'),
             (u'Magia', u'http://www.magia.gildia.pl/rss'),
             (u'Bitewniaki', u'http://www.bitewniaki.gildia.pl/rss'),
             (u'RPG', u'http://www.rpg.gildia.pl/rss'),
             (u'LARP', u'http://www.larp.gildia.pl/rss'),
             (u'Muzyka', u'http://www.muzyka.gildia.pl/rss'),
             (u'Nauka', u'http://www.nauka.gildia.pl/rss'),
             ]

    @staticmethod
    def _lowercase_title(soup):
        """Return the contents of the page's <title> lower-cased, or '' if the page has none."""
        if soup.title is None:
            return ''
        return soup.title.renderContents().decode('utf-8').lower()

    def skip_ad_pages(self, soup):
        """Follow a link from a ``div.news`` page to the page it points at.

        :param soup: parsed page as handed over by the recipe framework.
        :return: the result of ``self.index_to_soup(href, raw=True)`` for the
            first matching link, or ``None`` when the page has no
            ``div.news`` block or no link matches.
        """
        content = soup.find('div', attrs={'class': 'news'})
        if content is None:
            return None

        # Polish word stems: review, announcement, excerpt, report,
        # interview, nomination.
        words = ('recenzj', 'zapowied', 'fragmen',
                 'relacj', 'wywiad', 'nominacj')
        document_title = self._lowercase_title(soup)
        title_words = [word for word in words if word in document_title]
        if title_words:
            # href=True skips anchors without an href, which would otherwise
            # raise KeyError on link['href'] below.
            links = content.findAll(name='a', href=True)
            for word in title_words:
                for link in links:
                    if word in link['href'] or (link.string and word in link.string):
                        return self.index_to_soup(link['href'], raw=True)

        # Fallback: a "Więcej..." ("More...") link into the /publicystyka/ section.
        for tag in content.findAll(name='a', href=re.compile('/publicystyka/')):
            if 'Więcej...' == tag.string:
                return self.index_to_soup(tag['href'], raw=True)
        return None

    def preprocess_html(self, soup):
        """Prefix every link that does not start with ``http`` with a Gildia host.

        The host is the games site when the href contains ``/gry/``, the
        literature site when the page title mentions books (``książk``) or
        comics (``komiks``), and the main portal otherwise.

        :param soup: parsed article page; modified in place.
        :return: the same ``soup`` object.
        """
        title = self._lowercase_title(soup)
        # The original second `elif u'komiks' in title` branch was unreachable
        # and used the same base URL, so a single test covers both cases.
        literature_page = u'książk' in title or u'komiks' in title
        for a in soup('a', href=True):
            href = a['href']
            if href.startswith('http'):
                continue
            if '/gry/' in href:
                base = 'http://www.gry.gildia.pl'
            elif literature_page:
                base = 'http://www.literatura.gildia.pl'
            else:
                base = 'http://www.gildia.pl'
            a['href'] = base + href
        return soup
|