Various Polish news sources by fenuks

Kovid Goyal 2012-02-20 11:11:32 +05:30
parent 35d15d0eb5
commit ac2cc2834c
32 changed files with 548 additions and 0 deletions

@@ -0,0 +1,48 @@
from calibre.web.feeds.news import BasicNewsRecipe
import re


class Ciekawostki_Historyczne(BasicNewsRecipe):
    title = u'Ciekawostki Historyczne'
    oldest_article = 7
    __author__ = 'fenuks'
    description = u'Serwis popularnonaukowy - odkrycia, kontrowersje, historia, ciekawostki, badania, ciekawostki z przeszłości.'
    category = 'history'
    language = 'pl'
    masthead_url = 'http://ciekawostkihistoryczne.pl/wp-content/themes/Wordpress_Magazine/images/logo-ciekawostki-historyczne-male.jpg'
    cover_url = 'http://ciekawostkihistoryczne.pl/wp-content/themes/Wordpress_Magazine/images/logo-ciekawostki-historyczne-male.jpg'
    max_articles_per_feed = 100
    # Strip the multi-page notice and the "Zobacz też:" link list from the raw HTML
    preprocess_regexps = [(re.compile(ur'Ten artykuł ma kilka stron.*?</fb:like>', re.DOTALL), lambda match: ''),
                          (re.compile(ur'<h2>Zobacz też:</h2>.*?</ol>', re.DOTALL), lambda match: '')]
    no_stylesheets = True
    remove_empty_feeds = True
    keep_only_tags = [dict(name='div', attrs={'class':'post'})]
    remove_tags = [dict(id='singlepostinfo')]
    feeds = [(u'Staro\u017cytno\u015b\u0107', u'http://ciekawostkihistoryczne.pl/tag/starozytnosc/feed/'),
             (u'\u015aredniowiecze', u'http://ciekawostkihistoryczne.pl/tag/sredniowiecze/feed/'),
             (u'Nowo\u017cytno\u015b\u0107', u'http://ciekawostkihistoryczne.pl/tag/nowozytnosc/feed/'),
             (u'XIX wiek', u'http://ciekawostkihistoryczne.pl/tag/xix-wiek/feed/'),
             (u'1914-1939', u'http://ciekawostkihistoryczne.pl/tag/1914-1939/feed/'),
             (u'1939-1945', u'http://ciekawostkihistoryczne.pl/tag/1939-1945/feed/'),
             (u'Powojnie (od 1945)', u'http://ciekawostkihistoryczne.pl/tag/powojnie/feed/'),
             (u'Recenzje', u'http://ciekawostkihistoryczne.pl/category/recenzje/feed/')]

    def append_page(self, soup, appendtag):
        # Page links of multi-page articles sit under an <h7> tag (or in the
        # <p> that follows it); fetch every linked page, strip its chrome and
        # graft the remaining body onto the first page.
        tag = soup.find(name='h7')
        if tag:
            if tag.br:
                pass
            elif tag.nextSibling.name == 'p':
                tag = tag.nextSibling
            nexturl = tag.findAll('a')
            for nextpage in nexturl:
                tag.extract()
                nextpage = nextpage['href']
                soup2 = self.index_to_soup(nextpage)
                pagetext = soup2.find(name='div', attrs={'class':'post'})
                for r in pagetext.findAll('div', attrs={'id':'singlepostinfo'}):
                    r.extract()
                for r in pagetext.findAll('div', attrs={'class':'wp-caption alignright'}):
                    r.extract()
                for r in pagetext.findAll('h1'):
                    r.extract()
                pagetext.find('h6').nextSibling.extract()
                pagetext.find('h7').nextSibling.extract()
                pos = len(appendtag.contents)
                appendtag.insert(pos, pagetext)

    def preprocess_html(self, soup):
        self.append_page(soup, soup.body)
        return soup
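All the multi-page recipes in this commit share one stitching pattern: locate the site's "next page" control, fetch each page with index_to_soup, strip the per-page chrome, and append what remains to the first page's body. A generic sketch of the pattern (not part of the commit; fetch, next_selector and body_selector are hypothetical stand-ins, and it uses bs4 instead of the BeautifulSoup 3 bundled with calibre):

from bs4 import BeautifulSoup

def stitch_pages(fetch, soup, next_selector, body_selector):
    # Follow "next page" links, appending each page's article body to the
    # first page's soup; the seen set guards against pager loops.
    seen = set()
    link = soup.select_one(next_selector)
    while link is not None and link.get('href') and link['href'] not in seen:
        seen.add(link['href'])
        page = BeautifulSoup(fetch(link['href']), 'html.parser')
        body = page.select_one(body_selector)
        if body is None:
            break
        soup.body.append(body)  # same effect as appendtag.insert(pos, pagetext)
        link = page.select_one(next_selector)
    return soup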

@@ -0,0 +1,21 @@
from calibre.web.feeds.news import BasicNewsRecipe


class Gameplay_pl(BasicNewsRecipe):
    title = u'Gameplay.pl'
    oldest_article = 7
    __author__ = 'fenuks'
    description = u'gameplay.pl - serwis o naszych zainteresowaniach, grach, filmach, książkach, muzyce, fotografii i konsolach.'
    category = 'games, movies, books, music'
    language = 'pl'
    masthead_url = 'http://gameplay.pl/img/gpy_top_logo.png'
    cover_url = 'http://gameplay.pl/img/gpy_top_logo.png'
    max_articles_per_feed = 100
    no_stylesheets = True
    keep_only_tags = [dict(name='div', attrs={'class':['news_endpage_tit', 'news']})]
    remove_tags = [dict(name='div', attrs={'class':['galeria', 'noedit center im']})]
    feeds = [(u'Wiadomo\u015bci', u'http://gameplay.pl/rss/')]

    def image_url_processor(self, baseurl, url):
        # Image URLs without a scheme lose their first two characters (the
        # leading '//') and get the site root prepended.
        if 'http' not in url:
            return 'http://gameplay.pl' + url[2:]
        return url
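image_url_processor above absolutizes image URLs that arrive without a scheme. A standalone sketch of the same arithmetic (not part of the commit; the exact shape of the site's relative URLs is not visible in this diff, so treat it as a mirror of the code, not a spec):

def fix_image_url(url):
    # Mirrors Gameplay_pl.image_url_processor: a URL without a scheme loses
    # its first two characters (a leading '//') and gains the site root.
    if 'http' not in url:
        return 'http://gameplay.pl' + url[2:]
    return url

# Absolute URLs pass through untouched:
assert fix_image_url('http://gameplay.pl/img/gpy_top_logo.png') == 'http://gameplay.pl/img/gpy_top_logo.png'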

BIN Normal file (icon, filename not shown, 994 B)

BIN Normal file (icon, filename not shown, 991 B)

BIN recipes/icons/in4_pl.png Normal file (357 B)

BIN Normal file (icon, filename not shown, 808 B)

BIN recipes/icons/kresy_pl.png Normal file (4.0 KiB)

BIN recipes/icons/oclab_pl.png Normal file (881 B)

BIN Normal file (icon, filename not shown, 817 B)

BIN Normal file (icon, filename not shown, 366 B)

BIN recipes/icons/pc_arena.png Normal file (1.1 KiB)

BIN Normal file (icon, filename not shown, 2.8 KiB)

BIN recipes/icons/pc_foster.png Normal file (694 B)

BIN Normal file (icon, filename not shown, 322 B)

BIN recipes/icons/pure_pc.png Normal file (386 B)

BIN recipes/icons/tanuki.png Normal file (1017 B)

BIN recipes/icons/tvn24.png Normal file (5.1 KiB)

BIN Normal file (icon, filename not shown, 1.4 KiB)

recipes/in4_pl.recipe Normal file (44 lines)

@@ -0,0 +1,44 @@
from calibre.web.feeds.news import BasicNewsRecipe
import re


class in4(BasicNewsRecipe):
    title = u'IN4.pl'
    oldest_article = 7
    max_articles_per_feed = 100
    __author__ = 'fenuks'
    description = u'Serwis Informacyjny - Aktualnosci, recenzje'
    category = 'IT'
    language = 'pl'
    #cover_url = 'http://www.in4.pl/recenzje/337/in4pl.jpg'
    no_stylesheets = True
    remove_empty_feeds = True
    # Drop the "translate into..." links injected into articles
    preprocess_regexps = [(re.compile(ur'<a title="translate into.*?</a>', re.DOTALL), lambda match: '')]
    keep_only_tags = [dict(name='div', attrs={'class':'left_alone'})]
    remove_tags_after = dict(name='img', attrs={'title':'komentarze'})
    remove_tags = [dict(name='img', attrs={'title':'komentarze'})]
    feeds = [(u'Wiadomo\u015bci', u'http://www.in4.pl/rss.php'),
             (u'Recenzje', u'http://www.in4.pl/rss_recenzje.php'),
             (u'Mini recenzje', u'http://www.in4.pl/rss_mini.php')]

    def append_page(self, soup, appendtag):
        # Follow the 'następna str' (next page) link until it disappears,
        # appending each page's #news block to the first page.
        a = soup.findAll('a')
        nexturl = None
        for i in a:
            if i.string and u'następna str' in i.string:
                nexturl = 'http://www.in4.pl/' + i['href']
                i.extract()
        while nexturl:
            soup2 = self.index_to_soup(nexturl)
            pagetext = soup2.find(id='news')
            pos = len(appendtag.contents)
            appendtag.insert(pos, pagetext)
            nexturl = None
            tag = soup2.findAll('a')
            for z in tag:
                if z.string and u'następna str' in z.string:
                    nexturl = 'http://www.in4.pl/' + z['href']
                    break

    def preprocess_html(self, soup):
        self.append_page(soup, soup.body)
        return soup

@@ -0,0 +1,18 @@
from calibre.web.feeds.news import BasicNewsRecipe
import re


class Informacje_USA(BasicNewsRecipe):
    title = u'Informacje USA'
    oldest_article = 7
    max_articles_per_feed = 100
    __author__ = 'fenuks'
    description = u'portal wiadomości amerykańskich'
    category = 'news'
    language = 'pl'
    masthead_url = 'http://www.informacjeusa.com/wp-content/add_images/top_logo_5_2010.jpg'
    cover_url = 'http://www.informacjeusa.com/wp-content/add_images/top_logo_5_2010.jpg'
    no_stylesheets = True
    # Strip the "Zobacz:" / "Zobacz także:" / "Zobacz też:" cross-link paragraphs
    preprocess_regexps = [(re.compile(ur'<p>Zobacz:.*?</p>', re.DOTALL), lambda match: ''),
                          (re.compile(ur'<p><a href=".*?Zobacz także:.*?</a></p>', re.DOTALL), lambda match: ''),
                          (re.compile(ur'<p><p>Zobacz też:.*?</a></p>', re.DOTALL), lambda match: '')]
    keep_only_tags = [dict(name='div', attrs={'class':'box box-single'})]
    remove_tags_after = dict(attrs={'class':'tags'})
    remove_tags = [dict(attrs={'class':['postmetadata', 'tags', 'banner']}),
                   dict(name='a', attrs={'title':['Drukuj', u'Wyślij']})]
    feeds = [(u'Informacje', u'http://www.informacjeusa.com/feed/')]
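calibre applies each preprocess_regexps pair to the raw page HTML with pattern.sub before parsing, so entries like the ones above delete whole cross-link paragraphs. A tiny standalone demo of the first entry (not part of the commit; the sample HTML is made up):

import re

html = u'<p>Tekst.</p><p>Zobacz: <a href="#">inny artykuł</a></p>'
pattern = re.compile(u'<p>Zobacz:.*?</p>', re.DOTALL)
print(pattern.sub(lambda match: '', html))  # prints: <p>Tekst.</p>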

recipes/kresy_pl.recipe Normal file (14 lines)

@@ -0,0 +1,14 @@
from calibre.web.feeds.news import BasicNewsRecipe


class Kresy(BasicNewsRecipe):
    title = u'Kresy'
    __author__ = 'fenuks'
    description = u'portal społeczności kresowej'
    language = 'pl'
    masthead_url = 'http://www.kresy.pl/public/img/logo.png'
    cover_url = 'http://www.kresy.pl/public/img/logo.png'
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    keep_only_tags = [dict(id='artykul')]
    remove_tags = [dict(attrs={'class':['twitter-share-button', 'likefbborder', 'tagi']})]
    feeds = [(u'Wszystkie', u'http://www.kresy.pl/rss')]

recipes/oclab_pl.recipe Normal file (31 lines)

@@ -0,0 +1,31 @@
from calibre.web.feeds.news import BasicNewsRecipe


class OCLab(BasicNewsRecipe):
    title = u'OCLab.pl'
    oldest_article = 7
    max_articles_per_feed = 100
    __author__ = 'fenuks'
    description = u'Portal OCLab.pl jest miejscem przyjaznym pasjonatom sprzętu komputerowego, w szczególności overclockerom, które będzie służyć im za aktualną bazę wiedzy o podkręcaniu komputera, źródło aktualnych informacji z rynku oraz opinii na temat sprzętu komputerowego.'
    category = 'IT'
    language = 'pl'
    cover_url = 'http://www.idealforum.ru/attachment.php?attachmentid=7963&d=1316008118'
    no_stylesheets = True
    keep_only_tags = [dict(id='main')]
    remove_tags_after = dict(attrs={'class':'single-postmetadata'})
    remove_tags = [dict(attrs={'class':['single-postmetadata', 'pagebar']})]
    feeds = [(u'Wpisy', u'http://oclab.pl/feed/')]

    def append_page(self, soup, appendtag):
        # The pager is a jump <select>; every <option value=...> holds a page
        # URL. Skip the first entry (the page already loaded) and the last,
        # then append each page's single-entry block.
        tag = soup.find(attrs={'class':'contentjumpddl'})
        if tag:
            nexturl = tag.findAll('option')
            for nextpage in nexturl[1:-1]:
                soup2 = self.index_to_soup(nextpage['value'])
                pagetext = soup2.find(attrs={'class':'single-entry'})
                pos = len(appendtag.contents)
                appendtag.insert(pos, pagetext)
            for r in appendtag.findAll(attrs={'class':'post-nav-bottom-list'}):
                r.extract()

    def preprocess_html(self, soup):
        self.append_page(soup, soup.body)
        return soup
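OCLab's pager is a jump <select>, so append_page reads page URLs from the <option> values instead of following "next" links. A standalone sketch of that extraction (not part of the commit; assumes bs4 and a made-up snippet):

from bs4 import BeautifulSoup

html = '''<div class="contentjumpddl"><select>
<option value="http://oclab.pl/artykul/1">1</option>
<option value="http://oclab.pl/artykul/2">2</option>
<option value="http://oclab.pl/artykul/3">3</option>
</select></div>'''

ddl = BeautifulSoup(html, 'html.parser').find(attrs={'class': 'contentjumpddl'})
# nexturl[1:-1] in the recipe: skip the first option (the loaded page)
# and the last one.
urls = [o['value'] for o in ddl.find_all('option')[1:-1]]
print(urls)  # ['http://oclab.pl/artykul/2']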

@@ -0,0 +1,37 @@
import re
from calibre.web.feeds.news import BasicNewsRecipe


class Overclock_pl(BasicNewsRecipe):
    title = u'Overclock.pl'
    oldest_article = 7
    max_articles_per_feed = 100
    __author__ = 'fenuks'
    description = u'Vortal poświęcony tematyce hardware, kładący największy nacisk na podkręcanie / overclocking (włącznie z extreme) i chłodzenie / cooling (air cooling, water cooling, freon cooling, dry ice, liquid nitrogen).'
    category = 'IT'
    language = 'pl'
    masthead_url = 'http://www.overclock.pl/gfx/logo_m.png'
    cover_url = 'http://www.overclock.pl/gfx/logo_m.png'
    no_stylesheets = True
    remove_empty_feeds = True
    preprocess_regexps = [(re.compile(ur'<b>Komentarze do aktualności:.*?</a>', re.DOTALL), lambda match: ''),
                          (re.compile(ur'<h3>Nawigacja</h3>', re.DOTALL), lambda match: '')]
    keep_only_tags = [dict(name='div', attrs={'class':'news'}), dict(id='articleContent')]
    remove_tags = [dict(name='span', attrs={'class':'info'}), dict(attrs={'class':'shareit'})]
    feeds = [(u'Aktualno\u015bci', u'http://www.overclock.pl/rss.news.xml'),
             (u'Testy i recenzje', u'http://www.overclock.pl/rss.articles.xml')]

    def append_page(self, soup, appendtag):
        # Articles paginate through a #navigation <select>; skip the first
        # two <option> entries and fetch the rest, then remove the pager
        # ('Pierwsza' is its 'first page' arrow).
        tag = soup.find(id='navigation')
        if tag:
            nexturl = tag.findAll('option')
            tag.extract()
            for nextpage in nexturl[2:]:
                soup2 = self.index_to_soup(nextpage['value'])
                pagetext = soup2.find(id='content')
                pos = len(appendtag.contents)
                appendtag.insert(pos, pagetext)
            rem = appendtag.find(attrs={'alt':'Pierwsza'})
            if rem:
                rem.parent.extract()

    def preprocess_html(self, soup):
        self.append_page(soup, soup.body)
        return soup

recipes/palmtop_pl.recipe Normal file (14 lines)

@@ -0,0 +1,14 @@
from calibre.web.feeds.news import BasicNewsRecipe


class palmtop_pl(BasicNewsRecipe):
    title = u'Palmtop.pl'
    __author__ = 'fenuks'
    description = 'wortal technologii mobilnych'
    category = 'mobile'
    language = 'pl'
    cover_url = 'http://cdn.g-point.biz/wp-content/themes/palmtop-new/images/header_palmtop_logo.png'
    masthead_url = 'http://cdn.g-point.biz/wp-content/themes/palmtop-new/images/header_palmtop_logo.png'
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    feeds = [(u'Newsy', u'http://palmtop.pl/feed/atom/')]

recipes/pc_arena.recipe Normal file (31 lines)

@@ -0,0 +1,31 @@
from calibre.web.feeds.news import BasicNewsRecipe


class PC_Arena(BasicNewsRecipe):
    title = u'PCArena'
    oldest_article = 18300  # effectively no age limit (about 50 years)
    max_articles_per_feed = 100
    __author__ = 'fenuks'
    description = u'Najnowsze informacje z branży IT - testy, recenzje, aktualności, rankingi, wywiady. Twoje źródło informacji o sprzęcie komputerowym.'
    category = 'IT'
    language = 'pl'
    masthead_url = 'http://pcarena.pl/public/design/frontend/images/logo.gif'
    cover_url = 'http://pcarena.pl/public/design/frontend/images/logo.gif'
    no_stylesheets = True
    keep_only_tags = [dict(attrs={'class':['artHeader', 'art']})]
    remove_tags = [dict(attrs={'class':'pages'})]
    feeds = [(u'Newsy', u'http://pcarena.pl/misc/rss/news'),
             (u'Artyku\u0142y', u'http://pcarena.pl/misc/rss/articles')]

    def append_page(self, soup, appendtag):
        # Walk the numbered pager links, skipping the first (already-loaded)
        # page, and append each page's article body.
        tag = soup.find(name='div', attrs={'class':'pagNum'})
        if tag:
            nexturl = tag.findAll('a')
            tag.extract()
            for nextpage in nexturl[1:]:
                nextpage = 'http://pcarena.pl' + nextpage['href']
                soup2 = self.index_to_soup(nextpage)
                pagetext = soup2.find(attrs={'class':'artBody'})
                pos = len(appendtag.contents)
                appendtag.insert(pos, pagetext)

    def preprocess_html(self, soup):
        self.append_page(soup, soup.body)
        return soup

@@ -0,0 +1,41 @@
from calibre.web.feeds.news import BasicNewsRecipe


class PC_Centre(BasicNewsRecipe):
    title = u'PC Centre'
    oldest_article = 7
    max_articles_per_feed = 100
    __author__ = 'fenuks'
    description = u'Portal komputerowy, a w nim: testy sprzętu komputerowego, recenzje gier i oprogramowania, a także opisy produktów związanych z komputerami.'
    category = 'IT'
    language = 'pl'
    masthead_url = 'http://pccentre.pl/views/images/logo.gif'
    cover_url = 'http://pccentre.pl/views/images/logo.gif'
    no_stylesheets = True
    keep_only_tags = [dict(id='content')]
    remove_tags = [dict(attrs={'class':['ikony r', 'list_of_content', 'dot accordion']}), dict(id='comments')]
    feeds = [(u'Publikacje', u'http://pccentre.pl/backend.php?mode=a'),
             (u'Aktualno\u015bci', u'http://pccentre.pl/backend.php'),
             (u'Sprz\u0119t komputerowy', u'http://pccentre.pl/backend.php?mode=n&section=2'),
             (u'Oprogramowanie', u'http://pccentre.pl/backend.php?mode=n&section=3'),
             (u'Gry komputerowe i konsole', u'http://pccentre.pl/backend.php?mode=n&section=4'),
             (u'Internet', u'http://pccentre.pl/backend.php?mode=n&section=7'),
             (u'Bezpiecze\u0144stwo', u'http://pccentre.pl/backend.php?mode=n&section=5'),
             (u'Multimedia', u'http://pccentre.pl/backend.php?mode=n&section=6'),
             (u'Biznes', u'http://pccentre.pl/backend.php?mode=n&section=9')]

    def append_page(self, soup, appendtag):
        # Walk the pager links (all but the last entry) and append each
        # page's #content block, stripped of the repeated page chrome.
        tag = soup.find(name='div', attrs={'class':'pages'})
        if tag:
            nexturl = tag.findAll('a')
            tag.extract()
            for nextpage in nexturl[:-1]:
                nextpage = 'http://pccentre.pl' + nextpage['href']
                soup2 = self.index_to_soup(nextpage)
                pagetext = soup2.find(id='content')
                for r in pagetext.findAll(attrs={'class':['subtitle', 'content_info', 'list_of_content', 'pages', 'social2', 'pcc_acc', 'pcc_acc_na']}):
                    r.extract()
                for r in pagetext.findAll(id='comments'):
                    r.extract()
                for r in pagetext.findAll('h1'):
                    r.extract()
                pos = len(appendtag.contents)
                appendtag.insert(pos, pagetext)

    def preprocess_html(self, soup):
        self.append_page(soup, soup.body)
        return soup

recipes/pc_foster.recipe Normal file (35 lines)

@@ -0,0 +1,35 @@
from calibre.web.feeds.news import BasicNewsRecipe


class PC_Foster(BasicNewsRecipe):
    title = u'PC Foster'
    oldest_article = 7
    max_articles_per_feed = 100
    __author__ = 'fenuks'
    description = u'Vortal technologiczny: testy, recenzje sprzętu komputerowego i telefonów, nowinki hardware, programy i gry dla Windows. Podkręcanie, modding i Overclocking.'
    category = 'IT'
    language = 'pl'
    masthead_url = 'http://pcfoster.pl/public/images/logo.png'
    cover_url = 'http://pcfoster.pl/public/images/logo.png'
    no_stylesheets = True
    remove_empty_feeds = True
    keep_only_tags = [dict(id=['news_details', 'review_details']), dict(attrs={'class':'pager more_top'})]
    remove_tags = [dict(name='p', attrs={'class':'right'})]
    feeds = [(u'G\u0142\xf3wny', u'http://pcfoster.pl/public/rss/main.xml')]

    def append_page(self, soup, appendtag):
        # The next-page control is an element with alt="Następna strona"
        # inside a link; follow it until it disappears, then drop the
        # leftover review chrome.
        nexturl = appendtag.find(attrs={'alt':u'Następna strona'})
        if nexturl:
            appendtag.find(attrs={'class':'pager more_top'}).extract()
        while nexturl:
            nexturl = 'http://pcfoster.pl' + nexturl.parent['href']
            soup2 = self.index_to_soup(nexturl)
            nexturl = soup2.find(attrs={'alt':u'Następna strona'})
            pagetext = soup2.find(attrs={'class':'content'})
            pos = len(appendtag.contents)
            appendtag.insert(pos, pagetext)
        for r in appendtag.findAll(attrs={'class':'review_content double'}):
            r.extract()

    def preprocess_html(self, soup):
        self.append_page(soup, soup.body)
        return soup

@@ -0,0 +1,81 @@
from calibre.web.feeds.news import BasicNewsRecipe
import re


class Polska_times(BasicNewsRecipe):
    title = u'Polska Times'
    __author__ = 'fenuks'
    description = u'Internetowe wydanie dziennika ogólnopolskiego Polska The Times. Najświeższe informacje: wydarzenia w kraju i na świecie, reportaże, poradniki, opinie.'
    category = 'newspaper'
    language = 'pl'
    masthead_url = 'http://s.polskatimes.pl/g/logo_naglowek/polska.gif?17'
    oldest_article = 7
    max_articles_per_feed = 100
    remove_empty_feeds = True
    no_stylesheets = True
    # Strip the assorted "Czytaj także/też" and "Zobacz także" cross-link
    # blocks; cut everything from "CZYTAJ KONIECZNIE" / "Nasze serwisy" on.
    preprocess_regexps = [(re.compile(ur'<b>Czytaj także:.*?</b>', re.DOTALL), lambda match: ''),
                          (re.compile(ur',<b>Czytaj też:.*?</b>', re.DOTALL), lambda match: ''),
                          (re.compile(ur'<b>Zobacz także:.*?</b>', re.DOTALL), lambda match: ''),
                          (re.compile(ur'<center><h4><a.*?</a></h4></center>', re.DOTALL), lambda match: ''),
                          (re.compile(ur'<b>CZYTAJ TEŻ:.*?</b>', re.DOTALL), lambda match: ''),
                          (re.compile(ur'<b>CZYTAJ WIĘCEJ:.*?</b>', re.DOTALL), lambda match: ''),
                          (re.compile(ur'<b>CZYTAJ TAKŻE:.*?</b>', re.DOTALL), lambda match: ''),
                          (re.compile(ur'<b>\* CZYTAJ KONIECZNIE:.*', re.DOTALL), lambda match: '</body>'),
                          (re.compile(ur'<b>Nasze serwisy:</b>.*', re.DOTALL), lambda match: '</body>')]
    keep_only_tags = [dict(id=['tytul-artykulu', 'kontent'])]
    remove_tags_after = dict(id='material-tagi')
    remove_tags = [dict(attrs={'id':'reklama_srodtekst_0'}),
                   dict(attrs={'id':'material-tagi'}),
                   dict(name='div', attrs={'class':'zakladki'}),
                   dict(attrs={'title':u'CZYTAJ TAKŻE'}),
                   dict(attrs={'id':'podobne'}),
                   dict(name='a', attrs={'href':'http://www.dzienniklodzki.pl/newsletter'})]
    feeds = [(u'Fakty', u'http://polskatimes.feedsportal.com/c/32980/f/533648/index.rss'),
             (u'Opinie', u'http://www.polskatimes.pl/rss/opinie.xml'),
             (u'Sport', u'http://polskatimes.feedsportal.com/c/32980/f/533649/index.rss'),
             (u'Pieni\u0105dze', u'http://polskatimes.feedsportal.com/c/32980/f/533657/index.rss'),
             (u'Twoje finanse', u'http://www.polskatimes.pl/rss/twojefinanse.xml'),
             (u'Kultura', u'http://polskatimes.feedsportal.com/c/32980/f/533650/index.rss'),
             (u'Dodatki', u'http://www.polskatimes.pl/rss/dodatki.xml')]

    def skip_ad_pages(self, soup):
        # Interstitial ad pages carry 'Advertisement' in their title;
        # follow the first link to reach the real article.
        if 'Advertisement' in soup.title:
            nexturl = soup.find('a')['href']
            return self.index_to_soup(nexturl, raw=True)

    def append_page(self, soup, appendtag):
        # Text articles: follow the 'nastepna_strona' link page by page.
        nexturl = soup.find(id='nastepna_strona')
        while nexturl:
            soup2 = self.index_to_soup(nexturl['href'])
            nexturl = soup2.find(id='nastepna_strona')
            pagetext = soup2.find(id='tresc')
            # Re-apply the recipe's remove_tags filters to each fetched page
            for dictionary in self.remove_tags:
                for delete in pagetext.findAll(attrs=dictionary['attrs']):
                    delete.extract()
            for b in pagetext.findAll(name='b'):
                if b.string and any(phrase in b.string for phrase in
                        (u'CZYTAJ TEŻ', u'Czytaj także', u'Czytaj też', u'Zobacz także')):
                    b.extract()
            for center in pagetext.findAll(name='center'):
                if center.h4 and center.h4.a:
                    center.extract()
            pos = len(appendtag.contents)
            appendtag.insert(pos, pagetext)
        for paginator in appendtag.findAll(attrs={'class':'stronicowanie'}):
            paginator.extract()

    def image_article(self, soup, appendtag):
        # Photo galleries: walk the 'nastepna' links, remembering visited
        # ones so a circular pager cannot loop forever.
        nexturl = soup.find('a', attrs={'class':'nastepna'})
        urls = []
        while nexturl:
            if nexturl not in urls:
                urls.append(nexturl)
            else:
                break
            soup2 = self.index_to_soup('http://www.polskatimes.pl/artykul/' + nexturl['href'])
            nexturl = soup2.find('a', attrs={'class':'nastepna'})
            if nexturl in urls:
                break
            pagetext = soup2.find(id='galeria-material')
            pos = len(appendtag.contents)
            appendtag.insert(pos, '<br />')
            pos = len(appendtag.contents)
            appendtag.insert(pos, pagetext)
        for rem in appendtag.findAll(attrs={'class':['galeriaNawigator', 'miniaturyPojemnik']}):
            rem.extract()
        for paginator in appendtag.findAll(attrs={'class':'stronicowanie'}):
            paginator.extract()

    def preprocess_html(self, soup):
        if soup.find('a', attrs={'class':'nastepna'}):
            self.image_article(soup, soup.body)
        elif soup.find(id='nastepna_strona'):
            self.append_page(soup, soup.body)
        return soup

    def get_cover_url(self):
        soup = self.index_to_soup('http://www.prasa24.pl/gazeta/metropolia-warszawska/')
        self.cover_url = soup.find(id='pojemnik').img['src']
        return self.cover_url
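skip_ad_pages is the hook that lets a recipe swap out a fetched document before any other processing runs: returning None keeps the page, while returning raw markup makes calibre process that markup instead. A minimal sketch of the contract as used above (not part of the commit; AdSkippingRecipe is a hypothetical name):

from calibre.web.feeds.news import BasicNewsRecipe

class AdSkippingRecipe(BasicNewsRecipe):
    title = 'Ad skipping sketch'

    def skip_ad_pages(self, soup):
        # If the fetched page is an interstitial ad, follow its first
        # link and hand calibre the target page's raw HTML instead.
        if soup.title and 'Advertisement' in soup.title:
            first_link = soup.find('a')
            if first_link is not None:
                return self.index_to_soup(first_link['href'], raw=True)
        return None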

recipes/pure_pc.recipe Normal file (33 lines)

@@ -0,0 +1,33 @@
from calibre.web.feeds.news import BasicNewsRecipe


class PurePC(BasicNewsRecipe):
    title = u'PurePC'
    oldest_article = 7
    max_articles_per_feed = 100
    __author__ = 'fenuks'
    description = u'Artykuły, aktualności, sprzęt, forum, chłodzenie, modding, urządzenia mobilne - wszystko w jednym miejscu.'
    category = 'IT'
    language = 'pl'
    masthead_url = 'http://www.purepc.pl/themes/new/images/purepc.jpg'
    cover_url = 'http://www.purepc.pl/themes/new/images/purepc.jpg'
    no_stylesheets = True
    keep_only_tags = [dict(id='content')]
    remove_tags_after = dict(attrs={'class':'fivestar-widget'})
    remove_tags = [dict(id='navigator'), dict(attrs={'class':['box-tools', 'fivestar-widget', 'PageMenuList']})]
    feeds = [(u'Wiadomo\u015bci', u'http://www.purepc.pl/node/feed')]

    def append_page(self, soup, appendtag):
        # Follow the 'pager-next' link until the last page, then remove
        # the pager widgets from the stitched article.
        nexturl = appendtag.find(attrs={'class':'pager-next'})
        if nexturl:
            while nexturl:
                soup2 = self.index_to_soup('http://www.purepc.pl' + nexturl.a['href'])
                nexturl = soup2.find(attrs={'class':'pager-next'})
                pagetext = soup2.find(attrs={'class':'article'})
                pos = len(appendtag.contents)
                appendtag.insert(pos, pagetext)
            for r in appendtag.findAll(attrs={'class':['PageMenuList', 'pager', 'fivestar-widget']}):
                r.extract()

    def preprocess_html(self, soup):
        self.append_page(soup, soup.body)
        return soup

recipes/tanuki.recipe Normal file (37 lines)

@@ -0,0 +1,37 @@
from calibre.web.feeds.news import BasicNewsRecipe
import re


class tanuki(BasicNewsRecipe):
    title = u'Tanuki'
    oldest_article = 7
    __author__ = 'fenuks'
    category = 'anime, manga'
    language = 'pl'
    max_articles_per_feed = 100
    encoding = 'utf-8'
    extra_css = 'ul {list-style: none; padding: 0; margin: 0;} .kadr{float: left;} .dwazdania {float: right;}'
    preprocess_regexps = [(re.compile(ur'<h3><a class="screen".*?</h3>', re.DOTALL), lambda match: ''),
                          (re.compile(ur'<div><a href="/strony/((manga)|(anime))/[0-9]+?/oceny(\-redakcji){0,1}">Zobacz jak ocenili</a></div>', re.DOTALL), lambda match: '')]
    remove_empty_feeds = True
    no_stylesheets = True
    keep_only_tags = [dict(attrs={'class':['animename', 'storyname', 'nextarrow', 'sideinfov', 'sidelinfov', 'sideinfo', 'sidelinfo']}),
                      dict(name='table', attrs={'summary':'Technikalia'}),
                      dict(attrs={'class':['chaptername', 'copycat']}),
                      dict(id='rightcolumn'),
                      dict(attrs={'class':['headn_tt', 'subtable']})]
    remove_tags = [dict(name='div', attrs={'class':'screen'}), dict(id='randomtoplist'), dict(attrs={'class':'note'})]
    feeds = [(u'Anime', u'http://anime.tanuki.pl/rss_anime.xml'),
             (u'Manga', u'http://manga.tanuki.pl/rss_manga.xml'),
             (u'Tomiki', u'http://manga.tanuki.pl/rss_mangabooks.xml'),
             (u'Artyku\u0142y', u'http://czytelnia.tanuki.pl/rss_czytelnia_artykuly.xml'),
             (u'Opowiadania', u'http://czytelnia.tanuki.pl/rss_czytelnia.xml')]

    def append_page(self, soup, appendtag):
        # Multi-page stories use a 'nextarrow' link; append each page's
        # chapter text and its 'copycat' footer, then drop the arrows.
        nexturl = appendtag.find(attrs={'class':'nextarrow'})
        if nexturl:
            while nexturl:
                soup2 = self.index_to_soup('http://czytelnia.tanuki.pl' + nexturl['href'])
                nexturl = soup2.find(attrs={'class':'nextarrow'})
                pagetext = soup2.find(attrs={'class':['chaptername', 'copycat']})
                pos = len(appendtag.contents)
                appendtag.insert(pos, pagetext)
                pagetext = soup2.find(attrs={'class':'copycat'})
                pos = len(appendtag.contents)
                appendtag.insert(pos, pagetext)
            for r in appendtag.findAll(attrs={'class':'nextarrow'}):
                r.extract()

    def preprocess_html(self, soup):
        self.append_page(soup, soup.body)
        return soup

recipes/tvn24.recipe Normal file (24 lines)

@@ -0,0 +1,24 @@
from calibre.web.feeds.news import BasicNewsRecipe


class tvn24(BasicNewsRecipe):
    title = u'TVN24'
    oldest_article = 7
    max_articles_per_feed = 100
    __author__ = 'fenuks'
    description = u'Sport, Biznes, Gospodarka, Informacje, Wiadomości. Zawsze aktualne wiadomości z Polski i ze świata'
    category = 'news'
    language = 'pl'
    masthead_url = 'http://www.tvn24.pl/_d/topmenu/logo2.gif'
    cover_url = 'http://www.tvn24.pl/_d/topmenu/logo2.gif'
    extra_css = 'ul {list-style: none; padding: 0; margin: 0;} li {float: left; margin: 0 0.15em;}'
    remove_empty_feeds = True
    remove_javascript = True
    no_stylesheets = True
    keep_only_tags = [dict(id='tvn24_wiadomosci_detal'),
                      dict(name='h1', attrs={'class':'standardHeader1'}),
                      dict(attrs={'class':['date60m rd5', 'imageBackground fl rd7', 'contentFromCMS']})]
    remove_tags_after = dict(name='div', attrs={'class':'socialBoxesBottom'})
    remove_tags = [dict(attrs={'class':['tagi_detal', 'socialBoxesBottom', 'twitterBox', 'commentsInfo', 'textSize', 'obj_ukrytydruk obj_ramka1_r', 'related newsNews align-right', 'box', 'newsUserList', 'watchMaterial text']})]
    feeds = [(u'Najnowsze', u'http://www.tvn24.pl/najnowsze.xml'),
             (u'Polska', u'http://www.tvn24.pl/polska.xml'),
             (u'\u015awiat', u'http://www.tvn24.pl/swiat.xml'),
             (u'Sport', u'http://www.tvn24.pl/sport.xml'),
             (u'Biznes', u'http://www.tvn24.pl/biznes.xml'),
             (u'Meteo', u'http://www.tvn24.pl/meteo.xml'),
             (u'Micha\u0142ki', u'http://www.tvn24.pl/michalki.xml'),
             (u'Kultura', u'http://www.tvn24.pl/kultura.xml')]

    def preprocess_html(self, soup):
        # Drop inline styles so extra_css controls the layout
        for item in soup.findAll(style=True):
            del item['style']
        return soup
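The preprocess_html above deletes inline style attributes so the recipe's extra_css wins. The same idea as a standalone snippet (not part of the commit; uses bs4 and made-up HTML):

from bs4 import BeautifulSoup

soup = BeautifulSoup('<p style="color:red">tekst</p>', 'html.parser')
for item in soup.findAll(style=True):
    del item['style']
print(soup)  # <p>tekst</p>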

@@ -0,0 +1,39 @@
from calibre.web.feeds.news import BasicNewsRecipe


class webhosting_pl(BasicNewsRecipe):
    title = u'Webhosting.pl'
    __author__ = 'fenuks'
    description = 'Webhosting.pl to pierwszy na polskim rynku serwis poruszający w szerokim aspekcie tematy związane z hostingiem, globalną Siecią i usługami internetowymi. Głównym celem przedsięwzięcia jest dostarczanie przydatnej i bogatej merytorycznie wiedzy osobom, które chcą tworzyć i efektywnie wykorzystywać współczesny Internet.'
    category = 'web'
    language = 'pl'
    cover_url = 'http://webhosting.pl/images/logo.png'
    masthead_url = 'http://webhosting.pl/images/logo.png'
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    remove_empty_feeds = True
    #keep_only_tags = [dict(name='div', attrs={'class':'content_article'}), dict(attrs={'class':'paging'})]
    #remove_tags = [dict(attrs={'class':['tags', 'wykop', 'facebook_button_count', 'article_bottom']})]
    feeds = [(u'Newsy', u'http://webhosting.pl/feed/rss/an'),
             (u'Artyku\u0142y', u'http://webhosting.pl/feed/rss/aa'),
             (u'Software', u'http://webhosting.pl/feed/rss/n/12'),
             (u'Internet', u'http://webhosting.pl/feed/rss/n/9'),
             (u'Biznes', u'http://webhosting.pl/feed/rss/n/13'),
             (u'Bezpiecze\u0144stwo', u'http://webhosting.pl/feed/rss/n/10'),
             (u'Blogi', u'http://webhosting.pl/feed/rss/ab'),
             (u'Programowanie', u'http://webhosting.pl/feed/rss/n/8'),
             (u'Kursy', u'http://webhosting.pl/feed/rss/n/11'),
             (u'Tips&Tricks', u'http://webhosting.pl/feed/rss/n/15'),
             (u'Imprezy', u'http://webhosting.pl/feed/rss/n/22'),
             (u'Wywiady', u'http://webhosting.pl/feed/rss/n/24'),
             (u'Porady', u'http://webhosting.pl/feed/rss/n/3027'),
             (u'Znalezione w sieci', u'http://webhosting.pl/feed/rss/n/6804'),
             (u'Dev area', u'http://webhosting.pl/feed/rss/n/24504'),
             (u"Webmaster's blog", u'http://webhosting.pl/feed/rss/n/29195'),
             (u'Domeny', u'http://webhosting.pl/feed/rss/n/11513'),
             (u'Praktyka', u'http://webhosting.pl/feed/rss/n/2'),
             (u'Serwery', u'http://webhosting.pl/feed/rss/n/11514'),
             (u'Inne', u'http://webhosting.pl/feed/rss/n/24811'),
             (u'Marketing', u'http://webhosting.pl/feed/rss/n/11535')]

    def print_version(self, url):
        return url.replace('webhosting.pl', 'webhosting.pl/print')
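print_version rewrites each article URL to the site's printer-friendly mirror before download, so no per-page scraping is needed. For example (not part of the commit; the article path is made up):

url = 'http://webhosting.pl/Przykladowy.artykul'
print(url.replace('webhosting.pl', 'webhosting.pl/print'))
# prints: http://webhosting.pl/print/Przykladowy.artykul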