fixes for old recipes made by fenuks

This commit is contained in:
Tomasz Długosz 2013-03-06 20:34:04 +01:00
parent 72b925e22a
commit c9fc48c120
40 changed files with 228 additions and 173 deletions

View File

@ -3,7 +3,7 @@ import re
class Adventure_zone(BasicNewsRecipe):
title = u'Adventure Zone'
__author__ = 'fenuks'
description = u'Adventure zone - adventure games from A to Z'
description = u'Czytaj więcej o przygodzie - codzienne nowinki. Szukaj u nas solucji i poradników, czytaj recenzje i zapowiedzi. Także galeria, pliki oraz forum dla wszystkich fanów gier przygodowych.'
category = 'games'
language = 'pl'
no_stylesheets = True

View File

@ -5,6 +5,7 @@ class Archeowiesci(BasicNewsRecipe):
__author__ = 'fenuks'
category = 'archeology'
language = 'pl'
description = u'Z pasją o przeszłości'
cover_url='http://archeowiesci.pl/wp-content/uploads/2011/05/Archeowiesci2-115x115.jpg'
oldest_article = 7
needs_subscription='optional'

View File

@ -2,7 +2,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
class AstroNEWS(BasicNewsRecipe):
title = u'AstroNEWS'
__author__ = 'fenuks'
description = 'AstroNEWS- astronomy every day'
description = u'AstroNEWS regularnie dostarcza wiadomości o wydarzeniach związanych z astronomią i astronautyką. Informujemy o aktualnych odkryciach i wydarzeniach naukowych, zapowiadamy ciekawe zjawiska astronomiczne. Serwis jest częścią portalu astronomicznego AstroNET prowadzonego przez miłośników astronomii i zawodowych astronomów.'
category = 'astronomy, science'
language = 'pl'
oldest_article = 8

View File

@ -13,6 +13,7 @@ class Astroflesz(BasicNewsRecipe):
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
remove_attributes = ['style']
keep_only_tags = [dict(id="k2Container")]
remove_tags_after = dict(name='div', attrs={'class':'itemLinks'})
remove_tags = [dict(name='div', attrs={'class':['itemLinks', 'itemToolbar', 'itemRatingBlock']})]

View File

@ -3,7 +3,7 @@ import re
class Astronomia_pl(BasicNewsRecipe):
title = u'Astronomia.pl'
__author__ = 'fenuks'
description = 'Astronomia - polish astronomy site'
description = u'Astronomia.pl jest edukacyjnym portalem skierowanym do uczniów, studentów i miłośników astronomii. Przedstawiamy gwiazdy, planety, galaktyki, czarne dziury i wiele innych tajemnic Wszechświata.'
masthead_url = 'http://www.astronomia.pl/grafika/logo.gif'
cover_url = 'http://www.astronomia.pl/grafika/logo.gif'
category = 'astronomy, science'

View File

@ -3,7 +3,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
class Bash_org_pl(BasicNewsRecipe):
title = u'Bash.org.pl'
__author__ = 'fenuks'
description = 'Bash.org.pl - funny quotations from IRC discussions'
description = 'Bash.org.pl - zabawne cytaty z IRC'
category = 'funny quotations, humour'
language = 'pl'
cover_url = u'http://userlogos.org/files/logos/dzikiosiol/none_0.png'

View File

@ -3,14 +3,15 @@ import re
class BenchmarkPl(BasicNewsRecipe):
title = u'Benchmark.pl'
__author__ = 'fenuks'
description = u'benchmark.pl -IT site'
description = u'benchmark.pl, recenzje i testy sprzętu, aktualności, rankingi, sterowniki, porady, opinie'
masthead_url = 'http://www.benchmark.pl/i/logo-footer.png'
cover_url = 'http://www.ieaddons.pl/benchmark/logo_benchmark_new.gif'
cover_url = 'http://www.benchmark.pl/i/logo-dark.png'
category = 'IT'
language = 'pl'
oldest_article = 8
max_articles_per_feed = 100
no_stylesheets=True
no_stylesheets = True
remove_attributes = ['style']
preprocess_regexps = [(re.compile(ur'<h3><span style="font-size: small;">&nbsp;Zobacz poprzednie <a href="http://www.benchmark.pl/news/zestawienie/grupa_id/135">Opinie dnia:</a></span>.*</body>', re.DOTALL|re.IGNORECASE), lambda match: '</body>'), (re.compile(ur'Więcej o .*?</ul>', re.DOTALL|re.IGNORECASE), lambda match: '')]
keep_only_tags=[dict(name='div', attrs={'class':['m_zwykly', 'gallery']}), dict(id='article')]
remove_tags_after=dict(name='div', attrs={'class':'body'})
@ -21,17 +22,18 @@ class BenchmarkPl(BasicNewsRecipe):
def append_page(self, soup, appendtag):
nexturl = soup.find('span', attrs={'class':'next'})
while nexturl is not None:
nexturl= self.INDEX + nexturl.parent['href']
soup2 = self.index_to_soup(nexturl)
nexturl=soup2.find('span', attrs={'class':'next'})
nexturl = soup.find(attrs={'class':'next'})
while nexturl:
soup2 = self.index_to_soup(nexturl['href'])
nexturl = soup2.find(attrs={'class':'next'})
pagetext = soup2.find(name='div', attrs={'class':'body'})
appendtag.find('div', attrs={'class':'k_ster'}).extract()
pos = len(appendtag.contents)
appendtag.insert(pos, pagetext)
if appendtag.find('div', attrs={'class':'k_ster'}) is not None:
if appendtag.find('div', attrs={'class':'k_ster'}):
appendtag.find('div', attrs={'class':'k_ster'}).extract()
for r in appendtag.findAll(attrs={'class':'changePage'}):
r.extract()
def image_article(self, soup, appendtag):

View File

@ -3,7 +3,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
class CD_Action(BasicNewsRecipe):
title = u'CD-Action'
__author__ = 'fenuks'
description = 'cdaction.pl - polish games magazine site'
description = 'Strona CD-Action (CDA), największego w Polsce pisma dla graczy.Pełne wersje gier, newsy, recenzje, zapowiedzi, konkursy, forum, opinie, galerie screenów,trailery, filmiki, patche, teksty. Gry komputerowe (PC) oraz na konsole (PS3, XBOX 360).'
category = 'games'
language = 'pl'
index='http://www.cdaction.pl'

View File

@ -7,17 +7,13 @@ class Computerworld_pl(BasicNewsRecipe):
description = u'Serwis o IT w przemyśle, finansach, handlu, administracji oraz rynku IT i telekomunikacyjnym - wiadomości, opinie, analizy, porady prawne'
category = 'IT'
language = 'pl'
masthead_url= 'http://g1.computerworld.pl/cw/beta_gfx/cw2.gif'
no_stylesheets=True
masthead_url = 'http://g1.computerworld.pl/cw/beta_gfx/cw2.gif'
cover_url = 'http://g1.computerworld.pl/cw/beta_gfx/cw2.gif'
no_stylesheets = True
oldest_article = 7
max_articles_per_feed = 100
keep_only_tags=[dict(attrs={'class':['tyt_news', 'prawo', 'autor', 'tresc']})]
remove_tags_after=dict(name='div', attrs={'class':'rMobi'})
remove_tags=[dict(name='div', attrs={'class':['nnav', 'rMobi']}), dict(name='table', attrs={'class':'ramka_slx'})]
keep_only_tags = [dict(attrs={'class':['tyt_news', 'prawo', 'autor', 'tresc']})]
remove_tags_after = dict(name='div', attrs={'class':'rMobi'})
remove_tags = [dict(name='div', attrs={'class':['nnav', 'rMobi']}), dict(name='table', attrs={'class':'ramka_slx'})]
feeds = [(u'Wiadomo\u015bci', u'http://rssout.idg.pl/cw/news_iso.xml')]
def get_cover_url(self):
soup = self.index_to_soup('http://www.computerworld.pl/')
cover=soup.find(name='img', attrs={'class':'prawo'})
self.cover_url=cover['src']
return getattr(self, 'cover_url', self.cover_url)

View File

@ -4,11 +4,12 @@ class CoNowegoPl(BasicNewsRecipe):
title = u'conowego.pl'
__author__ = 'fenuks'
description = u'Nowy wortal technologiczny oraz gazeta internetowa. Testy najnowszych produktów, fachowe porady i recenzje. U nas znajdziesz wszystko o elektronice użytkowej !'
cover_url = 'http://www.conowego.pl/fileadmin/templates/main/images/logo_top.png'
#cover_url = 'http://www.conowego.pl/fileadmin/templates/main/images/logo_top.png'
category = 'IT, news'
language = 'pl'
oldest_article = 7
max_articles_per_feed = 100
INDEX = 'http://www.conowego.pl/'
no_stylesheets = True
remove_empty_feeds = True
use_embedded_content = False
@ -36,3 +37,10 @@ class CoNowegoPl(BasicNewsRecipe):
for r in appendtag.findAll(attrs={'class':['pages', 'paginationWrap']}):
r.extract()
def get_cover_url(self):
    """Return the cover URL scraped from the conowego.pl magazine page.

    Fetches http://www.conowego.pl/magazyn/ and looks up the first element
    whose class is ``ms_left``. When that element contains an ``<img>``,
    ``self.cover_url`` is set to ``self.INDEX`` joined with the image's
    ``src``. If either element is missing, nothing is assigned and whatever
    ``cover_url`` already holds (or None) is returned.
    """
    soup = self.index_to_soup('http://www.conowego.pl/magazyn/')
    tag = soup.find(attrs={'class':'ms_left'})
    # Guard the nested lookup as well: find() returns None when there is no
    # <img>, and subscripting None would abort the whole download.
    img = tag.find('img') if tag is not None else None
    if img is not None:
        self.cover_url = self.INDEX + img['src']
    return getattr(self, 'cover_url', None)

View File

@ -1,4 +1,5 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
import re
from calibre.web.feeds.news import BasicNewsRecipe
class CzasGentlemanow(BasicNewsRecipe):
@ -13,8 +14,9 @@ class CzasGentlemanow(BasicNewsRecipe):
max_articles_per_feed = 100
no_stylesheets = True
remove_empty_feeds = True
preprocess_regexps = [(re.compile(u'<h3>Może Cię też zainteresować:</h3>'), lambda m: '')]
use_embedded_content = False
keep_only_tags = [dict(name='div', attrs={'class':'content'})]
remove_tags = [dict(attrs={'class':'meta_comments'})]
remove_tags_after = dict(name='div', attrs={'class':'fblikebutton_button'})
remove_tags = [dict(attrs={'class':'meta_comments'}), dict(id=['comments', 'related_posts_thumbnails'])]
remove_tags_after = dict(id='comments')
feeds = [(u'M\u0119ski \u015awiat', u'http://czasgentlemanow.pl/category/meski-swiat/feed/'), (u'Styl', u'http://czasgentlemanow.pl/category/styl/feed/'), (u'Vademecum Gentlemana', u'http://czasgentlemanow.pl/category/vademecum/feed/'), (u'Dom i rodzina', u'http://czasgentlemanow.pl/category/dom-i-rodzina/feed/'), (u'Honor', u'http://czasgentlemanow.pl/category/honor/feed/'), (u'Gad\u017cety Gentlemana', u'http://czasgentlemanow.pl/category/gadzety-gentlemana/feed/')]

View File

@ -18,7 +18,7 @@ class Dobreprogramy_pl(BasicNewsRecipe):
max_articles_per_feed = 100
preprocess_regexps = [(re.compile(ur'<div id="\S+360pmp4">Twoja przeglądarka nie obsługuje Flasha i HTML5 lub wyłączono obsługę JavaScript...</div>'), lambda match: '') ]
keep_only_tags=[dict(attrs={'class':['news', 'entry single']})]
remove_tags = [dict(attrs={'class':['newsOptions', 'noPrint', 'komentarze', 'tags font-heading-master']}), dict(id='komentarze')]
remove_tags = [dict(attrs={'class':['newsOptions', 'noPrint', 'komentarze', 'tags font-heading-master']}), dict(id='komentarze'), dict(name='iframe')]
#remove_tags = [dict(name='div', attrs={'class':['komentarze', 'block', 'portalInfo', 'menuBar', 'topBar']})]
feeds = [(u'Aktualności', 'http://feeds.feedburner.com/dobreprogramy/Aktualnosci'),
('Blogi', 'http://feeds.feedburner.com/dobreprogramy/BlogCzytelnikow')]

View File

@ -3,7 +3,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
class Dzieje(BasicNewsRecipe):
title = u'dzieje.pl'
__author__ = 'fenuks'
description = 'Dzieje - history of Poland'
description = 'Dzieje.pl - najlepszy portal informacyjno-edukacyjny dotyczący historii Polski XX wieku. Archiwalne fotografie, filmy, katalog postaci, quizy i konkursy.'
cover_url = 'http://www.dzieje.pl/sites/default/files/dzieje_logo.png'
category = 'history'
language = 'pl'

View File

@ -4,6 +4,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
class eioba(BasicNewsRecipe):
title = u'eioba'
__author__ = 'fenuks'
description = u'eioba.pl - daj się przeczytać!'
cover_url = 'http://www.eioba.org/lay/logo_pl_v3.png'
language = 'pl'
oldest_article = 7

View File

@ -5,7 +5,7 @@ class Elektroda(BasicNewsRecipe):
title = u'Elektroda'
oldest_article = 8
__author__ = 'fenuks'
description = 'Elektroda.pl'
description = 'Międzynarodowy portal elektroniczny udostępniający bogate zasoby z dziedziny elektroniki oraz forum dyskusyjne.'
cover_url = 'http://demotywatory.elektroda.pl/Thunderpic/logo.gif'
category = 'electronics'
language = 'pl'

View File

@ -12,6 +12,7 @@ class eMuzyka(BasicNewsRecipe):
no_stylesheets = True
oldest_article = 7
max_articles_per_feed = 100
remove_attributes = ['style']
keep_only_tags=[dict(name='div', attrs={'id':'news_container'}), dict(name='h3'), dict(name='div', attrs={'class':'review_text'})]
remove_tags=[dict(name='span', attrs={'id':'date'})]
feeds = [(u'Aktualno\u015bci', u'http://www.emuzyka.pl/rss.php?f=1'), (u'Recenzje', u'http://www.emuzyka.pl/rss.php?f=2')]

View File

@ -4,21 +4,21 @@ from calibre.ebooks.BeautifulSoup import BeautifulSoup
class FilmWebPl(BasicNewsRecipe):
title = u'FilmWeb'
__author__ = 'fenuks'
description = 'FilmWeb - biggest polish movie site'
cover_url = 'http://userlogos.org/files/logos/crudus/filmweb.png'
description = 'Filmweb.pl - Filmy takie jak Ty Filmweb to największy i najczęściej odwiedzany polski serwis filmowy. Największa baza filmów, seriali i aktorów, repertuar kin i tv, ...'
cover_url = 'http://gfx.filmweb.pl/n/logo-filmweb-bevel.jpg'
category = 'movies'
language = 'pl'
index='http://www.filmweb.pl'
index = 'http://www.filmweb.pl'
oldest_article = 8
max_articles_per_feed = 100
no_stylesheets= True
remove_empty_feeds=True
no_stylesheets = True
remove_empty_feeds = True
ignore_duplicate_articles = {'title', 'url'}
preprocess_regexps = [(re.compile(u'\(kliknij\,\ aby powiększyć\)', re.IGNORECASE), lambda m: ''), ]#(re.compile(ur' | ', re.IGNORECASE), lambda m: '')]
extra_css = '.hdrBig {font-size:22px;} ul {list-style-type:none; padding: 0; margin: 0;}'
remove_tags= [dict(name='div', attrs={'class':['recommendOthers']}), dict(name='ul', attrs={'class':'fontSizeSet'}), dict(attrs={'class':'userSurname anno'})]
remove_tags = [dict(name='div', attrs={'class':['recommendOthers']}), dict(name='ul', attrs={'class':'fontSizeSet'}), dict(attrs={'class':'userSurname anno'})]
remove_attributes = ['style',]
keep_only_tags= [dict(name='h1', attrs={'class':['hdrBig', 'hdrEntity']}), dict(name='div', attrs={'class':['newsInfo', 'newsInfoSmall', 'reviewContent description']})]
keep_only_tags = [dict(name='h1', attrs={'class':['hdrBig', 'hdrEntity']}), dict(name='div', attrs={'class':['newsInfo', 'newsInfoSmall', 'reviewContent description']})]
feeds = [(u'News / Filmy w produkcji', 'http://www.filmweb.pl/feed/news/category/filminproduction'),
(u'News / Festiwale, nagrody i przeglądy', u'http://www.filmweb.pl/feed/news/category/festival'),
(u'News / Seriale', u'http://www.filmweb.pl/feed/news/category/serials'),

View File

@ -13,7 +13,7 @@ class FocusRecipe(BasicNewsRecipe):
title = u'Focus'
publisher = u'Gruner + Jahr Polska'
category = u'News'
description = u'Newspaper'
description = u'Focus.pl - pierwszy w Polsce portal społecznościowy dla miłośników nauki. Tematyka: nauka, historia, cywilizacja, technika, przyroda, sport, gadżety'
category = 'magazine'
cover_url = ''
remove_empty_feeds = True

View File

@ -3,6 +3,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
class Fotoblogia_pl(BasicNewsRecipe):
title = u'Fotoblogia.pl'
__author__ = 'fenuks'
description = u'Jeden z największych polskich blogów o fotografii.'
category = 'photography'
language = 'pl'
masthead_url = 'http://img.interia.pl/komputery/nimg/u/0/fotoblogia21.jpg'
@ -11,6 +12,6 @@ class Fotoblogia_pl(BasicNewsRecipe):
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
keep_only_tags=[dict(name='div', attrs={'class':'post-view post-standard'})]
keep_only_tags=[dict(name='div', attrs={'class':['post-view post-standard', 'photo-container']})]
remove_tags=[dict(attrs={'class':['external fotoblogia', 'categories', 'tags']})]
feeds = [(u'Wszystko', u'http://fotoblogia.pl/feed/rss2')]

View File

@ -1,102 +1,91 @@
#!/usr/bin/env python
# # Przed uzyciem przeczytaj komentarz w sekcji "feeds"
__license__ = 'GPL v3'
__copyright__ = u'2010, Richard z forum.eksiazki.org'
'''pomorska.pl'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
class GazetaPomorska(BasicNewsRecipe):
title = u'Gazeta Pomorska'
publisher = u'Gazeta Pomorska'
description = u'Kujawy i Pomorze - wiadomo\u015bci'
__author__ = 'Richard z forum.eksiazki.org, fenuks'
description = u'Gazeta Pomorska - portal regionalny'
category = 'newspaper'
language = 'pl'
__author__ = u'Richard z forum.eksiazki.org'
# # (dziekuje t3d z forum.eksiazki.org za testy)
oldest_article = 2
max_articles_per_feed = 20
encoding = 'iso-8859-2'
extra_css = 'ul {list-style: none; padding:0; margin:0;}'
INDEX = 'http://www.pomorska.pl'
masthead_url = INDEX + '/images/top_logo.png'
oldest_article = 7
max_articles_per_feed = 100
remove_empty_feeds = True
no_stylesheets = True
remove_javascript = True
preprocess_regexps = [
(re.compile(r'<a href="http://maps.google[^>]*>[^<]*</a>\.*', re.DOTALL|re.IGNORECASE), lambda m: ''),
(re.compile(r'[<Bb >]*Poznaj opinie[^<]*[</Bb >]*[^<]*<a href[^>]*>[^<]*</a>\.*', re.DOTALL|re.IGNORECASE), lambda m: ''),
(re.compile(r'[<Bb >]*Przeczytaj[^<]*[</Bb >]*[^<]*<a href[^>]*>[^<]*</a>\.*', re.DOTALL|re.IGNORECASE), lambda m: ''),
(re.compile(r'[<Bb >]*Wi.cej informacji[^<]*[</Bb >]*[^<]*<a href[^>]*>[^<]*</a>\.*', re.DOTALL|re.IGNORECASE), lambda m: ''),
(re.compile(r'<a href[^>]*>[<Bb >]*Wideo[^<]*[</Bb >]*[^<]*</a>\.*', re.DOTALL|re.IGNORECASE), lambda m: ''),
(re.compile(r'<a href[^>]*>[<Bb >]*KLIKNIJ TUTAJ[^<]*[</Bb >]*[^<]*</a>\.*', re.DOTALL|re.IGNORECASE), lambda m: '')
]
ignore_duplicate_articles = {'title', 'url'}
feeds = [
# # Tutaj jest wymieniona lista kategorii jakie mozemy otrzymywac z Gazety
# # Pomorskiej, po jednej kategorii w wierszu. Jesli na poczatku danego wiersza
# # znajduje sie jeden znak "#", oznacza to ze kategoria jest zakomentowana
# # i nie bedziemy jej otrzymywac. Jesli chcemy ja otrzymywac nalezy usunac
# # znak # z jej wiersza.
# # Jesli subskrybujemy wiecej niz jedna kategorie, na koncu wiersza z kazda
# # kategoria musi sie znajdowac niezakomentowany przecinek, z wyjatkiem
# # ostatniego wiersza - ma byc bez przecinka na koncu.
# # Rekomendowane opcje wyboru kategorii:
# # 1. PomorskaRSS - wiadomosci kazdego typu, lub
# # 2. Region + wybrane miasta, lub
# # 3. Wiadomosci tematyczne.
# # Lista kategorii:
preprocess_regexps = [(re.compile(ur'Czytaj:.*?</a>', re.DOTALL), lambda match: ''), (re.compile(ur'Przeczytaj także:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''),
(re.compile(ur'Przeczytaj również:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''), (re.compile(ur'Zobacz też:.*?</a>', re.DOTALL|re.IGNORECASE), lambda match: '')]
# # PomorskaRSS - wiadomosci kazdego typu, zakomentuj znakiem "#"
# # przed odkomentowaniem wiadomosci wybranego typu:
(u'PomorskaRSS', u'http://www.pomorska.pl/rss.xml')
keep_only_tags = [dict(id=['article', 'cover', 'photostory'])]
remove_tags = [dict(id=['articleTags', 'articleMeta', 'boxReadIt', 'articleGalleries', 'articleConnections',
'ForumArticleComments', 'articleRecommend', 'jedynkiLinks', 'articleGalleryConnections',
'photostoryConnections', 'articleEpaper', 'articlePoll', 'articleAlarm', 'articleByline']),
dict(attrs={'class':'articleFunctions'})]
# # wiadomosci z regionu nie przypisane do okreslonego miasta:
# (u'Region', u'http://www.pomorska.pl/region.xml'),
feeds = [(u'Wszystkie', u'http://www.pomorska.pl/rss.xml'),
(u'Region', u'http://www.pomorska.pl/region.xml'),
(u'Bydgoszcz', u'http://www.pomorska.pl/bydgoszcz.xml'),
(u'Nakło', u'http://www.pomorska.pl/naklo.xml'),
(u'Koronowo', u'http://www.pomorska.pl/koronowo.xml'),
(u'Solec Kujawski', u'http://www.pomorska.pl/soleckujawski.xml'),
(u'Grudziądz', u'http://www.pomorska.pl/grudziadz.xml'),
(u'Inowrocław', u'http://www.pomorska.pl/inowroclaw.xml'),
(u'Toruń', u'http://www.pomorska.pl/torun.xml'),
(u'Włocławek', u'http://www.pomorska.pl/wloclawek.xml'),
(u'Aleksandrów Kujawski', u'http://www.pomorska.pl/aleksandrow.xml'),
(u'Brodnica', u'http://www.pomorska.pl/brodnica.xml'),
(u'Chełmno', u'http://www.pomorska.pl/chelmno.xml'),
(u'Chojnice', u'http://www.pomorska.pl/chojnice.xml'),
(u'Ciechocinek', u'http://www.pomorska.pl/ciechocinek.xml'),
(u'Golub-Dobrzyń', u'http://www.pomorska.pl/golubdobrzyn.xml'),
(u'Mogilno', u'http://www.pomorska.pl/mogilno.xml'),
(u'Radziejów', u'http://www.pomorska.pl/radziejow.xml'),
(u'Rypin', u'http://www.pomorska.pl/rypin.xml'),
(u'Sępólno', u'http://www.pomorska.pl/sepolno.xml'),
(u'Świecie', u'http://www.pomorska.pl/swiecie.xml'),
(u'Tuchola', u'http://www.pomorska.pl/tuchola.xml'),
(u'Żnin', u'http://www.pomorska.pl/znin.xml'),
(u'Sport', u'http://www.pomorska.pl/sport.xml'),
(u'Zdrowie', u'http://www.pomorska.pl/zdrowie.xml'),
(u'Auto', u'http://www.pomorska.pl/moto.xml'),
(u'Dom', u'http://www.pomorska.pl/dom.xml'),
#(u'Reporta\u017c', u'http://www.pomorska.pl/reportaz.xml'),
(u'Gospodarka', u'http://www.pomorska.pl/gospodarka.xml')]
# # wiadomosci przypisane do miast:
# (u'Bydgoszcz', u'http://www.pomorska.pl/bydgoszcz.xml'),
# (u'Nak\u0142o', u'http://www.pomorska.pl/naklo.xml'),
# (u'Koronowo', u'http://www.pomorska.pl/koronowo.xml'),
# (u'Solec Kujawski', u'http://www.pomorska.pl/soleckujawski.xml'),
# (u'Grudzi\u0105dz', u'http://www.pomorska.pl/grudziadz.xml'),
# (u'Inowroc\u0142aw', u'http://www.pomorska.pl/inowroclaw.xml'),
# (u'Toru\u0144', u'http://www.pomorska.pl/torun.xml'),
# (u'W\u0142oc\u0142awek', u'http://www.pomorska.pl/wloclawek.xml'),
# (u'Aleksandr\u00f3w Kujawski', u'http://www.pomorska.pl/aleksandrow.xml'),
# (u'Brodnica', u'http://www.pomorska.pl/brodnica.xml'),
# (u'Che\u0142mno', u'http://www.pomorska.pl/chelmno.xml'),
# (u'Chojnice', u'http://www.pomorska.pl/chojnice.xml'),
# (u'Ciechocinek', u'http://www.pomorska.pl/ciechocinek.xml'),
# (u'Golub Dobrzy\u0144', u'http://www.pomorska.pl/golubdobrzyn.xml'),
# (u'Mogilno', u'http://www.pomorska.pl/mogilno.xml'),
# (u'Radziej\u00f3w', u'http://www.pomorska.pl/radziejow.xml'),
# (u'Rypin', u'http://www.pomorska.pl/rypin.xml'),
# (u'S\u0119p\u00f3lno', u'http://www.pomorska.pl/sepolno.xml'),
# (u'\u015awiecie', u'http://www.pomorska.pl/swiecie.xml'),
# (u'Tuchola', u'http://www.pomorska.pl/tuchola.xml'),
# (u'\u017bnin', u'http://www.pomorska.pl/znin.xml')
def get_cover_url(self):
    """Return the cover URL scraped from the pomorska.pl front-page archive.

    Two pages are fetched: the ``Category=JEDYNKI`` section, from which the
    first link inside the element with id ``covers`` is followed, and the
    linked page, where the ``<img>`` inside the element with id ``cover``
    supplies the image path. ``self.INDEX`` is prefixed to both the link and
    the image path.

    If any of the expected elements is missing, ``self.cover_url`` is left
    untouched and its current value (or None) is returned, instead of
    raising and aborting the download.
    """
    soup = self.index_to_soup(self.INDEX + '/apps/pbcs.dll/section?Category=JEDYNKI')
    covers = soup.find(id='covers')
    link = covers.find('a') if covers is not None else None
    if link is not None:
        soup = self.index_to_soup(self.INDEX + link['href'])
        cover = soup.find(id='cover')
        img = cover.find(name='img') if cover is not None else None
        if img is not None:
            self.cover_url = self.INDEX + img['src']
    return getattr(self, 'cover_url', None)
# # wiadomosci tematyczne (redundancja z region/miasta):
# (u'Sport', u'http://www.pomorska.pl/sport.xml'),
# (u'Zdrowie', u'http://www.pomorska.pl/zdrowie.xml'),
# (u'Auto', u'http://www.pomorska.pl/moto.xml'),
# (u'Dom', u'http://www.pomorska.pl/dom.xml'),
# (u'Reporta\u017c', u'http://www.pomorska.pl/reportaz.xml'),
# (u'Gospodarka', u'http://www.pomorska.pl/gospodarka.xml')
]
def append_page(self, soup, appendtag):
tag = soup.find('span', attrs={'class':'photoNavigationPages'})
if tag:
number = int(tag.string.rpartition('/')[-1].replace('&nbsp;', ''))
baseurl = self.INDEX + soup.find(attrs={'class':'photoNavigationNext'})['href'][:-1]
keep_only_tags = [dict(name='div', attrs={'id':'article'})]
for r in appendtag.findAll(attrs={'class':'photoNavigation'}):
r.extract()
for nr in range(2, number+1):
soup2 = self.index_to_soup(baseurl + str(nr))
pagetext = soup2.find(id='photoContainer')
if pagetext:
pos = len(appendtag.contents)
appendtag.insert(pos, pagetext)
pagetext = soup2.find(attrs={'class':'photoMeta'})
if pagetext:
pos = len(appendtag.contents)
appendtag.insert(pos, pagetext)
pagetext = soup2.find(attrs={'class':'photoStoryText'})
if pagetext:
pos = len(appendtag.contents)
appendtag.insert(pos, pagetext)
remove_tags = [
dict(name='p', attrs={'id':'articleTags'}),
dict(name='div', attrs={'id':'articleEpaper'}),
dict(name='div', attrs={'id':'articleConnections'}),
dict(name='div', attrs={'class':'articleFacts'}),
dict(name='div', attrs={'id':'articleExternalLink'}),
dict(name='div', attrs={'id':'articleMultimedia'}),
dict(name='div', attrs={'id':'articleGalleries'}),
dict(name='div', attrs={'id':'articleAlarm'}),
dict(name='div', attrs={'id':'adholder_srodek1'}),
dict(name='div', attrs={'id':'articleVideo'}),
dict(name='a', attrs={'name':'fb_share'})]
extra_css = '''h1 { font-size: 1.4em; }
h2 { font-size: 1.0em; }'''
def preprocess_html(self, soup):
    """Append follow-up pages to the article body and return the soup.

    Delegates to ``self.append_page`` with the parsed document and its
    ``body`` element, then hands back the same ``soup`` object.
    """
    body = soup.body
    self.append_page(soup, body)
    return soup

View File

@ -6,7 +6,7 @@ class Gazeta_Wyborcza(BasicNewsRecipe):
title = u'Gazeta.pl'
__author__ = 'fenuks, Artur Stachecki'
language = 'pl'
description = 'news from gazeta.pl'
description = 'Wiadomości z Polski i ze świata. Serwisy tematyczne i lokalne w 20 miastach.'
category = 'newspaper'
publication_type = 'newspaper'
masthead_url = 'http://bi.gazeta.pl/im/5/10285/z10285445AA.jpg'

View File

@ -11,15 +11,14 @@ class Gram_pl(BasicNewsRecipe):
max_articles_per_feed = 100
ignore_duplicate_articles = {'title', 'url'}
no_stylesheets= True
remove_empty_feeds = True
#extra_css = 'h2 {font-style: italic; font-size:20px;} .picbox div {float: left;}'
cover_url=u'http://www.gram.pl/www/01/img/grampl_zima.png'
keep_only_tags= [dict(id='articleModule')]
remove_tags = [dict(attrs={'class':['breadCrump', 'dymek', 'articleFooter']})]
remove_tags = [dict(attrs={'class':['breadCrump', 'dymek', 'articleFooter', 'twitter-share-button']})]
feeds = [(u'Informacje', u'http://www.gram.pl/feed_news.asp'),
(u'Publikacje', u'http://www.gram.pl/feed_news.asp?type=articles'),
(u'Kolektyw- Indie Games', u'http://indie.gram.pl/feed/'),
#(u'Kolektyw- Moto Games', u'http://www.motogames.gram.pl/news.rss')
]
(u'Publikacje', u'http://www.gram.pl/feed_news.asp?type=articles')
]
def parse_feeds (self):
feeds = BasicNewsRecipe.parse_feeds(self)

View File

@ -1,20 +1,23 @@
import time
from calibre.web.feeds.recipes import BasicNewsRecipe
class GryOnlinePl(BasicNewsRecipe):
title = u'Gry-Online.pl'
__author__ = 'fenuks'
description = 'Gry-Online.pl - computer games'
description = u'Wiadomości o grach, recenzje, zapowiedzi. Encyklopedia Gier zawiera opisy gier na PC, konsole Xbox360, PS3 i inne platformy.'
category = 'games'
language = 'pl'
oldest_article = 13
INDEX= 'http://www.gry-online.pl/'
masthead_url='http://www.gry-online.pl/im/gry-online-logo.png'
cover_url='http://www.gry-online.pl/im/gry-online-logo.png'
INDEX = 'http://www.gry-online.pl/'
masthead_url = 'http://www.gry-online.pl/im/gry-online-logo.png'
cover_url = 'http://www.gry-online.pl/im/gry-online-logo.png'
max_articles_per_feed = 100
no_stylesheets= True
keep_only_tags=[dict(name='div', attrs={'class':['gc660', 'gc660 S013']})]
remove_tags=[dict({'class':['nav-social', 'add-info', 'smlb', 'lista lista3 lista-gry', 'S013po', 'S013-npb', 'zm_gfx_cnt_bottom', 'ocen-txt', 'wiecej-txt', 'wiecej-txt2']})]
feeds = [(u'Newsy', 'http://www.gry-online.pl/rss/news.xml'), ('Teksty', u'http://www.gry-online.pl/rss/teksty.xml')]
no_stylesheets = True
keep_only_tags = [dict(name='div', attrs={'class':['gc660', 'gc660 S013', 'news_endpage_tit', 'news_container', 'news']})]
remove_tags = [dict({'class':['nav-social', 'add-info', 'smlb', 'lista lista3 lista-gry', 'S013po', 'S013-npb', 'zm_gfx_cnt_bottom', 'ocen-txt', 'wiecej-txt', 'wiecej-txt2']})]
feeds = [
(u'Newsy', 'http://www.gry-online.pl/rss/news.xml'),
('Teksty', u'http://www.gry-online.pl/rss/teksty.xml')]
def append_page(self, soup, appendtag):
@ -24,7 +27,14 @@ class GryOnlinePl(BasicNewsRecipe):
url_part = soup.find('link', attrs={'rel':'canonical'})['href']
url_part = url_part[25:].rpartition('?')[0]
for nexturl in nexturls[1:-1]:
soup2 = self.index_to_soup('http://www.gry-online.pl/' + url_part + nexturl['href'])
finalurl = 'http://www.gry-online.pl/' + url_part + nexturl['href']
for i in range(10):
try:
soup2 = self.index_to_soup(finalurl)
break
except:
print 'retrying in 0.5s'
time.sleep(0.5)
pagetext = soup2.find(attrs={'class':'gc660'})
for r in pagetext.findAll(name='header'):
r.extract()
@ -34,7 +44,42 @@ class GryOnlinePl(BasicNewsRecipe):
appendtag.insert(pos, pagetext)
for r in appendtag.findAll(attrs={'class':['n5p', 'add-info', 'twitter-share-button', 'lista lista3 lista-gry']}):
r.extract()
else:
tag = appendtag.find('div', attrs={'class':'S018stronyr'})
if tag:
nexturl = tag.a
url_part = soup.find('link', attrs={'rel':'canonical'})['href']
url_part = url_part[25:].rpartition('?')[0]
while tag:
end = tag.find(attrs={'class':'right left-dead'})
if end:
break
else:
nexturl = tag.a
finalurl = 'http://www.gry-online.pl/' + url_part + nexturl['href']
for i in range(10):
try:
soup2 = self.index_to_soup(finalurl)
break
except:
print 'retrying in 0.5s'
time.sleep(0.5)
tag = soup2.find('div', attrs={'class':'S018stronyr'})
pagetext = soup2.find(attrs={'class':'gc660'})
for r in pagetext.findAll(name='header'):
r.extract()
for r in pagetext.findAll(attrs={'itemprop':'description'}):
r.extract()
pos = len(appendtag.contents)
appendtag.insert(pos, pagetext)
for r in appendtag.findAll(attrs={'class':['n5p', 'add-info', 'twitter-share-button', 'lista lista3 lista-gry', 'S018strony']}):
r.extract()
def image_url_processor(self, baseurl, url):
    """Drop a leading ``..`` from an image URL.

    When ``url`` begins with two dots, its first two characters are removed
    and the remainder is returned; any other ``url`` is returned unchanged.
    ``baseurl`` is accepted but not used.
    """
    return url[2:] if url.startswith('..') else url
def preprocess_html(self, soup):
self.append_page(soup, soup.body)

Binary file not shown.

After

Width:  |  Height:  |  Size: 760 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 762 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.7 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 834 B

View File

@ -7,7 +7,7 @@ class Konflikty(BasicNewsRecipe):
__author__ = 'fenuks'
cover_url = 'http://www.konflikty.pl/images/tapety_logo.jpg'
language = 'pl'
description ='military news'
description = u'Zbiór ciekawych artykułów historycznych, militarnych oraz recenzji książek, gier i filmów. Najświeższe informacje o lotnictwie, wojskach lądowych i polityce.'
category='military, history'
oldest_article = 7
max_articles_per_feed = 100

View File

@ -7,7 +7,7 @@ class Kosmonauta(BasicNewsRecipe):
description = u'polskojęzyczny portal w całości dedykowany misjom kosmicznym i badaniom kosmosu.'
category = 'astronomy'
language = 'pl'
cover_url='http://bi.gazeta.pl/im/4/10393/z10393414X,Kosmonauta-net.jpg'
cover_url = 'http://bi.gazeta.pl/im/4/10393/z10393414X,Kosmonauta-net.jpg'
no_stylesheets = True
INDEX = 'http://www.kosmonauta.net'
oldest_article = 7
@ -24,6 +24,5 @@ class Kosmonauta(BasicNewsRecipe):
href = a['href']
if not href.startswith('http'):
a['href'] = self.INDEX + href
print '%%%%%%%%%%%%%%%%%%%%%%%%%', a['href']
return soup

View File

@ -3,7 +3,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
class Lomza(BasicNewsRecipe):
title = u'4Lomza'
__author__ = 'fenuks'
description = u'4Łomża - regional site'
description = u'Regionalny portal. Najświeższe informacje z regionu, kulturalne, sportowe. Ogłoszenia, baza biznesu, forum.'
cover_url = 'http://www.4lomza.pl/i/logo4lomza_m.jpg'
language = 'pl'
oldest_article = 15

View File

@ -7,7 +7,7 @@ class Mlody_technik(BasicNewsRecipe):
description = u'Młody technik'
category = 'science'
language = 'pl'
cover_url='http://science-everywhere.pl/wp-content/uploads/2011/10/mt12.jpg'
#cover_url = 'http://science-everywhere.pl/wp-content/uploads/2011/10/mt12.jpg'
no_stylesheets = True
preprocess_regexps = [(re.compile(r"<h4>Podobne</h4>", re.IGNORECASE), lambda m: '')]
oldest_article = 7
@ -18,10 +18,17 @@ class Mlody_technik(BasicNewsRecipe):
remove_tags = [dict(attrs={'class':'st-related-posts'})]
remove_tags_after = dict(attrs={'class':'entry-content clearfix'})
feeds = [(u'Wszystko', u'http://www.mt.com.pl/feed'),
(u'MT NEWS 24/7', u'http://www.mt.com.pl/kategoria/mt-newsy-24-7/feed'),
#(u'MT NEWS 24/7', u'http://www.mt.com.pl/kategoria/mt-newsy-24-7/feed'),
(u'Info zoom', u'http://www.mt.com.pl/kategoria/info-zoom/feed'),
(u'm.technik', u'http://www.mt.com.pl/kategoria/m-technik/feed'),
(u'Szkoła', u'http://www.mt.com.pl/kategoria/szkola-2/feed'),
(u'Na Warsztacie', u'http://www.mt.com.pl/kategoria/na-warsztacie/feed'),
(u'Z pasji do...', u'http://www.mt.com.pl/kategoria/z-pasji-do/feed'),
(u'MT testuje', u'http://www.mt.com.pl/kategoria/mt-testuje/feed')]
def get_cover_url(self):
    """Return the cover URL scraped from the mt.com.pl home page.

    Fetches http://www.mt.com.pl/ and looks up the first element whose class
    is ``xoxo``. When that element contains an ``<img>``, its ``src`` is
    stored in ``self.cover_url``. If either element is missing, nothing is
    assigned and whatever ``cover_url`` already holds (or None) is returned.
    """
    soup = self.index_to_soup('http://www.mt.com.pl/')
    tag = soup.find(attrs={'class':'xoxo'})
    # Guard the nested lookup as well: find() returns None when there is no
    # <img>, and subscripting None would abort the whole download.
    img = tag.find('img') if tag is not None else None
    if img is not None:
        self.cover_url = img['src']
    return getattr(self, 'cover_url', None)

View File

@ -9,8 +9,8 @@ class Niebezpiecznik_pl(BasicNewsRecipe):
oldest_article = 8
max_articles_per_feed = 100
no_stylesheets = True
cover_url =u'http://userlogos.org/files/logos/Karmody/niebezpiecznik_01.png'
remove_tags=[dict(name='div', attrs={'class':['sociable']}), dict(name='h4'), dict(attrs={'class':'similar-posts'})]
keep_only_tags= [dict(name='div', attrs={'class':['title', 'entry']})]
cover_url = u'http://userlogos.org/files/logos/Karmody/niebezpiecznik_01.png'
remove_tags = [dict(name='div', attrs={'class':['sociable']}), dict(name='h4'), dict(attrs={'class':'similar-posts'})]
keep_only_tags = [dict(name='div', attrs={'class':['title', 'entry']})]
feeds = [(u'Wiadomości', u'http://feeds.feedburner.com/niebezpiecznik/'),
('Blog', 'http://feeds.feedburner.com/niebezpiecznik/linkblog/')]

View File

@ -9,7 +9,7 @@ class Nowa_Fantastyka(BasicNewsRecipe):
__modified_by__ = 'zaslav'
language = 'pl'
encoding='latin2'
description ='site for fantasy readers'
description = u'Strona dla miłośników fantastyki'
category='fantasy'
masthead_url='http://farm5.static.flickr.com/4133/4956658792_7ba7fbf562.jpg'
#extra_css='.tytul {font-size: 20px;}' #not working

View File

@ -7,12 +7,12 @@ class PC_Foster(BasicNewsRecipe):
description = u'Vortal technologiczny: testy, recenzje sprzętu komputerowego i telefonów, nowinki hardware, programy i gry dla Windows. Podkręcanie, modding i Overclocking.'
category = 'IT'
language = 'pl'
masthead_url='http://pcfoster.pl/public/images/logo.png'
cover_url= 'http://pcfoster.pl/public/images/logo.png'
no_stylesheets= True
remove_empty_feeds= True
keep_only_tags= [dict(id=['news_details', 'review_details']), dict(attrs={'class':'pager more_top'})]
remove_tags=[dict(name='p', attrs={'class':'right'})]
masthead_url = 'http://pcfoster.pl/public/images/logo.png'
cover_url = 'http://pcfoster.pl/public/images/logo.png'
no_stylesheets = True
remove_empty_feeds = True
keep_only_tags = [dict(id=['news_details', 'review_details']), dict(attrs={'class':'pager more_top'})]
remove_tags = [dict(name='p', attrs={'class':'right'})]
feeds = [(u'G\u0142\xf3wny', u'http://pcfoster.pl/public/rss/main.xml')]

View File

@ -7,9 +7,11 @@ class PolskaTimes(BasicNewsRecipe):
language = 'pl'
masthead_url = 'http://s.polskatimes.pl/g/logo_naglowek/polska.gif?17'
oldest_article = 7
encoding = 'iso-8859-2'
max_articles_per_feed = 100
remove_emty_feeds= True
remove_empty_feeds = True
no_stylesheets = True
use_embedded_content = False
ignore_duplicate_articles = {'title', 'url'}
#preprocess_regexps = [(re.compile(ur'<b>Czytaj także:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur',<b>Czytaj też:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>Zobacz także:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<center><h4><a.*?</a></h4></center>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ TEŻ:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ WIĘCEJ:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>CZYTAJ TAKŻE:.*?</b>', re.DOTALL), lambda match: ''), (re.compile(ur'<b>\* CZYTAJ KONIECZNIE:.*', re.DOTALL), lambda match: '</body>'), (re.compile(ur'<b>Nasze serwisy:</b>.*', re.DOTALL), lambda match: '</body>') ]
remove_tags_after= dict(attrs={'src':'http://nm.dz.com.pl/dz.png'})

View File

@ -4,7 +4,7 @@ class SpidersWeb(BasicNewsRecipe):
title = u"Spider's Web"
oldest_article = 7
__author__ = 'fenuks'
description = u'Opinie i analizy na temat technologii'
description = u'Autorskie teksty popularnych blogerów, testy sprzętu i aplikacji, oraz wiele więcej.'
cover_url = 'http://www.spidersweb.pl/wp-content/themes/new_sw/images/spidersweb.png'
category = 'IT, WEB'
language = 'pl'

View File

@ -3,7 +3,7 @@ import re
class Tablety_pl(BasicNewsRecipe):
title = u'Tablety.pl'
__author__ = 'fenuks'
description = u'tablety.pl - latest tablet news'
description = u'Tablety, gry i aplikacje na tablety.'
masthead_url= 'http://www.tablety.pl/wp-content/themes/kolektyw/img/logo.png'
cover_url = 'http://www.tablety.pl/wp-content/themes/kolektyw/img/logo.png'
category = 'IT'

View File

@ -4,6 +4,7 @@ class tanuki(BasicNewsRecipe):
title = u'Tanuki'
oldest_article = 7
__author__ = 'fenuks'
description = u'Tanuki - portal o anime i mandze.'
category = 'anime, manga'
language = 'pl'
max_articles_per_feed = 100

View File

@ -8,8 +8,8 @@ class tvn24(BasicNewsRecipe):
description = u'Sport, Biznes, Gospodarka, Informacje, Wiadomości Zawsze aktualne wiadomości z Polski i ze świata'
category = 'news'
language = 'pl'
masthead_url= 'http://www.tvn24.pl/_d/topmenu/logo2.gif'
cover_url= 'http://www.tvn24.pl/_d/topmenu/logo2.gif'
#masthead_url= 'http://www.tvn24.pl/_d/topmenu/logo2.gif'
cover_url= 'http://www.qzdrowiu.pl/Upload/KnowQZdrowiu_PressOffice/TVN24_logo_575702b7-edce-4b6f-a41b-4395f9456f96_ff6d6ccf-528a-4b94-9e61-2fed727aba35.png'
extra_css= 'ul {list-style: none; padding: 0; margin: 0;} li {float: left;margin: 0 0.15em;}'
remove_empty_feeds = True
remove_javascript = True

View File

@ -3,7 +3,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
class Ubuntu_pl(BasicNewsRecipe):
title = u'UBUNTU.pl'
__author__ = 'fenuks'
description = 'UBUNTU.pl - polish ubuntu community site'
description = 'Polskie forum użytkowników Ubuntu Linux. Projekty, porady i dyskusje, gotowe rozwiązania problemów.'
masthead_url= 'http://ubuntu.pl/img/logo.jpg'
cover_url = 'http://ubuntu.pl/img/logo.jpg'
category = 'linux, IT'