diff --git a/recipes/ciekawostki_historyczne.recipe b/recipes/ciekawostki_historyczne.recipe new file mode 100644 index 0000000000..7c5138196d --- /dev/null +++ b/recipes/ciekawostki_historyczne.recipe @@ -0,0 +1,48 @@ +from calibre.web.feeds.news import BasicNewsRecipe +import re +class Ciekawostki_Historyczne(BasicNewsRecipe): + title = u'Ciekawostki Historyczne' + oldest_article = 7 + __author__ = 'fenuks' + description = u'Serwis popularnonaukowy - odkrycia, kontrowersje, historia, ciekawostki, badania, ciekawostki z przeszłości.' + category = 'history' + language = 'pl' + masthead_url= 'http://ciekawostkihistoryczne.pl/wp-content/themes/Wordpress_Magazine/images/logo-ciekawostki-historyczne-male.jpg' + cover_url='http://ciekawostkihistoryczne.pl/wp-content/themes/Wordpress_Magazine/images/logo-ciekawostki-historyczne-male.jpg' + max_articles_per_feed = 100 + preprocess_regexps = [(re.compile(ur'Ten artykuł ma kilka stron.*?', re.DOTALL), lambda match: ''), (re.compile(ur'

Zobacz też:

.*?', re.DOTALL), lambda match: '')] + no_stylesheets=True + remove_empty_feeds=True + keep_only_tags=[dict(name='div', attrs={'class':'post'})] + remove_tags=[dict(id='singlepostinfo')] + feeds = [(u'Staro\u017cytno\u015b\u0107', u'http://ciekawostkihistoryczne.pl/tag/starozytnosc/feed/'), (u'\u015aredniowiecze', u'http://ciekawostkihistoryczne.pl/tag/sredniowiecze/feed/'), (u'Nowo\u017cytno\u015b\u0107', u'http://ciekawostkihistoryczne.pl/tag/nowozytnosc/feed/'), (u'XIX wiek', u'http://ciekawostkihistoryczne.pl/tag/xix-wiek/feed/'), (u'1914-1939', u'http://ciekawostkihistoryczne.pl/tag/1914-1939/feed/'), (u'1939-1945', u'http://ciekawostkihistoryczne.pl/tag/1939-1945/feed/'), (u'Powojnie (od 1945)', u'http://ciekawostkihistoryczne.pl/tag/powojnie/feed/'), (u'Recenzje', u'http://ciekawostkihistoryczne.pl/category/recenzje/feed/')] + + def append_page(self, soup, appendtag): + tag=soup.find(name='h7') + if tag: + if tag.br: + pass + elif tag.nextSibling.name=='p': + tag=tag.nextSibling + nexturl = tag.findAll('a') + for nextpage in nexturl: + tag.extract() + nextpage= nextpage['href'] + soup2 = self.index_to_soup(nextpage) + pagetext = soup2.find(name='div', attrs={'class':'post'}) + for r in pagetext.findAll('div', attrs={'id':'singlepostinfo'}): + r.extract() + for r in pagetext.findAll('div', attrs={'class':'wp-caption alignright'}): + r.extract() + for r in pagetext.findAll('h1'): + r.extract() + pagetext.find('h6').nextSibling.extract() + pagetext.find('h7').nextSibling.extract() + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + return soup + + \ No newline at end of file diff --git a/recipes/gameplay_pl.recipe b/recipes/gameplay_pl.recipe new file mode 100644 index 0000000000..f3384263d6 --- /dev/null +++ b/recipes/gameplay_pl.recipe @@ -0,0 +1,21 @@ +from calibre.web.feeds.news import BasicNewsRecipe +class Gameplay_pl(BasicNewsRecipe): + title = u'Gameplay.pl' + 
oldest_article = 7 + __author__ = 'fenuks' + description = u'gameplay.pl - serwis o naszych zainteresowaniach, grach, filmach, książkach, muzyce, fotografii i konsolach.' + category = 'games, movies, books, music' + language = 'pl' + masthead_url= 'http://gameplay.pl/img/gpy_top_logo.png' + cover_url= 'http://gameplay.pl/img/gpy_top_logo.png' + max_articles_per_feed = 100 + no_stylesheets= True + keep_only_tags=[dict(name='div', attrs={'class':['news_endpage_tit', 'news']})] + remove_tags=[dict(name='div', attrs={'class':['galeria', 'noedit center im']})] + feeds = [(u'Wiadomo\u015bci', u'http://gameplay.pl/rss/')] + + def image_url_processor(self, baseurl, url): + if 'http' not in url: + return 'http://gameplay.pl'+ url[2:] + else: + return url diff --git a/recipes/icons/ciekawostki_historyczne.png b/recipes/icons/ciekawostki_historyczne.png new file mode 100644 index 0000000000..fa0e2c0591 Binary files /dev/null and b/recipes/icons/ciekawostki_historyczne.png differ diff --git a/recipes/icons/gameplay_pl.png b/recipes/icons/gameplay_pl.png new file mode 100644 index 0000000000..1b7081f393 Binary files /dev/null and b/recipes/icons/gameplay_pl.png differ diff --git a/recipes/icons/in4_pl.png b/recipes/icons/in4_pl.png new file mode 100644 index 0000000000..b3351629f0 Binary files /dev/null and b/recipes/icons/in4_pl.png differ diff --git a/recipes/icons/informacje_usa.png b/recipes/icons/informacje_usa.png new file mode 100644 index 0000000000..4c30e3bcbc Binary files /dev/null and b/recipes/icons/informacje_usa.png differ diff --git a/recipes/icons/kresy_pl.png b/recipes/icons/kresy_pl.png new file mode 100644 index 0000000000..db8ef4efec Binary files /dev/null and b/recipes/icons/kresy_pl.png differ diff --git a/recipes/icons/oclab_pl.png b/recipes/icons/oclab_pl.png new file mode 100644 index 0000000000..45ecd2533e Binary files /dev/null and b/recipes/icons/oclab_pl.png differ diff --git a/recipes/icons/overclock_pl.png b/recipes/icons/overclock_pl.png new file 
mode 100644 index 0000000000..38c0b13bfe Binary files /dev/null and b/recipes/icons/overclock_pl.png differ diff --git a/recipes/icons/palmtop_pl.png b/recipes/icons/palmtop_pl.png new file mode 100644 index 0000000000..d711a41682 Binary files /dev/null and b/recipes/icons/palmtop_pl.png differ diff --git a/recipes/icons/pc_arena.png b/recipes/icons/pc_arena.png new file mode 100644 index 0000000000..10be204b36 Binary files /dev/null and b/recipes/icons/pc_arena.png differ diff --git a/recipes/icons/pc_centre_pl.png b/recipes/icons/pc_centre_pl.png new file mode 100644 index 0000000000..e2fbf1eefb Binary files /dev/null and b/recipes/icons/pc_centre_pl.png differ diff --git a/recipes/icons/pc_foster.png b/recipes/icons/pc_foster.png new file mode 100644 index 0000000000..433970bcc1 Binary files /dev/null and b/recipes/icons/pc_foster.png differ diff --git a/recipes/icons/polska_times.png b/recipes/icons/polska_times.png new file mode 100644 index 0000000000..f233f45518 Binary files /dev/null and b/recipes/icons/polska_times.png differ diff --git a/recipes/icons/pure_pc.png b/recipes/icons/pure_pc.png new file mode 100644 index 0000000000..e5e102eee7 Binary files /dev/null and b/recipes/icons/pure_pc.png differ diff --git a/recipes/icons/tanuki.png b/recipes/icons/tanuki.png new file mode 100644 index 0000000000..fe46d7e8dc Binary files /dev/null and b/recipes/icons/tanuki.png differ diff --git a/recipes/icons/tvn24.png b/recipes/icons/tvn24.png new file mode 100644 index 0000000000..864a6624ac Binary files /dev/null and b/recipes/icons/tvn24.png differ diff --git a/recipes/icons/webhosting_pl.png b/recipes/icons/webhosting_pl.png new file mode 100644 index 0000000000..0e11a3065e Binary files /dev/null and b/recipes/icons/webhosting_pl.png differ diff --git a/recipes/in4_pl.recipe b/recipes/in4_pl.recipe new file mode 100644 index 0000000000..16ad622b46 --- /dev/null +++ b/recipes/in4_pl.recipe @@ -0,0 +1,44 @@ +from calibre.web.feeds.news import BasicNewsRecipe 
+import re +class in4(BasicNewsRecipe): + title = u'IN4.pl' + oldest_article = 7 + max_articles_per_feed = 100 + __author__ = 'fenuks' + description = u'Serwis Informacyjny - Aktualnosci, recenzje' + category = 'IT' + language = 'pl' + #cover_url= 'http://www.in4.pl/recenzje/337/in4pl.jpg' + no_stylesheets = True + remove_empty_feeds = True + preprocess_regexps = [(re.compile(ur'

', re.DOTALL), lambda match: ''), (re.compile(ur'

Zobacz też:.*?

', re.DOTALL), lambda match: '')] + keep_only_tags=[dict(name='div', attrs={'class':'box box-single'})] + remove_tags_after= dict(attrs={'class':'tags'}) + remove_tags= [dict(attrs={'class':['postmetadata', 'tags', 'banner']}), dict(name='a', attrs={'title':['Drukuj', u'Wyślij']})] + feeds = [(u'Informacje', u'http://www.informacjeusa.com/feed/')] diff --git a/recipes/kresy_pl.recipe b/recipes/kresy_pl.recipe new file mode 100644 index 0000000000..3dfc2c057c --- /dev/null +++ b/recipes/kresy_pl.recipe @@ -0,0 +1,14 @@ +from calibre.web.feeds.news import BasicNewsRecipe +class Kresy(BasicNewsRecipe): + title = u'Kresy' + __author__ = 'fenuks' + description = u'portal społeczności kresowej' + language = 'pl' + masthead_url= 'http://www.kresy.pl/public/img/logo.png' + cover_url= 'http://www.kresy.pl/public/img/logo.png' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + keep_only_tags= [dict(id='artykul')] + remove_tags= [dict(attrs={'class':['twitter-share-button', 'likefbborder', 'tagi']})] + feeds = [(u'Wszystkie', u'http://www.kresy.pl/rss')] diff --git a/recipes/oclab_pl.recipe b/recipes/oclab_pl.recipe new file mode 100644 index 0000000000..b0df89ba72 --- /dev/null +++ b/recipes/oclab_pl.recipe @@ -0,0 +1,31 @@ +from calibre.web.feeds.news import BasicNewsRecipe +class OCLab(BasicNewsRecipe): + title = u'OCLab.pl' + oldest_article = 7 + max_articles_per_feed = 100 + __author__ = 'fenuks' + description = u'Portal OCLab.pl jest miejscem przyjaznym pasjonatom sprzętu komputerowego, w szczególności overclockerom, które będzie służyć im za aktualną bazę wiedzy o podkręcaniu komputera, źródło aktualnych informacji z rynku oraz opinii na temat sprzętu komputerowego.' 
+ category = 'IT' + language = 'pl' + cover_url= 'http://www.idealforum.ru/attachment.php?attachmentid=7963&d=1316008118' + no_stylesheets = True + keep_only_tags=[dict(id='main')] + remove_tags_after= dict(attrs={'class':'single-postmetadata'}) + remove_tags=[dict(attrs={'class':['single-postmetadata', 'pagebar']})] + feeds = [(u'Wpisy', u'http://oclab.pl/feed/')] + + + def append_page(self, soup, appendtag): + tag=soup.find(attrs={'class':'contentjumpddl'}) + if tag: + nexturl=tag.findAll('option') + for nextpage in nexturl[1:-1]: + soup2 = self.index_to_soup(nextpage['value']) + pagetext = soup2.find(attrs={'class':'single-entry'}) + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + for r in appendtag.findAll(attrs={'class':'post-nav-bottom-list'}): + r.extract() + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + return soup diff --git a/recipes/overclock_pl.recipe b/recipes/overclock_pl.recipe new file mode 100644 index 0000000000..d7f4c8093d --- /dev/null +++ b/recipes/overclock_pl.recipe @@ -0,0 +1,37 @@ +import re +from calibre.web.feeds.news import BasicNewsRecipe +class Overclock_pl(BasicNewsRecipe): + title = u'Overclock.pl' + oldest_article = 7 + max_articles_per_feed = 100 + __author__ = 'fenuks' + description = u'Vortal poświęcony tematyce hardware, kładący największy nacisk na podkręcanie / overclocking (włącznie z extreme) i chłodzenie / cooling (air cooling, water cooling, freon cooling, dry ice, liquid nitrogen).' + category = 'IT' + language = 'pl' + masthead_url='http://www.overclock.pl/gfx/logo_m.png' + cover_url='http://www.overclock.pl/gfx/logo_m.png' + no_stylesheets = True + remove_empty_feeds = True + preprocess_regexps = [(re.compile(ur'Komentarze do aktualności:.*?', re.DOTALL), lambda match: ''), (re.compile(ur'

Nawigacja

', re.DOTALL), lambda match: '') ] + keep_only_tags=[dict(name='div', attrs={'class':'news'}), dict(id='articleContent')] + remove_tags=[dict(name='span', attrs={'class':'info'}), dict(attrs={'class':'shareit'})] + feeds = [(u'Aktualno\u015bci', u'http://www.overclock.pl/rss.news.xml'), (u'Testy i recenzje', u'http://www.overclock.pl/rss.articles.xml')] + + + def append_page(self, soup, appendtag): + tag=soup.find(id='navigation') + if tag: + nexturl=tag.findAll('option') + tag.extract() + for nextpage in nexturl[2:]: + soup2 = self.index_to_soup(nextpage['value']) + pagetext = soup2.find(id='content') + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + rem=appendtag.find(attrs={'alt':'Pierwsza'}) + if rem: + rem.parent.extract() + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + return soup \ No newline at end of file diff --git a/recipes/palmtop_pl.recipe b/recipes/palmtop_pl.recipe new file mode 100644 index 0000000000..ace772e7e7 --- /dev/null +++ b/recipes/palmtop_pl.recipe @@ -0,0 +1,14 @@ +from calibre.web.feeds.news import BasicNewsRecipe +class palmtop_pl(BasicNewsRecipe): + title = u'Palmtop.pl' + __author__ = 'fenuks' + description = 'wortal technologii mobilnych' + category = 'mobile' + language = 'pl' + cover_url='http://cdn.g-point.biz/wp-content/themes/palmtop-new/images/header_palmtop_logo.png' + masthead_url='http://cdn.g-point.biz/wp-content/themes/palmtop-new/images/header_palmtop_logo.png' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + + feeds = [(u'Newsy', u'http://palmtop.pl/feed/atom/')] diff --git a/recipes/pc_arena.recipe b/recipes/pc_arena.recipe new file mode 100644 index 0000000000..faefeb25c0 --- /dev/null +++ b/recipes/pc_arena.recipe @@ -0,0 +1,31 @@ +from calibre.web.feeds.news import BasicNewsRecipe +class PC_Arena(BasicNewsRecipe): + title = u'PCArena' + oldest_article = 18300 + max_articles_per_feed = 100 + __author__ = 'fenuks' + description = u'Najnowsze 
informacje z branży IT - testy, recenzje, aktualności, rankingi, wywiady. Twoje źródło informacji o sprzęcie komputerowym.' + category = 'IT' + language = 'pl' + masthead_url='http://pcarena.pl/public/design/frontend/images/logo.gif' + cover_url= 'http://pcarena.pl/public/design/frontend/images/logo.gif' + no_stylesheets = True + keep_only_tags=[dict(attrs={'class':['artHeader', 'art']})] + remove_tags=[dict(attrs={'class':'pages'})] + feeds = [(u'Newsy', u'http://pcarena.pl/misc/rss/news'), (u'Artyku\u0142y', u'http://pcarena.pl/misc/rss/articles')] + + def append_page(self, soup, appendtag): + tag=soup.find(name='div', attrs={'class':'pagNum'}) + if tag: + nexturl=tag.findAll('a') + tag.extract() + for nextpage in nexturl[1:]: + nextpage= 'http://pcarena.pl' + nextpage['href'] + soup2 = self.index_to_soup(nextpage) + pagetext = soup2.find(attrs={'class':'artBody'}) + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + return soup \ No newline at end of file diff --git a/recipes/pc_centre_pl.recipe b/recipes/pc_centre_pl.recipe new file mode 100644 index 0000000000..68a17888ce --- /dev/null +++ b/recipes/pc_centre_pl.recipe @@ -0,0 +1,41 @@ +from calibre.web.feeds.news import BasicNewsRecipe +class PC_Centre(BasicNewsRecipe): + title = u'PC Centre' + oldest_article = 7 + max_articles_per_feed = 100 + __author__ = 'fenuks' + description = u'Portal komputerowy, a w nim: testy sprzętu komputerowego, recenzje gier i oprogramowania. a także opisy produktów związanych z komputerami.' 
+ category = 'IT' + language = 'pl' + masthead_url= 'http://pccentre.pl/views/images/logo.gif' + cover_url= 'http://pccentre.pl/views/images/logo.gif' + no_stylesheets = True + keep_only_tags= [dict(id='content')] + remove_tags=[dict(attrs={'class':['ikony r', 'list_of_content', 'dot accordion']}), dict(id='comments')] + feeds = [(u'Publikacje', u'http://pccentre.pl/backend.php?mode=a'), (u'Aktualno\u015bci', u'http://pccentre.pl/backend.php'), (u'Sprz\u0119t komputerowy', u'http://pccentre.pl/backend.php?mode=n&section=2'), (u'Oprogramowanie', u'http://pccentre.pl/backend.php?mode=n&section=3'), (u'Gry komputerowe i konsole', u'http://pccentre.pl/backend.php?mode=n&section=4'), (u'Internet', u'http://pccentre.pl/backend.php?mode=n&section=7'), (u'Bezpiecze\u0144stwo', u'http://pccentre.pl/backend.php?mode=n&section=5'), (u'Multimedia', u'http://pccentre.pl/backend.php?mode=n&section=6'), (u'Biznes', u'http://pccentre.pl/backend.php?mode=n&section=9')] + + + def append_page(self, soup, appendtag): + tag=soup.find(name='div', attrs={'class':'pages'}) + if tag: + nexturl=tag.findAll('a') + tag.extract() + for nextpage in nexturl[:-1]: + nextpage= 'http://pccentre.pl' + nextpage['href'] + soup2 = self.index_to_soup(nextpage) + pagetext = soup2.find(id='content') + rem=pagetext.findAll(attrs={'class':['subtitle', 'content_info', 'list_of_content', 'pages', 'social2', 'pcc_acc', 'pcc_acc_na']}) + for r in rem: + r.extract() + rem=pagetext.findAll(id='comments') + for r in rem: + r.extract() + rem=pagetext.findAll('h1') + for r in rem: + r.extract() + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + return soup \ No newline at end of file diff --git a/recipes/pc_foster.recipe b/recipes/pc_foster.recipe new file mode 100644 index 0000000000..ab8c2b66b1 --- /dev/null +++ b/recipes/pc_foster.recipe @@ -0,0 +1,35 @@ +from calibre.web.feeds.news import BasicNewsRecipe +class PC_Foster(BasicNewsRecipe): + title = 
u'PC Foster' + oldest_article = 7 + max_articles_per_feed = 100 + __author__ = 'fenuks' + description = u'Vortal technologiczny: testy, recenzje sprzętu komputerowego i telefonów, nowinki hardware, programy i gry dla Windows. Podkręcanie, modding i Overclocking.' + category = 'IT' + language = 'pl' + masthead_url='http://pcfoster.pl/public/images/logo.png' + cover_url= 'http://pcfoster.pl/public/images/logo.png' + no_stylesheets= True + remove_empty_feeds= True + keep_only_tags= [dict(id=['news_details', 'review_details']), dict(attrs={'class':'pager more_top'})] + remove_tags=[dict(name='p', attrs={'class':'right'})] + feeds = [(u'G\u0142\xf3wny', u'http://pcfoster.pl/public/rss/main.xml')] + + + def append_page(self, soup, appendtag): + nexturl= appendtag.find(attrs={'alt':u'Następna strona'}) + if nexturl: + appendtag.find(attrs={'class':'pager more_top'}).extract() + while nexturl: + nexturl='http://pcfoster.pl' + nexturl.parent['href'] + soup2 = self.index_to_soup(nexturl) + nexturl=soup2.find(attrs={'alt':u'Następna strona'}) + pagetext = soup2.find(attrs={'class':'content'}) + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + for r in appendtag.findAll(attrs={'class':'review_content double'}): + r.extract() + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + return soup \ No newline at end of file diff --git a/recipes/polska_times.recipe b/recipes/polska_times.recipe new file mode 100644 index 0000000000..4126576fe2 --- /dev/null +++ b/recipes/polska_times.recipe @@ -0,0 +1,81 @@ +from calibre.web.feeds.news import BasicNewsRecipe +import re +class Polska_times(BasicNewsRecipe): + title = u'Polska Times' + __author__ = 'fenuks' + description = u'Internetowe wydanie dziennika ogólnopolskiego Polska The Times. Najświeższe informacje: wydarzenia w kraju i na świecie, reportaże, poradniki, opinie.' 
+ category = 'newspaper' + language = 'pl' + masthead_url = 'http://s.polskatimes.pl/g/logo_naglowek/polska.gif?17' + oldest_article = 7 + max_articles_per_feed = 100 + remove_empty_feeds= True + no_stylesheets = True + preprocess_regexps = [(re.compile(ur'Czytaj także:.*?', re.DOTALL), lambda match: ''), (re.compile(ur',Czytaj też:.*?', re.DOTALL), lambda match: ''), (re.compile(ur'Zobacz także:.*?', re.DOTALL), lambda match: ''), (re.compile(ur'

', re.DOTALL), lambda match: ''), (re.compile(ur'CZYTAJ TEŻ:.*?', re.DOTALL), lambda match: ''), (re.compile(ur'CZYTAJ WIĘCEJ:.*?', re.DOTALL), lambda match: ''), (re.compile(ur'CZYTAJ TAKŻE:.*?', re.DOTALL), lambda match: ''), (re.compile(ur'\* CZYTAJ KONIECZNIE:.*', re.DOTALL), lambda match: ''), (re.compile(ur'Nasze serwisy:.*', re.DOTALL), lambda match: '') ] + keep_only_tags= [dict(id=['tytul-artykulu', 'kontent'])] + remove_tags_after= dict(id='material-tagi') + remove_tags=[dict(attrs={'id':'reklama_srodtekst_0'}), dict(attrs={'id':'material-tagi'}), dict(name='div', attrs={'class':'zakladki'}), dict(attrs={'title':u'CZYTAJ TAKŻE'}), dict(attrs={'id':'podobne'}), dict(name='a', attrs={'href':'http://www.dzienniklodzki.pl/newsletter'})] + feeds = [(u'Fakty', u'http://polskatimes.feedsportal.com/c/32980/f/533648/index.rss'), (u'Opinie', u'http://www.polskatimes.pl/rss/opinie.xml'), (u'Sport', u'http://polskatimes.feedsportal.com/c/32980/f/533649/index.rss'), (u'Pieni\u0105dze', u'http://polskatimes.feedsportal.com/c/32980/f/533657/index.rss'), (u'Twoje finanse', u'http://www.polskatimes.pl/rss/twojefinanse.xml'), (u'Kultura', u'http://polskatimes.feedsportal.com/c/32980/f/533650/index.rss'), (u'Dodatki', u'http://www.polskatimes.pl/rss/dodatki.xml')] + + def skip_ad_pages(self, soup): + if 'Advertisement' in soup.title: + nexturl=soup.find('a')['href'] + return self.index_to_soup(nexturl, raw=True) + + def append_page(self, soup, appendtag): + nexturl=soup.find(id='nastepna_strona') + while nexturl: + soup2= self.index_to_soup(nexturl['href']) + nexturl=soup2.find(id='nastepna_strona') + pagetext = soup2.find(id='tresc') + for dictionary in self.remove_tags: + v=pagetext.findAll(attrs=dictionary['attrs']) + for delete in v: + delete.extract() + for b in pagetext.findAll(name='b'): + if b.string: + if u'CZYTAJ TEŻ' in b.string or u'Czytaj także' in b.string or u'Czytaj też' in b.string or u'Zobacz także' in b.string: + b.extract() + for center in 
pagetext.findAll(name='center'): + if center.h4: + if center.h4.a: + center.extract() + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + for paginator in appendtag.findAll(attrs={'class':'stronicowanie'}): + paginator.extract() + + def image_article(self, soup, appendtag): + nexturl=soup.find('a', attrs={'class':'nastepna'}) + urls=[] + while nexturl: + if nexturl not in urls: + urls.append(nexturl) + else: + break + soup2= self.index_to_soup('http://www.polskatimes.pl/artykul/' + nexturl['href']) + nexturl=soup2.find('a', attrs={'class':'nastepna'}) + if nexturl in urls: + break; + pagetext = soup2.find(id='galeria-material') + pos = len(appendtag.contents) + appendtag.insert(pos, '
') + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + for rem in appendtag.findAll(attrs={'class':['galeriaNawigator', 'miniaturyPojemnik']}): + rem.extract() + for paginator in appendtag.findAll(attrs={'class':'stronicowanie'}): + paginator.extract() + + def preprocess_html(self, soup): + if soup.find('a', attrs={'class':'nastepna'}): + self.image_article(soup, soup.body) + elif soup.find(id='nastepna_strona'): + self.append_page(soup, soup.body) + return soup + + + def get_cover_url(self): + soup = self.index_to_soup('http://www.prasa24.pl/gazeta/metropolia-warszawska/') + self.cover_url=soup.find(id='pojemnik').img['src'] + return getattr(self, 'cover_url', self.cover_url) \ No newline at end of file diff --git a/recipes/pure_pc.recipe b/recipes/pure_pc.recipe new file mode 100644 index 0000000000..7a6c43bb7e --- /dev/null +++ b/recipes/pure_pc.recipe @@ -0,0 +1,33 @@ +from calibre.web.feeds.news import BasicNewsRecipe +class PurePC(BasicNewsRecipe): + title = u'PurePC' + oldest_article = 7 + max_articles_per_feed = 100 + __author__ = 'fenuks' + description = u'Artykuły, aktualności, sprzęt, forum, chłodzenie, modding, urządzenia mobilne - wszystko w jednym miejscu.' 
+ category = 'IT' + language = 'pl' + masthead_url= 'http://www.purepc.pl/themes/new/images/purepc.jpg' + cover_url= 'http://www.purepc.pl/themes/new/images/purepc.jpg' + no_stylesheets = True + keep_only_tags= [dict(id='content')] + remove_tags_after= dict(attrs={'class':'fivestar-widget'}) + remove_tags= [dict(id='navigator'), dict(attrs={'class':['box-tools', 'fivestar-widget', 'PageMenuList']})] + feeds = [(u'Wiadomo\u015bci', u'http://www.purepc.pl/node/feed')] + + + def append_page(self, soup, appendtag): + nexturl= appendtag.find(attrs={'class':'pager-next'}) + if nexturl: + while nexturl: + soup2 = self.index_to_soup('http://www.purepc.pl'+ nexturl.a['href']) + nexturl=soup2.find(attrs={'class':'pager-next'}) + pagetext = soup2.find(attrs={'class':'article'}) + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + for r in appendtag.findAll(attrs={'class':['PageMenuList', 'pager', 'fivestar-widget']}): + r.extract() + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + return soup \ No newline at end of file diff --git a/recipes/tanuki.recipe b/recipes/tanuki.recipe new file mode 100644 index 0000000000..666cb8aa77 --- /dev/null +++ b/recipes/tanuki.recipe @@ -0,0 +1,37 @@ +from calibre.web.feeds.news import BasicNewsRecipe +import re +class tanuki(BasicNewsRecipe): + title = u'Tanuki' + oldest_article = 7 + __author__ = 'fenuks' + category = 'anime, manga' + language = 'pl' + max_articles_per_feed = 100 + encoding='utf-8' + extra_css= 'ul {list-style: none; padding: 0; margin: 0;} .kadr{float: left;} .dwazdania {float: right;}' + preprocess_regexps = [(re.compile(ur'

', re.DOTALL), lambda match: ''), (re.compile(ur'
Zobacz jak ocenili
', re.DOTALL), lambda match: '')] + remove_empty_feeds= True + no_stylesheets = True + keep_only_tags=[dict(attrs={'class':['animename', 'storyname', 'nextarrow','sideinfov', 'sidelinfov', 'sideinfo', 'sidelinfo']}), dict(name='table', attrs={'summary':'Technikalia'}), dict(attrs={'class':['chaptername','copycat']}), dict(id='rightcolumn'), dict(attrs={'class':['headn_tt', 'subtable']})] + remove_tags=[dict(name='div', attrs={'class':'screen'}), dict(id='randomtoplist'), dict(attrs={'class':'note'})] + feeds = [(u'Anime', u'http://anime.tanuki.pl/rss_anime.xml'), (u'Manga', u'http://manga.tanuki.pl/rss_manga.xml'), (u'Tomiki', u'http://manga.tanuki.pl/rss_mangabooks.xml'), (u'Artyku\u0142y', u'http://czytelnia.tanuki.pl/rss_czytelnia_artykuly.xml'), (u'Opowiadania', u'http://czytelnia.tanuki.pl/rss_czytelnia.xml')] + + + def append_page(self, soup, appendtag): + nexturl= appendtag.find(attrs={'class':'nextarrow'}) + if nexturl: + while nexturl: + soup2 = self.index_to_soup('http://czytelnia.tanuki.pl'+ nexturl['href']) + nexturl=soup2.find(attrs={'class':'nextarrow'}) + pagetext = soup2.find(attrs={'class':['chaptername', 'copycat']}) + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + pagetext = soup2.find(attrs={'class':'copycat'}) + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + for r in appendtag.findAll(attrs={'class':'nextarrow'}): + r.extract() + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + return soup \ No newline at end of file diff --git a/recipes/tvn24.recipe b/recipes/tvn24.recipe new file mode 100644 index 0000000000..5d3791efb0 --- /dev/null +++ b/recipes/tvn24.recipe @@ -0,0 +1,24 @@ +from calibre.web.feeds.news import BasicNewsRecipe +class tvn24(BasicNewsRecipe): + title = u'TVN24' + oldest_article = 7 + max_articles_per_feed = 100 + __author__ = 'fenuks' + description = u'Sport, Biznes, Gospodarka, Informacje, Wiadomości Zawsze aktualne wiadomości z Polski i ze świata' + category = 
'news' + language = 'pl' + masthead_url= 'http://www.tvn24.pl/_d/topmenu/logo2.gif' + cover_url= 'http://www.tvn24.pl/_d/topmenu/logo2.gif' + extra_css= 'ul {list-style: none; padding: 0; margin: 0;} li {float: left;margin: 0 0.15em;}' + remove_empty_feeds = True + remove_javascript = True + no_stylesheets = True + keep_only_tags=[dict(id='tvn24_wiadomosci_detal'), dict(name='h1', attrs={'class':'standardHeader1'}), dict(attrs={'class':['date60m rd5', 'imageBackground fl rd7', 'contentFromCMS']})] + remove_tags_after= dict(name='div', attrs={'class':'socialBoxesBottom'}) + remove_tags=[dict(attrs={'class':['tagi_detal', 'socialBoxesBottom', 'twitterBox', 'commentsInfo', 'textSize', 'obj_ukrytydruk obj_ramka1_r', 'related newsNews align-right', 'box', 'newsUserList', 'watchMaterial text']})] + feeds = [(u'Najnowsze', u'http://www.tvn24.pl/najnowsze.xml'), (u'Polska', u'http://www.tvn24.pl/polska.xml'), (u'\u015awiat', u'http://www.tvn24.pl/swiat.xml'), (u'Sport', u'http://www.tvn24.pl/sport.xml'), (u'Biznes', u'http://www.tvn24.pl/biznes.xml'), (u'Meteo', u'http://www.tvn24.pl/meteo.xml'), (u'Micha\u0142ki', u'http://www.tvn24.pl/michalki.xml'), (u'Kultura', u'http://www.tvn24.pl/kultura.xml')] + + def preprocess_html(self, soup): + for item in soup.findAll(style=True): + del item['style'] + return soup diff --git a/recipes/webhosting_pl.recipe b/recipes/webhosting_pl.recipe new file mode 100644 index 0000000000..aeb98477f3 --- /dev/null +++ b/recipes/webhosting_pl.recipe @@ -0,0 +1,39 @@ +from calibre.web.feeds.news import BasicNewsRecipe +class webhosting_pl(BasicNewsRecipe): + title = u'Webhosting.pl' + __author__ = 'fenuks' + description = 'Webhosting.pl to pierwszy na polskim rynku serwis poruszający w szerokim aspekcie tematy związane z hostingiem, globalną Siecią i usługami internetowymi. Głównym celem przedsięwzięcia jest dostarczanie przydatnej i bogatej merytorycznie wiedzy osobom, które chcą tworzyć i efektywnie wykorzystywać współczesny Internet.' 
+ category = 'web' + language = 'pl' + cover_url='http://webhosting.pl/images/logo.png' + masthead_url='http://webhosting.pl/images/logo.png' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + remove_empty_feeds = True + #keep_only_tags= [dict(name='div', attrs={'class':'content_article'}), dict(attrs={'class':'paging'})] + #remove_tags=[dict(attrs={'class':['tags', 'wykop', 'facebook_button_count', 'article_bottom']})] + feeds = [(u'Newsy', u'http://webhosting.pl/feed/rss/an'), + (u'Artyku\u0142y', u'http://webhosting.pl/feed/rss/aa'), + (u'Software', u'http://webhosting.pl/feed/rss/n/12'), + (u'Internet', u'http://webhosting.pl/feed/rss/n/9'), + (u'Biznes', u'http://webhosting.pl/feed/rss/n/13'), + (u'Bezpiecze\u0144stwo', u'http://webhosting.pl/feed/rss/n/10'), + (u'Blogi', u'http://webhosting.pl/feed/rss/ab'), + (u'Programowanie', u'http://webhosting.pl/feed/rss/n/8'), + (u'Kursy', u'http://webhosting.pl/feed/rss/n/11'), + (u'Tips&Tricks', u'http://webhosting.pl/feed/rss/n/15'), + (u'Imprezy', u'http://webhosting.pl/feed/rss/n/22'), + (u'Wywiady', u'http://webhosting.pl/feed/rss/n/24'), + (u'Porady', u'http://webhosting.pl/feed/rss/n/3027'), + (u'Znalezione w sieci', u'http://webhosting.pl/feed/rss/n/6804'), + (u'Dev area', u'http://webhosting.pl/feed/rss/n/24504'), + (u"Webmaster's blog", u'http://webhosting.pl/feed/rss/n/29195'), + (u'Domeny', u'http://webhosting.pl/feed/rss/n/11513'), + (u'Praktyka', u'http://webhosting.pl/feed/rss/n/2'), + (u'Serwery', u'http://webhosting.pl/feed/rss/n/11514'), + (u'Inne', u'http://webhosting.pl/feed/rss/n/24811'), + (u'Marketing', u'http://webhosting.pl/feed/rss/n/11535')] + + def print_version(self, url): + return url.replace('webhosting.pl', 'webhosting.pl/print') \ No newline at end of file