diff --git a/recipes/astroflesz.recipe b/recipes/astroflesz.recipe index 11a56ec6b5..676aedfd3a 100644 --- a/recipes/astroflesz.recipe +++ b/recipes/astroflesz.recipe @@ -2,12 +2,12 @@ from calibre.web.feeds.news import BasicNewsRecipe class Astroflesz(BasicNewsRecipe): - title = u'Astroflesz' + title = u'Astroflesz' oldest_article = 7 - __author__ = 'fenuks' - description = u'astroflesz.pl - to portal poświęcony astronomii. Informuje zarówno o aktualnych wydarzeniach i odkryciach naukowych, jak również zapowiada ciekawe zjawiska astronomiczne' - category = 'astronomy' - language = 'pl' + __author__ = 'fenuks' + description = u'astroflesz.pl - to portal poświęcony astronomii. Informuje zarówno o aktualnych wydarzeniach i odkryciach naukowych, jak również zapowiada ciekawe zjawiska astronomiczne' + category = 'astronomy' + language = 'pl' cover_url = 'http://www.astroflesz.pl/templates/astroflesz/images/logo/logo.png' ignore_duplicate_articles = {'title', 'url'} max_articles_per_feed = 100 @@ -17,7 +17,7 @@ class Astroflesz(BasicNewsRecipe): keep_only_tags = [dict(id="k2Container")] remove_tags_after = dict(name='div', attrs={'class':'itemLinks'}) remove_tags = [dict(name='div', attrs={'class':['itemLinks', 'itemToolbar', 'itemRatingBlock']})] - feeds = [(u'Wszystkie', u'http://astroflesz.pl/?format=feed')] + feeds = [(u'Wszystkie', u'http://astroflesz.pl/?format=feed')] def postprocess_html(self, soup, first_fetch): t = soup.find(attrs={'class':'itemIntroText'}) diff --git a/recipes/badania_net.recipe b/recipes/badania_net.recipe index 01499f6369..c47e9b6f54 100644 --- a/recipes/badania_net.recipe +++ b/recipes/badania_net.recipe @@ -1,17 +1,20 @@ from calibre.web.feeds.news import BasicNewsRecipe +import re class BadaniaNet(BasicNewsRecipe): - title = u'badania.net' + title = u'badania.net' __author__ = 'fenuks' - description = u'chcesz wiedzieć więcej?' - category = 'science' - language = 'pl' + description = u'chcesz wiedzieć więcej?' 
+ category = 'science' + language = 'pl' cover_url = 'http://badania.net/wp-content/badanianet_green_transparent.png' + extra_css = '.alignleft {float:left; margin-right:5px;} .alignright {float:right; margin-left:5px;}' oldest_article = 7 max_articles_per_feed = 100 no_stylesheets = True + preprocess_regexps = [(re.compile(r"

Tekst sponsoruje

", re.IGNORECASE), lambda m: ''),] remove_empty_feeds = True use_embedded_content = False remove_tags = [dict(attrs={'class':['omc-flex-category', 'omc-comment-count', 'omc-single-tags']})] remove_tags_after = dict(attrs={'class':'omc-single-tags'}) keep_only_tags = [dict(id='omc-full-article')] - feeds = [(u'Psychologia', u'http://badania.net/category/psychologia/feed/'), (u'Technologie', u'http://badania.net/category/technologie/feed/'), (u'Biologia', u'http://badania.net/category/biologia/feed/'), (u'Chemia', u'http://badania.net/category/chemia/feed/'), (u'Zdrowie', u'http://badania.net/category/zdrowie/'), (u'Seks', u'http://badania.net/category/psychologia-ewolucyjna-tematyka-seks/feed/')] + feeds = [(u'Psychologia', u'http://badania.net/category/psychologia/feed/'), (u'Technologie', u'http://badania.net/category/technologie/feed/'), (u'Biologia', u'http://badania.net/category/biologia/feed/'), (u'Chemia', u'http://badania.net/category/chemia/feed/'), (u'Zdrowie', u'http://badania.net/category/zdrowie/'), (u'Seks', u'http://badania.net/category/psychologia-ewolucyjna-tematyka-seks/feed/')] \ No newline at end of file diff --git a/recipes/film_org_pl.recipe b/recipes/film_org_pl.recipe index fa0a69912b..4d4ba23e88 100644 --- a/recipes/film_org_pl.recipe +++ b/recipes/film_org_pl.recipe @@ -1,20 +1,54 @@ # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import Comment import re class FilmOrgPl(BasicNewsRecipe): - title = u'Film.org.pl' - __author__ = 'fenuks' - description = u"Recenzje, analizy, artykuły, rankingi - wszystko o filmie dla miłośników kina. Opisy efektów specjalnych, wersji reżyserskich, remake'ów, sequeli. No i forum filmowe. Jedne z największych w Polsce." 
- category = 'film' - language = 'pl' - extra_css = '.alignright {float:right; margin-left:5px;} .alignleft {float:left; margin-right:5px;}' + title = u'Film.org.pl' + __author__ = 'fenuks' + description = u"Recenzje, analizy, artykuły, rankingi - wszystko o filmie dla miłośników kina. Opisy efektów specjalnych, wersji reżyserskich, remake'ów, sequeli. No i forum filmowe. Jedne z największych w Polsce." + category = 'film' + language = 'pl' + extra_css = '.alignright {float:right; margin-left:5px;} .alignleft {float:left; margin-right:5px;} .recenzja-title {font-size: 150%; margin-top: 5px; margin-bottom: 5px;}' cover_url = 'http://film.org.pl/wp-content/themes/KMF/images/logo_kmf10.png' ignore_duplicate_articles = {'title', 'url'} oldest_article = 7 max_articles_per_feed = 100 no_stylesheets = True + remove_javascript = True remove_empty_feeds = True - use_embedded_content = True - preprocess_regexps = [(re.compile(ur'

Przeczytaj także:

.*', re.IGNORECASE|re.DOTALL), lambda m: ''), (re.compile(ur'
Artykuł
', re.IGNORECASE), lambda m: ''), (re.compile(ur'
Ludzie filmu
', re.IGNORECASE), lambda m: '')] - remove_tags = [dict(name='img', attrs={'alt':['Ludzie filmu', u'Artykuł']})] - feeds = [(u'Recenzje', u'http://film.org.pl/r/recenzje/feed/'), (u'Artyku\u0142', u'http://film.org.pl/a/artykul/feed/'), (u'Analiza', u'http://film.org.pl/a/analiza/feed/'), (u'Ranking', u'http://film.org.pl/a/ranking/feed/'), (u'Blog', u'http://film.org.pl/kmf/blog/feed/'), (u'Ludzie', u'http://film.org.pl/a/ludzie/feed/'), (u'Seriale', u'http://film.org.pl/a/seriale/feed/'), (u'Oceanarium', u'http://film.org.pl/a/ocenarium/feed/'), (u'VHS', u'http://film.org.pl/a/vhs-a/feed/')] + use_embedded_content = False + remove_attributes = ['style'] + preprocess_regexps = [(re.compile(ur'

Przeczytaj także:

.*', re.IGNORECASE|re.DOTALL), lambda m: ''), (re.compile(ur'', re.IGNORECASE|re.DOTALL), lambda m: ''), (re.compile(ur'
Artykuł
', re.IGNORECASE), lambda m: ''), (re.compile(ur'
Ludzie filmu
', re.IGNORECASE), lambda m: ''), (re.compile(ur'(
\s*?){2,}', re.IGNORECASE|re.DOTALL), lambda m: '')] + keep_only_tags = [dict(name=['h11', 'h16', 'h17']), dict(attrs={'class':'editor'})] + remove_tags_after = dict(id='comments') + remove_tags = [dict(name=['link', 'meta', 'style']), dict(name='img', attrs={'alt':['Ludzie filmu', u'Artykuł']}), dict(id='comments'), dict(attrs={'style':'border: 0pt none ; margin: 0pt; padding: 0pt;'}), dict(name='p', attrs={'class':'rating'}), dict(attrs={'layout':'button_count'})] + feeds = [(u'Recenzje', u'http://film.org.pl/r/recenzje/feed/'), (u'Artyku\u0142', u'http://film.org.pl/a/artykul/feed/'), (u'Analiza', u'http://film.org.pl/a/analiza/feed/'), (u'Ranking', u'http://film.org.pl/a/ranking/feed/'), (u'Blog', u'http://film.org.pl/kmf/blog/feed/'), (u'Ludzie', u'http://film.org.pl/a/ludzie/feed/'), (u'Seriale', u'http://film.org.pl/a/seriale/feed/'), (u'Oceanarium', u'http://film.org.pl/a/ocenarium/feed/'), (u'VHS', u'http://film.org.pl/a/vhs-a/feed/')] + + def append_page(self, soup, appendtag): + tag = soup.find('div', attrs={'class': 'pagelink'}) + if tag: + for nexturl in tag.findAll('a'): + url = nexturl['href'] + soup2 = self.index_to_soup(url) + pagetext = soup2.find(attrs={'class': 'editor'}) + comments = pagetext.findAll(text=lambda text:isinstance(text, Comment)) + for comment in comments: + comment.extract() + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + for r in appendtag.findAll(attrs={'class': 'pagelink'}): + r.extract() + for r in appendtag.findAll(attrs={'id': 'comments'}): + r.extract() + for r in appendtag.findAll(attrs={'style':'border: 0pt none ; margin: 0pt; padding: 0pt;'}): + r.extract() + for r in appendtag.findAll(attrs={'layout':'button_count'}): + r.extract() + + def preprocess_html(self, soup): + for c in soup.findAll('h11'): + c.name = 'h1' + self.append_page(soup, soup.body) + for r in soup.findAll('br'): + r.extract() + return soup \ No newline at end of file diff --git a/recipes/gram_pl.recipe b/recipes/gram_pl.recipe 
index baaac85492..67d18737f9 100644 --- a/recipes/gram_pl.recipe +++ b/recipes/gram_pl.recipe @@ -2,22 +2,22 @@ from calibre.web.feeds.news import BasicNewsRecipe from calibre.ebooks.BeautifulSoup import BeautifulSoup class Gram_pl(BasicNewsRecipe): - title = u'Gram.pl' - __author__ = 'fenuks' - description = u'Serwis społecznościowy o grach: recenzje, newsy, zapowiedzi, encyklopedia gier, forum. Gry PC, PS3, X360, PS Vita, sprzęt dla graczy.' - category = 'games' - language = 'pl' + title = u'Gram.pl' + __author__ = 'fenuks' + description = u'Serwis społecznościowy o grach: recenzje, newsy, zapowiedzi, encyklopedia gier, forum. Gry PC, PS3, X360, PS Vita, sprzęt dla graczy.' + category = 'games' + language = 'pl' oldest_article = 8 index='http://www.gram.pl' max_articles_per_feed = 100 ignore_duplicate_articles = {'title', 'url'} no_stylesheets= True remove_empty_feeds = True - #extra_css = 'h2 {font-style: italic; font-size:20px;} .picbox div {float: left;}' + #extra_css = 'h2 {font-style: italic; font-size:20px;} .picbox div {float: left;}' cover_url=u'http://www.gram.pl/www/01/img/grampl_zima.png' keep_only_tags= [dict(id='articleModule')] - remove_tags = [dict(attrs={'class':['breadCrump', 'dymek', 'articleFooter', 'twitter-share-button']})] - feeds = [(u'Informacje', u'http://www.gram.pl/feed_news.asp'), + remove_tags = [dict(attrs={'class':['breadCrump', 'dymek', 'articleFooter', 'twitter-share-button']}), dict(name='aside')] + feeds = [(u'Informacje', u'http://www.gram.pl/feed_news.asp'), (u'Publikacje', u'http://www.gram.pl/feed_news.asp?type=articles') ] @@ -46,4 +46,4 @@ class Gram_pl(BasicNewsRecipe): tag=soup.find(name='span', attrs={'class':'platforma'}) if tag: tag.name = 'p' - return soup + return soup \ No newline at end of file diff --git a/recipes/historia_pl.recipe b/recipes/historia_pl.recipe index 60554c0924..2402a3a64d 100644 --- a/recipes/historia_pl.recipe +++ b/recipes/historia_pl.recipe @@ -1,27 +1,22 @@ from calibre.web.feeds.news import 
BasicNewsRecipe class Historia_org_pl(BasicNewsRecipe): - title = u'Historia.org.pl' - __author__ = 'fenuks' - description = u'Artykuły dotyczące historii w układzie epok i tematów, forum. Najlepsza strona historii. Matura z historii i egzamin gimnazjalny z historii.' - cover_url = 'http://lh3.googleusercontent.com/_QeRQus12wGg/TOvHsZ2GN7I/AAAAAAAAD_o/LY1JZDnq7ro/logo5.jpg' - category = 'history' - language = 'pl' + title = u'Historia.org.pl' + __author__ = 'fenuks' + description = u'Artykuły dotyczące historii w układzie epok i tematów, forum. Najlepsza strona historii. Matura z historii i egzamin gimnazjalny z historii.' + cover_url = 'http://lh3.googleusercontent.com/_QeRQus12wGg/TOvHsZ2GN7I/AAAAAAAAD_o/LY1JZDnq7ro/logo5.jpg' + category = 'history' + language = 'pl' oldest_article = 8 + extra_css = 'img {float: left; margin-right: 10px;} .alignleft {float: left; margin-right: 10px;}' remove_empty_feeds= True no_stylesheets = True use_embedded_content = True max_articles_per_feed = 100 ignore_duplicate_articles = {'title', 'url'} - - - feeds = [(u'Wszystkie', u'http://historia.org.pl/feed/'), - (u'Wiadomości', u'http://historia.org.pl/Kategoria/wiadomosci/feed/'), - (u'Publikacje', u'http://historia.org.pl/Kategoria/artykuly/feed/'), - (u'Publicystyka', u'http://historia.org.pl/Kategoria/publicystyka/feed/'), - (u'Recenzje', u'http://historia.org.pl/Kategoria/recenzje/feed/'), - (u'Projekty', u'http://historia.org.pl/Kategoria/projekty/feed/'),] - - - def print_version(self, url): - return url + '?tmpl=component&print=1&layout=default&page=' \ No newline at end of file + feeds = [(u'Wszystkie', u'http://historia.org.pl/feed/'), + (u'Wiadomości', u'http://historia.org.pl/Kategoria/wiadomosci/feed/'), + (u'Publikacje', u'http://historia.org.pl/Kategoria/artykuly/feed/'), + (u'Publicystyka', u'http://historia.org.pl/Kategoria/publicystyka/feed/'), + (u'Recenzje', u'http://historia.org.pl/Kategoria/recenzje/feed/'), + (u'Projekty', 
u'http://historia.org.pl/Kategoria/projekty/feed/'),] \ No newline at end of file diff --git a/recipes/icons/sport_pl.png b/recipes/icons/sport_pl.png new file mode 100644 index 0000000000..83931be40d Binary files /dev/null and b/recipes/icons/sport_pl.png differ diff --git a/recipes/infra_pl.recipe b/recipes/infra_pl.recipe index 8c1915db15..09228c15d5 100644 --- a/recipes/infra_pl.recipe +++ b/recipes/infra_pl.recipe @@ -1,21 +1,20 @@ from calibre.web.feeds.news import BasicNewsRecipe class INFRA(BasicNewsRecipe): - title = u'INFRA' + title = u'INFRA' oldest_article = 7 max_articles_per_feed = 100 - __author__ = 'fenuks' - description = u'Serwis Informacyjny INFRA - UFO, Zjawiska Paranormalne, Duchy, Tajemnice świata.' - cover_url = 'http://npn.nazwa.pl/templates/ja_teline_ii/images/logo.jpg' - category = 'UFO' + __author__ = 'fenuks' + description = u'Serwis Informacyjny INFRA - UFO, Zjawiska Paranormalne, Duchy, Tajemnice świata.' + cover_url = 'http://i.imgur.com/j7hJT.jpg' + category = 'UFO' index='http://infra.org.pl' - language = 'pl' + language = 'pl' max_articles_per_feed = 100 - no_stylesheers=True - remove_tags_before=dict(name='h2', attrs={'class':'contentheading'}) - remove_tags_after=dict(attrs={'class':'pagenav'}) - remove_tags=[dict(attrs={'class':'pagenav'})] - feeds = [(u'Najnowsze wiadomo\u015bci', u'http://www.infra.org.pl/rss')] + remove_attrs = ['style'] + no_stylesheets = True + keep_only_tags = [dict(id='ja-current-content')] + feeds = [(u'Najnowsze wiadomo\u015bci', u'http://www.infra.org.pl/rss')] def preprocess_html(self, soup): for item in soup.findAll(style=True): @@ -23,4 +22,4 @@ class INFRA(BasicNewsRecipe): for a in soup('a'): if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']: a['href']=self.index + a['href'] - return soup + return soup \ No newline at end of file diff --git a/recipes/kdefamily_pl.recipe b/recipes/kdefamily_pl.recipe index 75f88b0f3d..df0aacc2df 100644 --- 
a/recipes/kdefamily_pl.recipe +++ b/recipes/kdefamily_pl.recipe @@ -1,14 +1,16 @@ +import re from calibre.web.feeds.news import BasicNewsRecipe class KDEFamilyPl(BasicNewsRecipe): - title = u'KDEFamily.pl' - __author__ = 'fenuks' - description = u'KDE w Polsce' - category = 'open source, KDE' - language = 'pl' + title = u'KDEFamily.pl' + __author__ = 'fenuks' + description = u'KDE w Polsce' + category = 'open source, KDE' + language = 'pl' cover_url = 'http://www.mykde.home.pl/kdefamily/wp-content/uploads/2012/07/logotype-e1341585198616.jpg' oldest_article = 7 max_articles_per_feed = 100 + preprocess_regexps = [(re.compile(r"Podobne wpisy.*", re.IGNORECASE|re.DOTALL), lambda m: '')] no_stylesheets = True use_embedded_content = True - feeds = [(u'Wszystko', u'http://kdefamily.pl/feed/')] + feeds = [(u'Wszystko', u'http://kdefamily.pl/feed/')] \ No newline at end of file diff --git a/recipes/konflikty_zbrojne.recipe b/recipes/konflikty_zbrojne.recipe index b29e7e243b..8d09c61a7a 100644 --- a/recipes/konflikty_zbrojne.recipe +++ b/recipes/konflikty_zbrojne.recipe @@ -3,10 +3,10 @@ from calibre.web.feeds.news import BasicNewsRecipe from calibre.ebooks.BeautifulSoup import BeautifulSoup class Konflikty(BasicNewsRecipe): - title = u'Konflikty Zbrojne' - __author__ = 'fenuks' - cover_url = 'http://www.konflikty.pl/images/tapety_logo.jpg' - language = 'pl' + title = u'Konflikty Zbrojne' + __author__ = 'fenuks' + cover_url = 'http://www.konflikty.pl/images/tapety_logo.jpg' + language = 'pl' description = u'Zbiór ciekawych artykułów historycznych, militarnych oraz recenzji książek, gier i filmów. Najświeższe informacje o lotnictwie, wojskach lądowych i polityce.' 
category='military, history' oldest_article = 7 @@ -14,19 +14,20 @@ class Konflikty(BasicNewsRecipe): no_stylesheets = True keep_only_tags=[dict(attrs={'class':['title1', 'image']}), dict(id='body')] - feeds = [(u'Aktualności', u'http://www.konflikty.pl/rss_aktualnosci_10.xml'), - (u'Historia', u'http://www.konflikty.pl/rss_historia_10.xml'), - (u'Militaria', u'http://www.konflikty.pl/rss_militaria_10.xml'), - (u'Relacje', u'http://www.konflikty.pl/rss_relacje_10.xml'), - (u'Recenzje', u'http://www.konflikty.pl/rss_recenzje_10.xml'), - (u'Teksty źródłowe', u'http://www.konflikty.pl/rss_tekstyzrodlowe_10.xml')] + feeds = [(u'Aktualności', u'http://www.konflikty.pl/rss_aktualnosci_10.xml'), + (u'Historia', u'http://www.konflikty.pl/rss_historia_10.xml'), + (u'Militaria', u'http://www.konflikty.pl/rss_militaria_10.xml'), + (u'Relacje', u'http://www.konflikty.pl/rss_relacje_10.xml'), + (u'Recenzje', u'http://www.konflikty.pl/rss_recenzje_10.xml'), + (u'Teksty źródłowe', u'http://www.konflikty.pl/rss_tekstyzrodlowe_10.xml')] def preprocess_html(self, soup): for item in soup.findAll(style=True): del item['style'] for image in soup.findAll(name='a', attrs={'class':'image'}): + image['style'] = 'width: 210px; float: left; margin-right:5px;' if image.img and image.img.has_key('alt'): image.name='div' pos = len(image.contents) image.insert(pos, BeautifulSoup('

'+image.img['alt']+'

')) - return soup + return soup \ No newline at end of file diff --git a/recipes/kosmonauta_pl.recipe b/recipes/kosmonauta_pl.recipe index 98628d667a..d943739832 100644 --- a/recipes/kosmonauta_pl.recipe +++ b/recipes/kosmonauta_pl.recipe @@ -2,12 +2,13 @@ from calibre.web.feeds.news import BasicNewsRecipe class Kosmonauta(BasicNewsRecipe): - title = u'Kosmonauta.net' - __author__ = 'fenuks' - description = u'polskojęzyczny portal w całości dedykowany misjom kosmicznym i badaniom kosmosu.' - category = 'astronomy' - language = 'pl' + title = u'Kosmonauta.net' + __author__ = 'fenuks' + description = u'polskojęzyczny portal w całości dedykowany misjom kosmicznym i badaniom kosmosu.' + category = 'astronomy' + language = 'pl' cover_url = 'http://bi.gazeta.pl/im/4/10393/z10393414X,Kosmonauta-net.jpg' + extra_css = '.thumbnail {float:left;margin-right:5px;}' no_stylesheets = True INDEX = 'http://www.kosmonauta.net' oldest_article = 7 @@ -16,9 +17,12 @@ class Kosmonauta(BasicNewsRecipe): remove_attributes = ['style'] max_articles_per_feed = 100 keep_only_tags = [dict(name='div', attrs={'class':'item-page'})] - remove_tags = [dict(attrs={'class':['article-tools clearfix', 'cedtag', 'nav clearfix', 'jwDisqusForm']})] + remove_tags = [dict(attrs={'class':['article-tools clearfix', 'cedtag', 'nav clearfix', 'jwDisqusForm']}), dict(attrs={'alt':['Poprzednia strona', 'Następna strona']})] remove_tags_after = dict(name='div', attrs={'class':'cedtag'}) - feeds = [(u'Kosmonauta.net', u'http://www.kosmonauta.net/?format=feed&type=atom')] + feeds = [(u'Kosmonauta.net', u'http://www.kosmonauta.net/?format=feed&type=atom')] + + def print_version(self, url): + return url + '?tmpl=component&print=1&layout=default&page=' def preprocess_html(self, soup): for a in soup.findAll(name='a'): @@ -26,5 +30,4 @@ class Kosmonauta(BasicNewsRecipe): href = a['href'] if not href.startswith('http'): a['href'] = self.INDEX + href - return soup - + return soup \ No newline at end of file diff --git 
a/recipes/mlody_technik_pl.recipe b/recipes/mlody_technik_pl.recipe index 4622e73909..dc46bc665d 100644 --- a/recipes/mlody_technik_pl.recipe +++ b/recipes/mlody_technik_pl.recipe @@ -2,13 +2,14 @@ import re from calibre.web.feeds.news import BasicNewsRecipe class Mlody_technik(BasicNewsRecipe): - title = u'Młody technik' - __author__ = 'fenuks' - description = u'Młody technik' - category = 'science' - language = 'pl' + title = u'Młody technik' + __author__ = 'fenuks' + description = u'Młody technik' + category = 'science' + language = 'pl' #cover_url = 'http://science-everywhere.pl/wp-content/uploads/2011/10/mt12.jpg' no_stylesheets = True + extra_css = 'img.alignleft {float: left; margin-right: 5px;}' preprocess_regexps = [(re.compile(r"

Podobne

", re.IGNORECASE), lambda m: '')] oldest_article = 7 max_articles_per_feed = 100 @@ -17,18 +18,18 @@ class Mlody_technik(BasicNewsRecipe): keep_only_tags = [dict(id='content')] remove_tags = [dict(attrs={'class':'st-related-posts'})] remove_tags_after = dict(attrs={'class':'entry-content clearfix'}) - feeds = [(u'Wszystko', u'http://www.mt.com.pl/feed'), - #(u'MT NEWS 24/7', u'http://www.mt.com.pl/kategoria/mt-newsy-24-7/feed'), - (u'Info zoom', u'http://www.mt.com.pl/kategoria/info-zoom/feed'), - (u'm.technik', u'http://www.mt.com.pl/kategoria/m-technik/feed'), - (u'Szkoła', u'http://www.mt.com.pl/kategoria/szkola-2/feed'), - (u'Na Warsztacie', u'http://www.mt.com.pl/kategoria/na-warsztacie/feed'), - (u'Z pasji do...', u'http://www.mt.com.pl/kategoria/z-pasji-do/feed'), - (u'MT testuje', u'http://www.mt.com.pl/kategoria/mt-testuje/feed')] + feeds = [(u'Wszystko', u'http://www.mt.com.pl/feed'), + #(u'MT NEWS 24/7', u'http://www.mt.com.pl/kategoria/mt-newsy-24-7/feed'), + (u'Info zoom', u'http://www.mt.com.pl/kategoria/info-zoom/feed'), + (u'm.technik', u'http://www.mt.com.pl/kategoria/m-technik/feed'), + (u'Szkoła', u'http://www.mt.com.pl/kategoria/szkola-2/feed'), + (u'Na Warsztacie', u'http://www.mt.com.pl/kategoria/na-warsztacie/feed'), + (u'Z pasji do...', u'http://www.mt.com.pl/kategoria/z-pasji-do/feed'), + (u'MT testuje', u'http://www.mt.com.pl/kategoria/mt-testuje/feed')] def get_cover_url(self): soup = self.index_to_soup('http://www.mt.com.pl/') tag = soup.find(attrs={'class':'xoxo'}) if tag: self.cover_url = tag.find('img')['src'] - return getattr(self, 'cover_url', self.cover_url) + return getattr(self, 'cover_url', self.cover_url) \ No newline at end of file diff --git a/recipes/nauka_w_polsce.recipe b/recipes/nauka_w_polsce.recipe index c524c18b26..715780d162 100644 --- a/recipes/nauka_w_polsce.recipe +++ b/recipes/nauka_w_polsce.recipe @@ -1,16 +1,18 @@ from calibre.web.feeds.news import BasicNewsRecipe import re class NaukawPolsce(BasicNewsRecipe): - 
title = u'Nauka w Polsce' - __author__ = 'fenuks' - description = u'Serwis Nauka w Polsce ma za zadanie popularyzację polskiej nauki. Można na nim znaleźć wiadomości takie jak: osiągnięcia polskich naukowców, wydarzenia na polskich uczelniach, osiągnięcia studentów, konkursy dla badaczy, staże i stypendia naukowe, wydarzenia w polskiej nauce, kalendarium wydarzeń w nauce, materiały wideo o nauce.' - category = 'science' - language = 'pl' + title = u'Nauka w Polsce' + __author__ = 'fenuks' + description = u'Serwis Nauka w Polsce ma za zadanie popularyzację polskiej nauki. Można na nim znaleźć wiadomości takie jak: osiągnięcia polskich naukowców, wydarzenia na polskich uczelniach, osiągnięcia studentów, konkursy dla badaczy, staże i stypendia naukowe, wydarzenia w polskiej nauce, kalendarium wydarzeń w nauce, materiały wideo o nauce.' + category = 'science' + language = 'pl' cover_url = 'http://www.naukawpolsce.pap.pl/Themes/Pap/images/logo-pl.gif' oldest_article = 7 max_articles_per_feed = 100 no_stylesheets = True remove_empty_feeds = True + extra_css = '.miniaturka {float: left; margin-right: 5px; max-width: 350px;} .miniaturka-dol-strony {display: inline-block; margin: 0 15px; width: 120px;}' + ignore_duplicate_articles = {'title', 'url'} index = 'http://www.naukawpolsce.pl' keep_only_tags = [dict(name='div', attrs={'class':'margines wiadomosc'})] remove_tags = [dict(name='div', attrs={'class':'tagi'})] @@ -23,8 +25,8 @@ class NaukawPolsce(BasicNewsRecipe): url = self.index + i.h1.a['href'] date = '' #i.span.string articles.append({'title' : title, - 'url' : url, - 'date' : date, + 'url' : url, + 'date' : date, 'description' : '' }) return articles @@ -44,4 +46,4 @@ class NaukawPolsce(BasicNewsRecipe): def preprocess_html(self, soup): for p in soup.findAll(name='p', text=re.compile(' ')): p.extract() - return soup + return soup \ No newline at end of file diff --git a/recipes/niebezpiecznik.recipe b/recipes/niebezpiecznik.recipe index 3b321772ec..135db6f4ba 
100644 --- a/recipes/niebezpiecznik.recipe +++ b/recipes/niebezpiecznik.recipe @@ -1,17 +1,19 @@ from calibre.web.feeds.news import BasicNewsRecipe class Niebezpiecznik_pl(BasicNewsRecipe): - title = u'Niebezpiecznik.pl' - __author__ = 'fenuks' - description = u'Niebezpiecznik.pl – o bezpieczeństwie i nie...' - category = 'hacking, IT' - language = 'pl' + title = u'Niebezpiecznik.pl' + __author__ = 'fenuks' + description = u'Niebezpiecznik.pl – o bezpieczeństwie i nie...' + category = 'hacking, IT' + language = 'pl' oldest_article = 8 + extra_css = '.entry {margin-top: 25px;}' + remove_attrs = ['style'] max_articles_per_feed = 100 no_stylesheets = True remove_empty_feeds = True cover_url = u'http://userlogos.org/files/logos/Karmody/niebezpiecznik_01.png' remove_tags = [dict(name='div', attrs={'class':['sociable']}), dict(name='h4'), dict(attrs={'class':'similar-posts'})] keep_only_tags = [dict(name='div', attrs={'class':['title', 'entry']})] - feeds = [(u'Wiadomości', u'http://feeds.feedburner.com/niebezpiecznik/'), - ('Blog', 'http://feeds.feedburner.com/niebezpiecznik/linkblog/')] + feeds = [(u'Wiadomości', u'http://feeds.feedburner.com/niebezpiecznik/'), + ('Blog', 'http://feeds.feedburner.com/niebezpiecznik/linkblog/')] \ No newline at end of file diff --git a/recipes/osworld_pl.recipe b/recipes/osworld_pl.recipe index 7784a271e0..20cb546e35 100644 --- a/recipes/osworld_pl.recipe +++ b/recipes/osworld_pl.recipe @@ -1,11 +1,12 @@ from calibre.web.feeds.news import BasicNewsRecipe class OSWorld(BasicNewsRecipe): - title = u'OSWorld.pl' - __author__ = 'fenuks' - description = u'OSWorld.pl to serwis internetowy, dzięki któremu poznasz czym naprawdę jest Open Source. Serwis poświęcony jest wolnemu oprogramowaniu jak linux mint, centos czy ubunty. Znajdziecie u nasz artykuły, unity oraz informacje o certyfikatach CACert. OSWorld to mały świat wielkich systemów!' 
- category = 'OS, IT, open source, Linux' - language = 'pl' + title = u'OSWorld.pl' + __author__ = 'fenuks' + description = u'OSWorld.pl to serwis internetowy, dzięki któremu poznasz czym naprawdę jest Open Source. Serwis poświęcony jest wolnemu oprogramowaniu jak linux mint, centos czy ubunty. Znajdziecie u nasz artykuły, unity oraz informacje o certyfikatach CACert. OSWorld to mały świat wielkich systemów!' + category = 'OS, IT, open source, Linux' + language = 'pl' cover_url = 'http://osworld.pl/wp-content/uploads/osworld-kwadrat-128x111.png' + extra_css = 'img.alignleft {float: left; margin-right: 5px;}' oldest_article = 7 max_articles_per_feed = 100 no_stylesheets = True @@ -14,7 +15,7 @@ class OSWorld(BasicNewsRecipe): keep_only_tags = [dict(id=['dzial', 'posts'])] remove_tags = [dict(attrs={'class':'post-comments'})] remove_tags_after = dict(attrs={'class':'entry clr'}) - feeds = [(u'Artyku\u0142y', u'http://osworld.pl/category/artykuly/feed/'), (u'Nowe wersje', u'http://osworld.pl/category/nowe-wersje/feed/')] + feeds = [(u'Artyku\u0142y', u'http://osworld.pl/category/artykuly/feed/'), (u'Nowe wersje', u'http://osworld.pl/category/nowe-wersje/feed/')] def append_page(self, soup, appendtag): tag = appendtag.find(attrs={'id':'paginacja'}) @@ -30,4 +31,4 @@ class OSWorld(BasicNewsRecipe): def preprocess_html(self, soup): self.append_page(soup, soup.body) - return soup + return soup \ No newline at end of file diff --git a/recipes/pc_centre_pl.recipe b/recipes/pc_centre_pl.recipe index f4eccd70a0..6f113bfcc6 100644 --- a/recipes/pc_centre_pl.recipe +++ b/recipes/pc_centre_pl.recipe @@ -1,20 +1,21 @@ from calibre.web.feeds.news import BasicNewsRecipe class PC_Centre(BasicNewsRecipe): - title = u'PC Centre' + title = u'PC Centre' oldest_article = 7 max_articles_per_feed = 100 - __author__ = 'fenuks' - description = u'Portal komputerowy, a w nim: testy sprzętu komputerowego, recenzje gier i oprogramowania. a także opisy produktów związanych z komputerami.' 
- category = 'IT' - language = 'pl' + __author__ = 'fenuks' + description = u'Portal komputerowy, a w nim: testy sprzętu komputerowego, recenzje gier i oprogramowania. a także opisy produktów związanych z komputerami.' + category = 'IT' + language = 'pl' masthead_url= 'http://pccentre.pl/views/images/logo.gif' cover_url= 'http://pccentre.pl/views/images/logo.gif' no_stylesheets = True remove_empty_feeds = True + ignore_duplicate_articles = {'title', 'url'} #keep_only_tags= [dict(id='content')] #remove_tags=[dict(attrs={'class':['ikony r', 'list_of_content', 'dot accordion']}), dict(id='comments')] remove_tags=[dict(attrs={'class':'logo_print'})] - feeds = [(u'Aktualno\u015bci', u'http://pccentre.pl/backend.php'), (u'Publikacje', u'http://pccentre.pl/backend.php?mode=a'), (u'Sprz\u0119t komputerowy', u'http://pccentre.pl/backend.php?mode=n&section=2'), (u'Oprogramowanie', u'http://pccentre.pl/backend.php?mode=n&section=3'), (u'Gry komputerowe i konsole', u'http://pccentre.pl/backend.php?mode=n&section=4'), (u'Internet', u'http://pccentre.pl/backend.php?mode=n&section=7'), (u'Bezpiecze\u0144stwo', u'http://pccentre.pl/backend.php?mode=n&section=5'), (u'Multimedia', u'http://pccentre.pl/backend.php?mode=n&section=6'), (u'Biznes', u'http://pccentre.pl/backend.php?mode=n&section=9')] + feeds = [(u'Aktualno\u015bci', u'http://pccentre.pl/backend.php'), (u'Publikacje', u'http://pccentre.pl/backend.php?mode=a'), (u'Sprz\u0119t komputerowy', u'http://pccentre.pl/backend.php?mode=n&section=2'), (u'Oprogramowanie', u'http://pccentre.pl/backend.php?mode=n&section=3'), (u'Gry komputerowe i konsole', u'http://pccentre.pl/backend.php?mode=n&section=4'), (u'Internet', u'http://pccentre.pl/backend.php?mode=n&section=7'), (u'Bezpiecze\u0144stwo', u'http://pccentre.pl/backend.php?mode=n&section=5'), (u'Multimedia', u'http://pccentre.pl/backend.php?mode=n&section=6'), (u'Biznes', u'http://pccentre.pl/backend.php?mode=n&section=9')] def print_version(self, url): return url.replace('show', 'print') \ No newline at end of file diff --git 
a/recipes/sport_pl.recipe b/recipes/sport_pl.recipe new file mode 100644 index 0000000000..622a3675bd --- /dev/null +++ b/recipes/sport_pl.recipe @@ -0,0 +1,72 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' +__copyright__ = 'teepel 2012' + +''' +sport.pl +''' + +from calibre.web.feeds.news import BasicNewsRecipe +import re + +class sport_pl(BasicNewsRecipe): + title = 'Sport.pl' + __author__ = 'teepel ' + language = 'pl' + description =u'Największy portal sportowy w Polsce. Wiadomości sportowe z najważniejszych wydarzeń, relacje i wyniki meczów na żywo.' + masthead_url='http://press.gazeta.pl/file/mediakit/154509/c8/sportpl.jpg' + oldest_article = 1 + max_articles_per_feed = 100 + remove_javascript=True + no_stylesheets=True + remove_empty_feeds = True + + keep_only_tags =[] + keep_only_tags.append(dict(name = 'div', attrs = {'id' : 'article'})) + + remove_tags =[] + remove_tags.append(dict(name = 'a', attrs = {'href' : 'www.gazeta.pl'})) + + feeds = [ + (u'Wszystkie wiadomości', u'http://rss.gazeta.pl/pub/rss/sport.xml'), + (u'Piłka nożna', u'http://www.sport.pl/pub/rss/sport/pilka_nozna.htm'), + (u'F1', u'http://www.sport.pl/pub/rss/sportf1.htm'), + (u'Tenis', u'http://serwisy.gazeta.pl/pub/rss/tenis.htm'), + (u'Siatkówka', u'http://gazeta.pl.feedsportal.com/c/32739/f/611628/index.rss'), + (u'Koszykówka', u'http://gazeta.pl.feedsportal.com/c/32739/f/611647/index.rss'), + (u'Piłka ręczna', u'http://gazeta.pl.feedsportal.com/c/32739/f/611635/index.rss'), + (u'Inne sporty', u'http://gazeta.pl.feedsportal.com/c/32739/f/611649/index.rss'), + ] + def parse_feeds(self): + feeds = BasicNewsRecipe.parse_feeds(self) + for feed in feeds: + for article in feed.articles[:]: + if '[ZDJĘCIA]' in article.title: + article.title = article.title.replace('[ZDJĘCIA]','') + elif '[WIDEO]' in article.title: + article.title = article.title.replace('[WIDEO]','') + return feeds + + def print_version(self, url): + if 'feedsportal' in url: + segment = url.split('/') + urlPart = 
segment[-2] + urlPart = urlPart.replace('0L0Ssport0Bpl0C','') + urlPart = urlPart.replace('0C10H','/') + urlPart = urlPart.replace('0H',',') + urlPart = urlPart.replace('0I','_') + urlPart = urlPart.replace('A','') + segment1 = urlPart.split('/') + seg1 = segment1[0] + seg2 = segment1[1] + segment2 = seg2.split(',') + part = segment2[0] + ',' + segment2[1] + return 'http://www.sport.pl/' + seg1 + '/2029020,' + part + '.html' + else: + segment = url.split('/') + part2 = segment[-2] + part1 = segment[-1] + segment2 = part1.split(',') + part = segment2[1] + ',' + segment2[2] + return 'http://www.sport.pl/' + part2 + '/2029020,' + part + '.html' diff --git a/recipes/stopklatka.recipe b/recipes/stopklatka.recipe index ace28087c4..28c92a2453 100644 --- a/recipes/stopklatka.recipe +++ b/recipes/stopklatka.recipe @@ -1,25 +1,46 @@ +__license__ = 'GPL v3' from calibre.web.feeds.news import BasicNewsRecipe -import re -class Stopklatka(BasicNewsRecipe): - title = u'Stopklatka' - __author__ = 'fenuks' - description = u'Stopklatka.pl to najdłużej działający polski portal filmowy. Baza filmów, seriali i aktorów, repertuar kin, program tv, wydarzenia ze świata filmu' - category = 'movies' - language = 'pl' +from calibre.ebooks.BeautifulSoup import Comment +class Stopklatka_pl(BasicNewsRecipe): + title = u'Stopklatka.pl' + __author__ = 'fenuks' + description = u'Stopklatka.pl to najdłużej działający polski portal filmowy. 
Baza filmów, seriali i aktorów, repertuar kin, program tv, wydarzenia ze świata filmu' + category = 'movies' + language = 'pl' + encoding = 'utf-8' + extra_css = 'img {display: block;} ul {list-style-type: none;} li {display: inline;}' + cover_url = 'http://static1.stopklatka.pl/images/20/19/11501.jpg' + use_embedded_content = False oldest_article = 7 - masthead_url= 'http://img.stopklatka.pl/logo/logo-3.gif' - cover_url= 'http://img.stopklatka.pl/logo/logo-3.gif' max_articles_per_feed = 100 no_stylesheets = True - preprocess_regexps = [(re.compile(ur'Wersja internetowa dostępna jest pod adresem:.*', re.DOTALL), lambda match: ''), (re.compile(ur'', re.DOTALL), lambda match: '') ] remove_empty_feeds = True - remove_tags = [dict(name='img', attrs={'alt':'logo'})] - feeds = [(u'Wydarzenia', u'http://rss.stopklatka.pl/wydarzenia.rss')] + remove_javascript = True + remove_attributes = ['style', 'font'] + ignore_duplicate_articles = {'title', 'url'} - def print_version(self, url): - link_id = re.search(r'wi=(?P\d+)', url) - if link_id: - return 'http://www.stopklatka.pl/narzedzia/drukuj.asp?typ=wydarzenie&id=' + link_id.group('id') - else: - return url + keep_only_tags = [dict(attrs={'class':'asset-full-content default-asset-publisher show-asset-title'})] + remove_tags = [dict(attrs={'class':['metadata-entry metadata-tags', 'print-action', 'asset-flag', 'asset-ratings']}), dict(id='contest')] + #remove_tags_after = dict() + #remove_tags_before = dict() + feeds = [(u'Wiadomo\u015bci', u'http://stopklatka.pl/wiadomosci/-/asset_publisher/Hl7x4Ku4GpZj/rss?p_p_cacheability=cacheLevelPage'), (u'Artyku\u0142y', u'http://stopklatka.pl/artykuly/-/asset_publisher/pKhn5s0IxqSc/rss?p_p_cacheability=cacheLevelPage'), (u'Premiery i zapowiedzi', 
u'http://stopklatka.pl/premiery-i-zapowiedzi?p_p_id=eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL&p_p_lifecycle=2&p_p_state=normal&p_p_mode=view&p_p_cacheability=cacheLevelPage&p_p_col_id=column-1&p_p_col_pos=1&p_p_col_count=3&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_assetEntryIds=13393201&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_assetEntryIds=13760176&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_assetEntryIds=15238425&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_assetEntryIds=13470227&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_assetEntryIds=13913324&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_assetEntryIds=20234402&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_assetEntryIds=13917041&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_assetEntryIds=13905169&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_assetEntryIds=14253975&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_assetEntryIds=21586017&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_assetEntryIds=13540662&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_assetEntryIds=12999052&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_assetEntryIds=45280408&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_assetEntryIds=14826890&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_assetEntryIds=13459998&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_assetEntryIds=13070805&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_assetEntryIds=20209965&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_assetEntryIds=21741457&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_assetEntryIds=35577381&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_assetEntryIds=13530138&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F13392987%2Cmartwe-zlo&_eventsearch_WAR_eventse
archportlet_INSTANCE_FLRWmpE7H8IL_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F13760162%2Cuklad-zamkniety&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F15238403%2Cwszyscy-w-naszej-rodzinie&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F13470213%2Cdonoma&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F13913310%2Ccristiada&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F20234381%2Craj-wiara&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F13917027%2Cintruz&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F13905155%2Cspring-breakers&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F14253957%2Ckrudowie&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F21586004%2Cswieta-czworca&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F13540648%2Ckwartet&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F12999038%2Cimagine&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F45280404%2Cdom-na-kolkach&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F14826876%2Cg-i-joe-odwet&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F13459984%2Cnieobliczalni&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F13070591%2Csamotny-port-milosc&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F20209952%2Czanim-noc-nas-nie-rozdzieli&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_urls=http%3A%2F%2Fstopklatka.pl
%2F-%2F21741444%2Chemel&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F35577377%2Czywie-bielarus-&_eventsearch_WAR_eventsearchportlet_INSTANCE_FLRWmpE7H8IL_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F13530124%2Cpanaceum'), (u'Recenzje filmowe', u'http://stopklatka.pl/box-office/-/asset_publisher/3yxqotUEiqHJ/rss?p_p_cacheability=cacheLevelPage'), (u'Recenzje', u'http://stopklatka.pl/recenzje/-/asset_publisher/5oZ3s2J3L0tG/rss?p_p_cacheability=cacheLevelPage'), (u'Gwiazdy', u'http://stopklatka.pl/czerwony-dywan/-/asset_publisher/PqN7MDEGWGvh/rss?p_p_cacheability=cacheLevelPage'), (u'Wywiady Stopklatki', u'http://stopklatka.pl/wywiady/-/asset_publisher/uVh3OrZCaLd7/rss?p_p_cacheability=cacheLevelPage'), (u'Prosto z Hollywood', u'http://stopklatka.pl/wywiady-z-hollywood/-/asset_publisher/YsbU0JSoxb9G/rss?p_p_cacheability=cacheLevelPage'), (u'Plotki', u'http://stopklatka.pl/czerwony-dywan/-/asset_publisher/XuF8EGAkVeTa/rss?p_p_cacheability=cacheLevelPage'), (u'Box Office Polska', u'http://stopklatka.pl/box-office?p_p_id=eventsearch_WAR_eventsearchportlet_INSTANCE_Gqb98cI5dgSJ&p_p_lifecycle=2&p_p_state=normal&p_p_mode=view&p_p_cacheability=cacheLevelPage&p_p_col_id=column-1&p_p_col_pos=1&p_p_col_count=5&_eventsearch_WAR_eventsearchportlet_INSTANCE_Gqb98cI5dgSJ_assetEntryIds=47982267&_eventsearch_WAR_eventsearchportlet_INSTANCE_Gqb98cI5dgSJ_assetEntryIds=46685247&_eventsearch_WAR_eventsearchportlet_INSTANCE_Gqb98cI5dgSJ_assetEntryIds=45280313&_eventsearch_WAR_eventsearchportlet_INSTANCE_Gqb98cI5dgSJ_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F47982263%2Cbox-office-weekendowy-polska-15-03-2013-17-03-2013&_eventsearch_WAR_eventsearchportlet_INSTANCE_Gqb98cI5dgSJ_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F46685243%2Cbox-office-weekendowy-polska-08-03-2013-10-03-2013&_eventsearch_WAR_eventsearchportlet_INSTANCE_Gqb98cI5dgSJ_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F45280309%2Cbox-office-weekendowy-polska-01-03-2013-03-03-2013'), (u'Box Office 
USA', u'http://stopklatka.pl/box-office?p_p_id=eventsearch_WAR_eventsearchportlet_INSTANCE_24AXs0agMxJd&p_p_lifecycle=2&p_p_state=normal&p_p_mode=view&p_p_cacheability=cacheLevelPage&p_p_col_id=column-1&p_p_col_pos=2&p_p_col_count=5&_eventsearch_WAR_eventsearchportlet_INSTANCE_24AXs0agMxJd_assetEntryIds=49047234&_eventsearch_WAR_eventsearchportlet_INSTANCE_24AXs0agMxJd_assetEntryIds=48879358&_eventsearch_WAR_eventsearchportlet_INSTANCE_24AXs0agMxJd_assetEntryIds=47605057&_eventsearch_WAR_eventsearchportlet_INSTANCE_24AXs0agMxJd_assetEntryIds=47809980&_eventsearch_WAR_eventsearchportlet_INSTANCE_24AXs0agMxJd_assetEntryIds=46505246&_eventsearch_WAR_eventsearchportlet_INSTANCE_24AXs0agMxJd_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F49047230%2Cbox-office-weekendowy-stany-zjednoczone-22-03-2013-24-03-2013&_eventsearch_WAR_eventsearchportlet_INSTANCE_24AXs0agMxJd_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F48879354%2Cbox-office-weekendowy-stany-zjednoczone-22-03-2013-24-03-2013-estymacja-&_eventsearch_WAR_eventsearchportlet_INSTANCE_24AXs0agMxJd_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F47605053%2Cbox-office-weekendowy-stany-zjednoczone-15-03-2013-17-03-2013-estymacja-&_eventsearch_WAR_eventsearchportlet_INSTANCE_24AXs0agMxJd_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F47809976%2Cbox-office-weekendowy-stany-zjednoczone-15-03-2013-17-03-2013&_eventsearch_WAR_eventsearchportlet_INSTANCE_24AXs0agMxJd_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F46505242%2Cbox-office-weekendowy-stany-zjednoczone-08-03-2013-10-03-2013'), (u'Relacje', u'http://stopklatka.pl/czerwony-dywan/-/asset_publisher/IkgAkSFxLWV2/rss?p_p_cacheability=cacheLevelPage'), (u'Kalendarium imprez', 
u'http://stopklatka.pl/kalendarium-imprez?p_p_id=eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh&p_p_lifecycle=2&p_p_state=normal&p_p_mode=view&p_p_cacheability=cacheLevelPage&p_p_col_id=column-1&p_p_col_pos=1&p_p_col_count=3&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=47628974&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=47627805&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=45317244&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=48884855&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=47629292&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=48884742&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=35482058&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=47627893&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=35482076&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=47627838&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=48167620&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=35482067&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=47811744&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=35482049&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=47629615&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=45088670&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=47628531&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=35481950&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=35481496&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=35482022&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=19323743&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetE
ntryIds=47628034&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=47628064&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=45088819&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=35482031&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=35481415&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=35481977&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=19323617&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=35481932&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_assetEntryIds=35481995&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F47628970%2C4-festiwal-filmow-swiata-trzy-zywioly&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F47627801%2Cwielka-podroz-krudow&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F45317240%2C6-przeglad-kina-rosyjskiego-nowe-kino-rosyjskie-&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F48884851%2C2-1-nowy-cykl-spotkan-literatury-z-filmem&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F47629288%2C5-festiwal-polskich-filmow-krotkometrazowych-short-waves&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F48884738%2Cmoico-enjoy-movies-przeglad-filmow-klasy-b-we-wroclawiu&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F35482054%2C1-ogolnopolski-festiwal-polskiej-animacji-o-pla-2013-&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F47627889%2Cviii-festiwal-filmow-afrykanskich-afrykamera-2013&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F35482072%2C6-mie
dzynarodowy-festwial-kina-niezaleznego-off-plus-camera&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F47627834%2C11-przeglad-filmow-studenckich-z-lodzkiej-filmowki-lodzia-po-wisle-&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F48167616%2Cweze-2013&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F35482063%2Cxiv-festiwal-kina-amatorskiego-i-niezaleznego-kan&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F47811740%2Cv-festiwal-muzyki-filmowej-krzysztofa-komedy&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F35482045%2Ckonkurs-scenariuszowy-script-pro-2013&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F47629611%2C9-miedzynarodowy-festiwal-filmowy-%E2%80%9Ezydowskie-motywy%E2%80%9D&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F45088666%2C15-przeglad-filmowy-cieszyn-kino-na-granicy-&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F47628527%2Cdzien-filmowca-filmmaker-s-day&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F35481946%2C10-planete-doc-film-festival&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F35481492%2C66-miedzynarodowy-festiwal-filmowy-w-cannes&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F35482018%2C16-festiwal-filmow-kultowych&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F19323739%2C53-krakowski-festiwal-filmowy&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F47628030%2C4-festiwal-filmow-mlodziezowych-18&_eventsearch_WAR_eventsearchpor
tlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F47628060%2Cvii-superorbitalny-festiwal-filmow-amatorskich-soffa&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F45088815%2Cxv-miedzynarodowy-festiwal-filmow-przyrodniczych-im-wlodzimierza-puchalskiego&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F35482027%2C32-koszalinski-festiwal-debiutow-filmowych-mlodzi-i-film-&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F35481411%2C6-miedzynarodowy-festiwal-filmow-animowanych-animator-&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F35481973%2C13-miedzynarodowy-festiwal-filmowy-sopot-film-festival&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F19323613%2C13-miedzynarodowy-festiwal-filmowy-t-mobile-nowe-horyzonty&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F35481928%2C7-festiwal-filmu-i-sztuki-dwa-brzegi-w-kazimierzu-dolnym-i-janowcu&_eventsearch_WAR_eventsearchportlet_INSTANCE_rHUXlm2Y2veh_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F35481991%2C3-miedzynarodowy-festiwal-filmu-i-muzyki-transatlantyk'), (u'Konkursy', 
u'http://stopklatka.pl/konkursy?p_p_id=eventsearch_WAR_eventsearchportlet_INSTANCE_Yks1FKgVbrOA&p_p_lifecycle=2&p_p_state=normal&p_p_mode=view&p_p_cacheability=cacheLevelPage&p_p_col_id=column-1&p_p_col_pos=3&p_p_col_count=5&_eventsearch_WAR_eventsearchportlet_INSTANCE_Yks1FKgVbrOA_assetEntryIds=47091950&_eventsearch_WAR_eventsearchportlet_INSTANCE_Yks1FKgVbrOA_assetEntryIds=48879762&_eventsearch_WAR_eventsearchportlet_INSTANCE_Yks1FKgVbrOA_assetEntryIds=48880109&_eventsearch_WAR_eventsearchportlet_INSTANCE_Yks1FKgVbrOA_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F47091941%2Cksiazki-dwie-kobiety-&_eventsearch_WAR_eventsearchportlet_INSTANCE_Yks1FKgVbrOA_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F48879753%2Cdvd-rozmowy-noca-&_eventsearch_WAR_eventsearchportlet_INSTANCE_Yks1FKgVbrOA_urls=http%3A%2F%2Fstopklatka.pl%2F-%2F48880000%2Cdvd-milosc-'), (u'Komiks Stopklatki', u'http://stopklatka.pl/komiks/-/asset_publisher/pKhn5s0IxqSc/rss?p_p_cacheability=cacheLevelPage')] + def append_page(self, soup, appendtag): + tag = soup.find('a', attrs={'class': 'next'}) + if tag: + while tag: + url = tag['href'] + soup2 = self.index_to_soup(url) + tag = soup2.find('a', attrs={'class': 'next'}) + pagetext = soup2.find(attrs={'class': 'journal-content-article'}) + comments = pagetext.findAll(text=lambda text:isinstance(text, Comment)) + for comment in comments: + comment.extract() + pos = len(appendtag.contents) + appendtag.insert(pos, pagetext) + appendtag.find('a', attrs={'class': 'next'}).extract() + + + def preprocess_html(self, soup): + self.append_page(soup, soup.body) + return soup \ No newline at end of file diff --git a/recipes/tablety_pl.recipe b/recipes/tablety_pl.recipe index 97a44f81c7..c0d8b66c3d 100644 --- a/recipes/tablety_pl.recipe +++ b/recipes/tablety_pl.recipe @@ -1,18 +1,20 @@ from calibre.web.feeds.news import BasicNewsRecipe import re class Tablety_pl(BasicNewsRecipe): - title = u'Tablety.pl' - __author__ = 'fenuks' - description = u'Tablety, gry i aplikacje na tablety.' 
+ title = u'Tablety.pl' + __author__ = 'fenuks' + description = u'Tablety, gry i aplikacje na tablety.' masthead_url= 'http://www.tablety.pl/wp-content/themes/kolektyw/img/logo.png' - cover_url = 'http://www.tablety.pl/wp-content/themes/kolektyw/img/logo.png' - category = 'IT' - language = 'pl' - use_embedded_content=True + cover_url = 'http://www.tablety.pl/wp-content/themes/kolektyw/img/logo.png' + category = 'IT' + language = 'pl' + use_embedded_content = False + no_stylesheets = True oldest_article = 8 max_articles_per_feed = 100 preprocess_regexps = [(re.compile(ur'

Przeczytaj także.*?

', re.DOTALL), lambda match: ''), (re.compile(ur'

Przeczytaj koniecznie.*?

', re.DOTALL), lambda match: '')] + keep_only_tags = [dict(id='news_block')] #remove_tags_before=dict(name="h1", attrs={'class':'entry-title'}) #remove_tags_after=dict(name="footer", attrs={'class':'entry-footer clearfix'}) - #remove_tags=[dict(name='footer', attrs={'class':'entry-footer clearfix'}), dict(name='div', attrs={'class':'entry-comment-counter'})] - feeds = [(u'Najnowsze posty', u'http://www.tablety.pl/feed/')] + remove_tags=[dict(attrs={'class':['comments_icon', 'wp-polls', 'entry-comments']})] + feeds = [(u'Najnowsze posty', u'http://www.tablety.pl/feed/')] \ No newline at end of file diff --git a/recipes/wirtualnemedia_pl.recipe b/recipes/wirtualnemedia_pl.recipe index 155cafbec2..28278c2e24 100644 --- a/recipes/wirtualnemedia_pl.recipe +++ b/recipes/wirtualnemedia_pl.recipe @@ -1,21 +1,22 @@ from calibre.web.feeds.news import BasicNewsRecipe class WirtualneMedia(BasicNewsRecipe): - title = u'wirtualnemedia.pl' + title = u'wirtualnemedia.pl' oldest_article = 7 max_articles_per_feed = 100 no_stylesheets = True use_embedded_content = False remove_empty_feeds = True - __author__ = 'fenuks' + __author__ = 'fenuks' extra_css = '.thumbnail {float:left; max-width:150px; margin-right:5px;}' - description = u'Portal o mediach, reklamie, internecie, PR, telekomunikacji - nr 1 w Polsce - WirtualneMedia.pl - wiadomości z pierwszej ręki.' - category = 'internet' - language = 'pl' + description = u'Portal o mediach, reklamie, internecie, PR, telekomunikacji - nr 1 w Polsce - WirtualneMedia.pl - wiadomości z pierwszej ręki.' 
+ category = 'internet' + language = 'pl' + ignore_duplicate_articles = {'title', 'url'} masthead_url= 'http://i.wp.pl/a/f/jpeg/8654/wirtualnemedia.jpeg' cover_url= 'http://static.wirtualnemedia.pl/img/logo_wirtualnemedia_newsletter.gif' remove_tags=[dict(id=['header', 'footer'])] - feeds = [(u'Gospodarka', u'http://www.wirtualnemedia.pl/rss/wm_gospodarka.xml'), + feeds = [(u'Gospodarka', u'http://www.wirtualnemedia.pl/rss/wm_gospodarka.xml'), (u'Internet', u'http://www.wirtualnemedia.pl/rss/wm_internet.xml'), (u'Kultura', u'http://www.wirtualnemedia.pl/rss/wm_kulturarozrywka.xml'), (u'Badania', u'http://www.wirtualnemedia.pl/rss/wm_marketing.xml'), @@ -24,8 +25,6 @@ class WirtualneMedia(BasicNewsRecipe): (u'Reklama', u'http://www.wirtualnemedia.pl/rss/wm_reklama.xml'), (u'PR', u'http://www.wirtualnemedia.pl/rss/wm_relations.xml'), (u'Technologie', u'http://www.wirtualnemedia.pl/rss/wm_telekomunikacja.xml'), - (u'Telewizja', u'http://www.wirtualnemedia.pl/rss/wm_telewizja_rss.xml') - ] - + (u'Telewizja', u'http://www.wirtualnemedia.pl/rss/wm_telewizja_rss.xml')] def print_version(self, url): return url.replace('artykul', 'print')