diff --git a/recipes/cdrinfo_pl.recipe b/recipes/cdrinfo_pl.recipe
new file mode 100644
index 0000000000..2a8b3b9a2e
--- /dev/null
+++ b/recipes/cdrinfo_pl.recipe
@@ -0,0 +1,86 @@
+__license__ = 'GPL v3'
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import Comment
+
+
+class cdrinfo(BasicNewsRecipe):
+    """Recipe for CDRinfo.pl that merges the pages of multi-page articles."""
+
+    title = u'CDRinfo.pl'
+    __author__ = 'fenuks'
+    description = u'Serwis poświęcony archiwizacji danych. Testy i recenzje nagrywarek. Programy do nagrywania płyt. Dyski twarde, dyski SSD i serwery sieciowe NAS. Rankingi dyskow twardych, najszybsze dyski twarde, newsy, artykuły, testy, recenzje, porady, oprogramowanie. Zestawienie nagrywarek, najnowsze biosy do nagrywarek, programy dla dysków twardych.'
+    category = 'it, hardware'
+    #publication_type = ''
+    language = 'pl'
+    #encoding = ''
+    #extra_css = ''
+    cover_url = 'http://www.cdrinfo.pl/gfx/graph3/top.jpg'
+    #masthead_url = ''
+    use_embedded_content = False
+    oldest_article = 777
+    max_articles_per_feed = 100
+    no_stylesheets = True
+    remove_empty_feeds = True
+    remove_javascript = True
+    remove_attributes = ['style']
+    # Strip the netiquette notice, from its opening tag through the sentence
+    # that ends with the gravatar.com reference.
+    # NOTE(review): the markup parts of this pattern were lost from the source
+    # text; '<p[^>' and '</p>' are reconstructed from the surviving ']*?>', and
+    # '(?:</a>)?' tolerates a link wrapped around the URL - confirm on the site.
+    preprocess_regexps = [(re.compile(u'<p[^>]*?>Uprzejmie prosimy o przestrzeganie netykiety.+?www\.gravatar\.com(?:</a>)?\.</p>', re.DOTALL), lambda match: '')]
+    ignore_duplicate_articles = {'title', 'url'}
+
+    keep_only_tags = [dict(name='input', attrs={'name':'ref'}), dict(id='text')]
+    remove_tags = [dict(attrs={'class':['navigation', 'sociable']}), dict(name='hr'), dict(id='respond')]
+    remove_tags_after = dict(id='artnawigacja')
+    feeds = [(u'Wiadomości', 'http://feeds.feedburner.com/cdrinfo'), (u'Recenzje', 'http://www.cdrinfo.pl/rss/rss_recenzje.php'),
+             (u'Konsole', 'http://konsole.cdrinfo.pl/rss/rss_konsole_news.xml'),
+             (u'Pliki', 'http://www.cdrinfo.pl/rss/rss_pliki.xml')
+             ]
+
+    def preprocess_html(self, soup):
+        """Pull in follow-up pages when the article has a page navigator."""
+        if soup.find(id='artnawigacja'):
+            self.append_page(soup, soup.body)
+        return soup
+
+    def append_page(self, soup, appendtag):
+        """Append the text of every following article page to ``appendtag``.
+
+        ``soup`` is the first page; the link to the next page is read from the
+        right-aligned div inside the ``artnawigacja`` element of each page.
+        The navigator of the first page is removed at the end.
+        """
+        baseurl = 'http://cdrinfo.pl' + soup.find(name='input', attrs={'name':'ref'})['value'] + '/'
+        # Avoid a doubled slash when the "ref" value already ends with one.
+        if baseurl[-2] == '/':
+            baseurl = baseurl[:-1]
+        tag = soup.find(id='artnawigacja')
+        div = tag.find('div', attrs={'align':'right'})
+        while div:
+            # Try each page up to five times before giving up on it.
+            soup2 = None
+            for _attempt in range(5):
+                try:
+                    soup2 = self.index_to_soup(baseurl + div.a['href'])
+                    break
+                except Exception:
+                    continue
+            if soup2 is None:
+                # Every attempt failed: keep what was collected so far instead
+                # of reusing the previous page or hitting an unbound name.
+                break
+            tag2 = soup2.find(id='artnawigacja')
+            div = tag2.find('div', attrs={'align':'right'}) if tag2 is not None else None
+            pagetext = soup2.find(attrs={'class':'art'})
+            if pagetext is None:
+                continue
+            for comment in pagetext.findAll(text=lambda text: isinstance(text, Comment)):
+                comment.extract()
+            for css_class in ('star-rating', 'star-rating2'):
+                for r in soup2.findAll(attrs={'class': css_class}):
+                    r.extract()
+            appendtag.insert(len(appendtag.contents), pagetext)
+        tag.extract()
\ No newline at end of file
diff --git a/recipes/ekologia_pl.recipe b/recipes/ekologia_pl.recipe index e925ebad6f..c053e6d5bc 100644 --- a/recipes/ekologia_pl.recipe +++ b/recipes/ekologia_pl.recipe @@ -9,13 +9,15 @@ class EkologiaPl(BasicNewsRecipe): language = 'pl' cover_url = 'http://www.ekologia.pl/assets/images/logo/ekologia_pl_223x69.png' ignore_duplicate_articles = {'title', 'url'} - extra_css = 
'.title {font-size: 200%;} .imagePowiazane, .imgCon {float:left; margin-right:5px;}' + extra_css = '.title {font-size: 200%;} .imagePowiazane {float:left; margin-right:5px; width: 200px;}' oldest_article = 7 max_articles_per_feed = 100 no_stylesheets = True remove_empty_feeds = True + remove_javascript = True use_embedded_content = False remove_attrs = ['style'] + keep_only_tags = [dict(attrs={'class':'contentParent'})] remove_tags = [dict(attrs={'class':['ekoLogo', 'powrocArt', 'butonDrukuj', 'widget-social-buttons']})] feeds = [(u'Wiadomo\u015bci', u'http://www.ekologia.pl/rss/20,53,0'), (u'\u015arodowisko', u'http://www.ekologia.pl/rss/20,56,0'), (u'Styl \u017cycia', u'http://www.ekologia.pl/rss/20,55,0')] diff --git a/recipes/gazeta_pl_bydgoszcz.recipe b/recipes/gazeta_pl_bydgoszcz.recipe new file mode 100644 index 0000000000..c0e9b265a8 --- /dev/null +++ b/recipes/gazeta_pl_bydgoszcz.recipe @@ -0,0 +1,87 @@ +#!/usr/bin/env python + +__license__ = 'GPL v3' + +from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import Comment +import re +class gw_bydgoszcz(BasicNewsRecipe): + title = u'Gazeta Wyborcza Bydgoszcz' + __author__ = 'fenuks' + language = 'pl' + description = 'Wiadomości z Bydgoszczy na portalu Gazeta.pl.' + category = 'newspaper' + publication_type = 'newspaper' + masthead_url = 'http://bi.gazeta.pl/im/3/4089/m4089863.gif' + INDEX = 'http://bydgoszcz.gazeta.pl' + cover_url = 'http://bi.gazeta.pl/i/hp/hp2009/logo.gif' + remove_empty_feeds = True + oldest_article = 3 + max_articles_per_feed = 100 + remove_javascript = True + no_stylesheets = True + use_embedded_content = False + ignore_duplicate_articles = {'title', 'url'} + + #rules for gazeta.pl + preprocess_regexps = [(re.compile(u'Czytaj więcej.*', re.DOTALL), lambda m: '