# ---------------------------------------------------------------------------
# recipes/odkrywcy_pl.recipe  (new file in the patch)
# ---------------------------------------------------------------------------
__license__ = 'GPL v3'
import re
import datetime
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Comment


class Odkrywcy(BasicNewsRecipe):
    """calibre news recipe for odkrywcy.pl (``language = 'pl'``).

    The sections are not read from RSS: ``parse_index`` scrapes one category
    listing page per section, and ``preprocess_html`` stitches multi-page
    articles together by following the ``btnNext`` links.
    """

    title = u'Odkrywcy.pl'
    __author__ = 'fenuks'
    description = u''
    language = 'pl'
    extra_css = 'img {display: block;}'
    cover_url = ''
    # Site root; listing and pagination hrefs are appended to it verbatim.
    INDEX = 'http://odkrywcy.pl'
    use_embedded_content = False
    # Maximum article age in days, enforced by hand in find_articles().
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    remove_empty_feeds = True
    remove_javascript = True
    remove_attributes = ['style', 'font']
    ignore_duplicate_articles = {'title', 'url'}

    keep_only_tags = [dict(attrs={'class': 'content'})]
    remove_tags = [
        dict(name='a', attrs={'href': ['#opOpinie', '#opinie']}),
        dict(attrs={'class': ['fr', 'clra', 'close', 'wpsocial-fbFanpageBox',
                              'tagi', 'test']}),
        dict(id=['rekSrd05', 'moreTopNews']),
        dict(name='img', attrs={'class': 'zr'}),
        dict(name='img', attrs={'alt': u'Następne'}),
    ]
    remove_tags_after = dict(id='aTxt')
    # Placeholder kept from the original recipe; the real sections come from
    # parse_index() below.
    feeds = [(u'', '')]

    # (section title, listing path relative to INDEX). Note that the last
    # entry ends in "media.html" rather than "kategoria.html".
    _SECTIONS = (
        (u'Człowiek', '/kat,111396,name,Czlowiek,kategoria.html'),
        (u'Technologie', '/kat,111398,name,Technologie,kategoria.html'),
        (u'Ekologia', '/kat,111400,name,Ekologia,kategoria.html'),
        (u'Kosmos', '/kat,111402,name,Kosmos,kategoria.html'),
        (u'Cywilizacja', '/kat,111404,name,Cywilizacja,kategoria.html'),
        (u'Przyroda', '/kat,111406,name,Przyroda,kategoria.html'),
        (u'Fizyka i chemia', '/kat,111408,name,Fizyka,kategoria.html'),
        (u'Historia', '/kat,122994,name,Historia,kategoria.html'),
        (u'Media', '/kat,116794,name,Media,media.html'),
    )

    # Matches the "dodano: YYYY-MM-DD" stamp inside a teaser's <small> tag.
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    _ADDED_RE = re.compile(r'dodano: (\d{4}-\d{2}-\d{2})')

    # Attribute filters for elements stripped from the merged article once
    # all continuation pages have been appended (applied in this order).
    _PAGE_CLUTTER = (
        {'class': 'galStr'},
        {'alt': u'Następne'},
        {'alt': u'Poprzednie'},
        {'class': 'clra'},
        {'class': 'close'},
        {'class': 'tagi'},
        {'id': 'moreTopNews'},
    )

    def _teaser_date(self, teaser):
        """Return the ``datetime.date`` stamped on *teaser*, or None.

        None is returned when the teaser has no ``<small>`` tag, when the
        stamp does not match ``_ADDED_RE``, or when the matched digits are
        not a valid calendar date.
        """
        stamp = teaser.find('small')
        if stamp is None:
            return None
        found = self._ADDED_RE.search(self.tag_to_string(stamp))
        if found is None:
            return None
        try:
            parsed = datetime.datetime.strptime(found.group(1), '%Y-%m-%d')
        except ValueError:
            return None
        return parsed.date()

    def find_articles(self, url):
        """Collect the recent articles listed on one category page.

        :param url: absolute URL of a category listing page.
        :return: list of dicts with the keys ``title``, ``url``, ``date`` and
            ``description`` (the last two are always empty strings).

        Teasers older than ``oldest_article`` days are dropped. A teaser whose
        date cannot be read is kept rather than aborting the whole download;
        a teaser without a usable link is skipped.
        """
        articles = []
        today = datetime.date.today()
        soup = self.index_to_soup(url)
        for teaser in soup.findAll(attrs={'class': 'katZj clra'}):
            link = teaser.find('a')
            if link is None or not link.get('href'):
                continue
            published = self._teaser_date(teaser)
            if (published is not None
                    and (today - published).days > self.oldest_article):
                continue
            articles.append({
                # tag_to_string also copes with anchors that wrap nested
                # markup, where ``.string`` would be None.
                'title': self.tag_to_string(link),
                # NOTE(review): assumes listing hrefs are site-relative.
                'url': self.INDEX + link['href'],
                'date': '',
                'description': '',
            })
        return articles

    def parse_index(self):
        """Return ``[(section title, articles), ...]`` for every section."""
        return [(name, self.find_articles(self.INDEX + path))
                for name, path in self._SECTIONS]

    def append_page(self, soup, appendtag):
        """Append the continuation pages of an article to *appendtag*.

        Follows the ``a.btnNext`` link from *soup* page by page, moves every
        ``class="content"`` container of each fetched page (minus its ``<h1>``
        and HTML comments) to the end of *appendtag*, and finally strips the
        navigation/clutter elements listed in ``_PAGE_CLUTTER``.

        :param soup: parsed first page of the article.
        :param appendtag: tag that receives the extra pages; modified in place.
        """
        seen = set()
        next_link = soup.find('a', attrs={'class': 'btnNext'})
        while next_link is not None:
            href = next_link.get('href')
            # Stop on a link without a target or on one already followed
            # (guards against the pagination looping back on itself).
            if not href or href in seen:
                break
            seen.add(href)
            page = self.index_to_soup(self.INDEX + href)
            next_link = page.find(name='a', attrs={'class': 'btnNext'})
            for container in page.findAll(attrs={'class': 'content'}):
                header = container.find(name='h1')
                if header is not None:
                    header.extract()
                for comment in container.findAll(
                        text=lambda text: isinstance(text, Comment)):
                    comment.extract()
                appendtag.insert(len(appendtag.contents), container)
        # Runs even when there was no pagination, as in the original recipe.
        for attrs in self._PAGE_CLUTTER:
            for clutter in appendtag.findAll(attrs=attrs):
                clutter.extract()

    def preprocess_html(self, soup):
        """Merge a multi-page article into *soup* and return it."""
        body = soup.body
        if body is not None:
            self.append_page(soup, body)
        return soup


# ---------------------------------------------------------------------------
# recipes/znadplanszy_pl.recipe  (new file in the patch)
# ---------------------------------------------------------------------------
__license__ = 'GPL v3'
from calibre.web.feeds.news import BasicNewsRecipe


class ZnadPlanszy(BasicNewsRecipe):
    """calibre news recipe for znadplanszy.pl (``language = 'pl'``).

    Purely declarative: articles come from the single feed listed in
    ``feeds`` and are trimmed with the ``remove_tags*`` filters below.
    """

    title = u'ZnadPlanszy.pl'
    __author__ = 'fenuks'
    description = u''
    language = 'pl'
    cover_url = 'http://znadplanszy.pl/wp-content/uploads/2013/05/logo-znadplanszy.png'
    use_embedded_content = False
    oldest_article = 14
    max_articles_per_feed = 100
    no_stylesheets = True
    remove_empty_feeds = True
    remove_javascript = True
    remove_attributes = ['style', 'font']
    ignore_duplicate_articles = {'title', 'url'}

    remove_tags = [dict(attrs={'class': 'rounded-container'})]
    remove_tags_after = dict(attrs={'id': 'dotEPUBcontent'})
    remove_tags_before = dict(attrs={'class': 'content units nine alpha'})
    feeds = [(u'Wszystkie', 'http://znadplanszy.pl/full-feed/posts/')]