__license__ = 'GPL v3'
__copyright__ = '2011 Attis , 2012 Tomasz Długosz '
__version__ = 'v. 0.1'

import re

from calibre.web.feeds.recipes import BasicNewsRecipe


class KopalniaWiedzy(BasicNewsRecipe):
    """News recipe for kopalniawiedzy.pl.

    Articles are taken from the RSS feeds listed in ``feeds``.  When an
    article page contains an ``<a class="next">`` pager link, the following
    pages are downloaded and spliced into the first page by
    ``preprocess_html`` / ``append_page``.
    """

    title = u'Kopalnia Wiedzy'
    publisher = u'Kopalnia Wiedzy'
    description = u'Ciekawostki ze świata nauki i techniki'
    encoding = 'utf-8'
    __author__ = 'Attis & Tomasz Długosz'
    language = 'pl'
    oldest_article = 7
    max_articles_per_feed = 100
    # Prefix that pager hrefs are appended to when following "next" links.
    INDEX = u'http://kopalniawiedzy.pl/'
    remove_javascript = True
    no_stylesheets = True

    remove_tags = [
        {'name': 'p', 'attrs': {'class': 'keywords'}},
        {'name': 'div', 'attrs': {'class': 'sexy-bookmarks sexy-bookmarks-bg-caring'}},
        {'name': 'div', 'attrs': {'class': 'article-time-and-cat'}},
        {'name': 'p', 'attrs': {'class': 'tags'}},
    ]
    remove_tags_after = dict(attrs={'class': 'ad-square'})
    keep_only_tags = [dict(name="div", attrs={'class': 'article-text text-small'})]
    extra_css = '.topimage {margin-top: 30px}'

    # NOTE(review): the pattern and replacement literals below look truncated
    # (probably HTML markup that was lost before this text was captured).
    # As written, the first pair is a no-op.  Confirm both pairs against the
    # upstream recipe before relying on them.
    preprocess_regexps = [
        (re.compile(u''),
            lambda match: ''),
        (re.compile(u'\n\n'),
            lambda match: ''),
    ]

    feeds = [
        (u'Biologia', u'http://kopalniawiedzy.pl/wiadomosci_biologia.rss'),
        (u'Medycyna', u'http://kopalniawiedzy.pl/wiadomosci_medycyna.rss'),
        (u'Psychologia', u'http://kopalniawiedzy.pl/wiadomosci_psychologia.rss'),
        (u'Technologie', u'http://kopalniawiedzy.pl/wiadomosci_technologie.rss'),
        (u'Ciekawostki', u'http://kopalniawiedzy.pl/wiadomosci_ciekawostki.rss'),
        (u'Artykuły', u'http://kopalniawiedzy.pl/artykuly.rss')
    ]

    def is_link_wanted(self, url, tag):
        """Return True only for links whose ``class`` is exactly ``next``.

        ``url`` is accepted for interface compatibility and is not inspected.
        Tags without a ``class`` attribute (or a missing tag) are rejected
        instead of raising ``KeyError``.
        """
        if tag is None:
            return False
        css_class = tag.get('class')
        # Some parser versions expose ``class`` as a list of tokens rather
        # than a single string; keep the exact-match semantics either way.
        if isinstance(css_class, (list, tuple)):
            return list(css_class) == ['next']
        return css_class == 'next'

    def remove_beyond(self, tag, next):
        """Detach everything that follows ``tag`` in the document.

        Starting at ``tag`` and walking up through its ancestors until the
        ``body`` element (or the top of the tree) is reached, every node
        reachable through the attribute named by ``next`` (for example
        ``'nextSibling'``) is extracted.  A ``tag`` of ``None`` is a no-op.
        """
        while tag is not None and getattr(tag, 'name', None) != 'body':
            sibling = getattr(tag, next)
            while sibling is not None:
                # Read the follower from the node itself *before* extracting
                # it, so each node is visited and detached exactly once.
                following = getattr(sibling, next)
                sibling.extract()
                sibling = following
            tag = tag.parent

    def append_page(self, soup, appendtag, position):
        """Fetch the page behind the ``next`` pager link and splice it in.

        Looks for ``<a class="next">`` in ``soup``.  If present, the linked
        page is downloaded, its article container is trimmed after the
        ``pages`` element, any further pages are appended recursively, and
        the container is inserted into ``appendtag`` at ``position``.
        Nothing happens when there is no pager link, no target to insert
        into, or no article container on the next page.
        """
        pager = soup.find('a', attrs={'class': 'next'})
        if pager is None or appendtag is None:
            return
        href = pager.get('href')
        if not href:
            return

        nexturl = self.INDEX + href
        soup2 = self.index_to_soup(nexturl)

        texttag = soup2.find('div', attrs={'id': 'articleContent'})
        if texttag is None:
            # keep_only_tags selects the article body by this class; fall
            # back to it so a missing id does not crash on None below.
            texttag = soup2.find('div', attrs={'class': 'article-text text-small'})
        if texttag is None:
            return

        tag = texttag.find(attrs={'class': 'pages'})
        self.remove_beyond(tag, 'nextSibling')

        # Later pages go at the end of this page's container.
        newpos = len(texttag.contents)
        self.append_page(soup2, texttag, newpos)

        appendtag.insert(position, texttag)

    def preprocess_html(self, soup):
        """Merge multi-page articles and drop pager / 'wykop' elements.

        Returns the same ``soup`` object after modifying it in place.
        """
        self.append_page(soup, soup.body, 3)

        for item in soup.findAll('div', attrs={'class': 'pages'}):
            item.extract()

        for item in soup.findAll('p', attrs={'class': 'wykop'}):
            item.extract()

        return soup