Updated Kopalnia Wiedzy

This commit is contained in:
Kovid Goyal 2011-09-09 02:56:25 -06:00
parent 7a97190b79
commit 06960ff7a6

View File

@@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2011, Attis <attis@attis.one.pl>' __copyright__ = '2011, Attis <attis@attis.one.pl>'
__version__ = 'v. 0.1' __version__ = 'v. 0.1'
@@ -16,21 +15,21 @@ class KopalniaWiedzy(BasicNewsRecipe):
oldest_article = 7 oldest_article = 7
max_articles_per_feed = 100 max_articles_per_feed = 100
INDEX = u'http://kopalniawiedzy.pl/' INDEX = u'http://kopalniawiedzy.pl/'
remove_javascript = True remove_javascript = True
no_stylesheets = True no_stylesheets = True
remove_tags = [{'name':'p', 'attrs': {'class': 'keywords'} }] remove_tags = [{'name':'p', 'attrs': {'class': 'keywords'} }, {'name':'div', 'attrs': {'class':'sexy-bookmarks sexy-bookmarks-bg-caring'}}]
remove_tags_after = dict(attrs={'class':'ad-square'}) remove_tags_after = dict(attrs={'class':'ad-square'})
keep_only_tags = [dict(name="div", attrs={'id':'articleContent'})] keep_only_tags = [dict(name="div", attrs={'id':'articleContent'})]
extra_css = '.topimage {margin-top: 30px}' extra_css = '.topimage {margin-top: 30px}'
preprocess_regexps = [ preprocess_regexps = [
(re.compile(u'<a .* rel="lightboxText" .*><img (.*)></a>'), (re.compile(u'<a .* rel="lightboxText" .*><img (.*)></a>'),
lambda match: '<img class="topimage" ' + match.group(1) + '>' ), lambda match: '<img class="topimage" ' + match.group(1) + '>' ),
(re.compile(u'<br /><br />'), (re.compile(u'<br /><br />'),
lambda match: '<br\/>') lambda match: '<br\/>')
] ]
feeds = [ feeds = [
(u'Biologia', u'http://kopalniawiedzy.pl/wiadomosci_biologia.rss'), (u'Biologia', u'http://kopalniawiedzy.pl/wiadomosci_biologia.rss'),
(u'Medycyna', u'http://kopalniawiedzy.pl/wiadomosci_medycyna.rss'), (u'Medycyna', u'http://kopalniawiedzy.pl/wiadomosci_medycyna.rss'),
@@ -39,10 +38,10 @@ class KopalniaWiedzy(BasicNewsRecipe):
(u'Ciekawostki', u'http://kopalniawiedzy.pl/wiadomosci_ciekawostki.rss'), (u'Ciekawostki', u'http://kopalniawiedzy.pl/wiadomosci_ciekawostki.rss'),
(u'Artykuły', u'http://kopalniawiedzy.pl/artykuly.rss') (u'Artykuły', u'http://kopalniawiedzy.pl/artykuly.rss')
] ]
def is_link_wanted(self, url, tag): def is_link_wanted(self, url, tag):
return tag['class'] == 'next' return tag['class'] == 'next'
def remove_beyond(self, tag, next): def remove_beyond(self, tag, next):
while tag is not None and getattr(tag, 'name', None) != 'body': while tag is not None and getattr(tag, 'name', None) != 'body':
after = getattr(tag, next) after = getattr(tag, next)
@@ -51,30 +50,30 @@ class KopalniaWiedzy(BasicNewsRecipe):
after.extract() after.extract()
after = ns after = ns
tag = tag.parent tag = tag.parent
def append_page(self, soup, appendtag, position): def append_page(self, soup, appendtag, position):
pager = soup.find('a',attrs={'class':'next'}) pager = soup.find('a',attrs={'class':'next'})
if pager: if pager:
nexturl = self.INDEX + pager['href'] nexturl = self.INDEX + pager['href']
soup2 = self.index_to_soup(nexturl) soup2 = self.index_to_soup(nexturl)
texttag = soup2.find('div', attrs={'id':'articleContent'}) texttag = soup2.find('div', attrs={'id':'articleContent'})
tag = texttag.find(attrs={'class':'pages'}) tag = texttag.find(attrs={'class':'pages'})
self.remove_beyond(tag, 'nextSibling') self.remove_beyond(tag, 'nextSibling')
newpos = len(texttag.contents) newpos = len(texttag.contents)
self.append_page(soup2,texttag,newpos) self.append_page(soup2,texttag,newpos)
appendtag.insert(position,texttag) appendtag.insert(position,texttag)
def preprocess_html(self, soup): def preprocess_html(self, soup):
self.append_page(soup, soup.body, 3) self.append_page(soup, soup.body, 3)
for item in soup.findAll('div',attrs={'class':'pages'}): for item in soup.findAll('div',attrs={'class':'pages'}):
item.extract() item.extract()
for item in soup.findAll('p', attrs={'class':'wykop'}): for item in soup.findAll('p', attrs={'class':'wykop'}):
item.extract() item.extract()
return soup return soup