Korespondent and Kopalnia Wiedzy by Attis

2025-07-09 03:04:10 -04:00 · 2011-02-05 13:37:42 -07:00 · 2011-02-05 13:37:42 -07:00 · d6d4f9d444
commit d6d4f9d444
parent bcb1b938e5
4 changed files with 120 additions and 0 deletions
--- a/resources/images/news/kopalniawiedzy.png
+++ b/resources/images/news/kopalniawiedzy.png
--- a/resources/images/news/korespondent.png
+++ b/resources/images/news/korespondent.png
--- a/resources/recipes/kopalniawiedzy.recipe
+++ b/resources/recipes/kopalniawiedzy.recipe
@ -0,0 +1,80 @@
 # -*- coding: utf-8 -*-
 __license__   = 'GPL v3'
 __copyright__ = '2011, Attis <attis@attis.one.pl>'
 __version__ = 'v. 0.1'
 import re
 from calibre.web.feeds.recipes import BasicNewsRecipe
 class KopalniaWiedzy(BasicNewsRecipe):
     """Recipe for the Polish popular-science site kopalniawiedzy.pl.

     Articles on the site can be split across several pages.  The extra
     pages are located through the ``a.next`` pager link and merged into
     the first page by ``append_page`` before clean-up in
     ``preprocess_html``.
     """

     title          = u'Kopalnia Wiedzy'
     publisher      = u'Kopalnia Wiedzy'
     description    = u'Ciekawostki ze świata nauki i techniki'
     encoding       = 'utf-8'
     __author__     = 'Attis'
     language       = 'pl'
     oldest_article = 7
     max_articles_per_feed = 100
     # Base URL; the href of a "next page" link is appended to it verbatim.
     INDEX          = u'http://kopalniawiedzy.pl/'
     remove_javascript     = True
     no_stylesheets        = True
     # Tag filters (semantics as defined by calibre's BasicNewsRecipe API).
     remove_tags    = [{'name': 'p', 'attrs': {'class': 'keywords'}}]
     remove_tags_after = dict(attrs={'class': 'ad-square'})
     keep_only_tags    = [dict(name="div", attrs={'id': 'articleContent'})]
     # Styles the images re-tagged by the first preprocess regexp below.
     extra_css      = '.topimage {margin-top: 30px}'
     preprocess_regexps = [
         # Replace a lightbox link wrapping an image with the bare image,
         # marked with the "topimage" class.
         (re.compile(u'<a .* rel="lightboxText" .*><img (.*)></a>'),
          lambda match: '<img class="topimage" ' + match.group(1) + '>'),
         # Collapse a doubled line break into a single one.  The replacement
         # must not contain a backslash: it is inserted into the HTML as-is.
         (re.compile(u'<br  /><br  />'),
          lambda match: '<br/>'),
     ]
     feeds = [
         (u'Biologia', u'http://kopalniawiedzy.pl/wiadomosci_biologia.rss'),
         (u'Medycyna', u'http://kopalniawiedzy.pl/wiadomosci_medycyna.rss'),
         (u'Psychologia', u'http://kopalniawiedzy.pl/wiadomosci_psychologia.rss'),
         (u'Technologie', u'http://kopalniawiedzy.pl/wiadomosci_technologie.rss'),
         (u'Ciekawostki', u'http://kopalniawiedzy.pl/wiadomosci_ciekawostki.rss'),
         (u'Artykuły', u'http://kopalniawiedzy.pl/artykuly.rss'),
     ]

     def is_link_wanted(self, url, tag):
         """Return True only for links whose ``class`` attribute is 'next'.

         ``url`` is unused.  A tag without a ``class`` attribute is simply
         not wanted instead of raising ``KeyError``.
         """
         return tag.get('class', None) == 'next'

     def remove_beyond(self, tag, next):
         """Extract every node that follows ``tag`` in direction ``next``.

         Starting at ``tag`` and climbing through its parents until a
         ``body`` element (or the top of the tree) is reached, all nodes
         reachable through the attribute named ``next`` are extracted.

         tag  -- node to start from; ``None`` makes the call a no-op.
         next -- name of the attribute to follow, e.g. 'nextSibling'.
         """
         while tag is not None and getattr(tag, 'name', None) != 'body':
             after = getattr(tag, next)
             while after is not None:
                 # Read the successor from the node being removed, before
                 # extract() detaches it from the tree.
                 following = getattr(after, next)
                 after.extract()
                 after = following
             tag = tag.parent

     def append_page(self, soup, appendtag, position):
         """Merge the follow-up pages of an article into ``appendtag``.

         Looks for an ``a.next`` pager link in ``soup``.  When present, the
         linked page is downloaded, trimmed after its ``pages`` element,
         recursively extended with its own follow-up pages and finally
         inserted into ``appendtag`` at index ``position``.

         soup      -- parsed page to search for the pager link.
         appendtag -- tag that receives the next page's article container.
         position  -- child index at which the container is inserted.
         """
         pager = soup.find('a', attrs={'class': 'next'})
         if not pager:
             return
         nexturl = self.INDEX + pager['href']
         soup2 = self.index_to_soup(nexturl)
         texttag = soup2.find('div', attrs={'id': 'articleContent'})
         if texttag is None:
             # Follow-up page has no article container: nothing to merge.
             return
         # remove_beyond() tolerates None when no pager block is present.
         tag = texttag.find(attrs={'class': 'pages'})
         self.remove_beyond(tag, 'nextSibling')
         newpos = len(texttag.contents)
         # Pull in any further pages before attaching this one.
         self.append_page(soup2, texttag, newpos)
         appendtag.insert(position, texttag)

     def preprocess_html(self, soup):
         """Join multi-page articles and strip pager / 'wykop' elements.

         Returns the same ``soup`` object after modifying it in place.
         """
         self.append_page(soup, soup.body, 3)
         for item in soup.findAll('div', attrs={'class': 'pages'}):
             item.extract()
         for item in soup.findAll('p', attrs={'class': 'wykop'}):
             item.extract()
         return soup
--- a/resources/recipes/korespondent.recipe
+++ b/resources/recipes/korespondent.recipe
@ -0,0 +1,40 @@
 # -*- coding: utf-8 -*-
 __license__   = 'GPL v3'
 __copyright__ = '2011, Attis <attis@attis.one.pl>'
 __version__ = 'v. 0.1'
 import re
 from calibre.web.feeds.recipes import BasicNewsRecipe
 class KorespondentPL(BasicNewsRecipe):
    """Recipe for the Korespondent.pl news service.

    Purely declarative: it only sets class attributes and overrides no
    methods of ``BasicNewsRecipe``.
    """

    title          = u'Korespondent.pl'
    publisher      = u'Korespondent.pl'
    description    = u'Centrum wolnorynkowe - serwis ludzi wolnych'
    encoding       = 'utf-8'
    __author__     = 'Attis'
    language       = 'pl'
    oldest_article = 15
    max_articles_per_feed = 100
    remove_javascript     = True
    no_stylesheets        = True
    # Tag filters (semantics as defined by calibre's BasicNewsRecipe API).
    keep_only_tags = [dict(name='div', attrs={'class':'publicystyka'})]
    remove_tags    = [{'name': 'meta'}, {'name':'div', 'attrs': {'class': 'zdjecie'} }]
    # The "external" class is added by the preprocess regexps below.
    extra_css      = '.naglowek {font-size: small}\n .tytul {font-size: x-large; padding-bottom: 10px; padding-top: 30px} \n .external {font-size: small}'
    preprocess_regexps = [
        # Replace links to index.php with their link text.  The backslash
        # is doubled so the literal contains a valid escape sequence; the
        # resulting pattern is unchanged.
        (re.compile(u'<a href="index\\.php.*>(.*)</a>'),
         lambda match: match.group(1)),
        # Tag every <i> element with the "external" class.
        (re.compile(u'<i>'),
         lambda match: '<i class="external">'),
        # Drop the empty paragraph placed directly before "Więcej".
        (re.compile(u'<p></p>Więcej'),
         lambda match: 'Więcej'),
        # Tag elements that open in a new window with the "external" class.
        (re.compile(u'target="_blank"'),
         lambda match: 'target="_blank" class="external"'),
        # Cut everything from the "read other texts" footer up to </body>.
        (re.compile(u'<p align="center">\nPoczytaj inne teksty w <a href="http://www.korespondent.pl">Serwisie wolnorynkowym Korespondent.pl</a>.*</body>', re.DOTALL|re.IGNORECASE),
         lambda match: '</div></body>'),
        ]
    feeds = [(u'Serwis informacyjny', u'http://korespondent.pl/rss.xml')]