Korespondent and Kopalnia Wiedzy by Attis

2026-01-06 20:20:30 -05:00 · 2011-02-05 13:37:42 -07:00 · 2011-02-05 13:37:42 -07:00 · d6d4f9d444
commit d6d4f9d444
parent bcb1b938e5
4 changed files with 120 additions and 0 deletions
--- a/resources/images/news/kopalniawiedzy.png
+++ b/resources/images/news/kopalniawiedzy.png
--- a/resources/images/news/korespondent.png
+++ b/resources/images/news/korespondent.png
--- a/resources/recipes/kopalniawiedzy.recipe
+++ b/resources/recipes/kopalniawiedzy.recipe
@ -0,0 +1,80 @@
+# -*- coding: utf-8 -*-
+__license__   = 'GPL v3'
+__copyright__ = '2011, Attis <attis@attis.one.pl>'
+__version__ = 'v. 0.1'
+
+import re
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+class KopalniaWiedzy(BasicNewsRecipe):
+		title          = u'Kopalnia Wiedzy'
+		publisher      = u'Kopalnia Wiedzy'
+		description    = u'Ciekawostki ze świata nauki i techniki'
+		encoding       = 'utf-8'
+		__author__     = 'Attis'
+		language       = 'pl'
+		oldest_article = 7
+		max_articles_per_feed = 100
+		INDEX          = u'http://kopalniawiedzy.pl/'
+		remove_javascript     = True
+		no_stylesheets        = True
+
+		remove_tags    = [{'name':'p', 'attrs': {'class': 'keywords'} }]
+		remove_tags_after = dict(attrs={'class':'ad-square'})
+		keep_only_tags    = [dict(name="div", attrs={'id':'articleContent'})]
+		extra_css      = '.topimage {margin-top: 30px}'
+
+		preprocess_regexps = [
+				(re.compile(u'<a .* rel="lightboxText" .*><img (.*)></a>'),
+				lambda match: '<img class="topimage" ' + match.group(1) + '>' ),
+				(re.compile(u'<br  /><br  />'),
+				lambda match: '<br\/>')
+			]
+
+		feeds = [
+			(u'Biologia', u'http://kopalniawiedzy.pl/wiadomosci_biologia.rss'),
+			(u'Medycyna', u'http://kopalniawiedzy.pl/wiadomosci_medycyna.rss'),
+			(u'Psychologia', u'http://kopalniawiedzy.pl/wiadomosci_psychologia.rss'),
+			(u'Technologie', u'http://kopalniawiedzy.pl/wiadomosci_technologie.rss'),
+			(u'Ciekawostki', u'http://kopalniawiedzy.pl/wiadomosci_ciekawostki.rss'),
+			(u'Artykuły', u'http://kopalniawiedzy.pl/artykuly.rss')
+		]
+
+		def is_link_wanted(self, url, tag):
+			return tag['class'] == 'next'
+
+		def remove_beyond(self, tag, next):
+				while tag is not None and getattr(tag, 'name', None) != 'body':
+						after = getattr(tag, next)
+						while after is not None:
+								ns = getattr(tag, next)
+								after.extract()
+								after = ns
+						tag = tag.parent
+
+		def append_page(self, soup, appendtag, position):
+				pager = soup.find('a',attrs={'class':'next'})
+				if pager:
+					nexturl = self.INDEX + pager['href']
+					soup2 = self.index_to_soup(nexturl)
+					texttag = soup2.find('div', attrs={'id':'articleContent'})
+
+					tag = texttag.find(attrs={'class':'pages'})
+					self.remove_beyond(tag, 'nextSibling')
+
+					newpos = len(texttag.contents)
+					self.append_page(soup2,texttag,newpos)
+
+					appendtag.insert(position,texttag)
+
+
+		def preprocess_html(self, soup):
+				self.append_page(soup, soup.body, 3)
+
+				for item in soup.findAll('div',attrs={'class':'pages'}):
+					item.extract()
+
+				for item in soup.findAll('p', attrs={'class':'wykop'}):
+					item.extract()
+
+				return soup
--- a/resources/recipes/korespondent.recipe
+++ b/resources/recipes/korespondent.recipe
@ -0,0 +1,40 @@
+# -*- coding: utf-8 -*-
+
+__license__   = 'GPL v3'
+__copyright__ = '2011, Attis <attis@attis.one.pl>'
+__version__ = 'v. 0.1'
+
+import re
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+class KorespondentPL(BasicNewsRecipe):
+    title          = u'Korespondent.pl'
+    publisher      = u'Korespondent.pl'
+    description    = u'Centrum wolnorynkowe - serwis ludzi wolnych'
+    encoding       = 'utf-8'
+    __author__     = 'Attis'
+    language       = 'pl'
+    oldest_article = 15
+    max_articles_per_feed = 100
+    remove_javascript     = True
+    no_stylesheets        = True
+
+    keep_only_tags = [dict(name='div', attrs={'class':'publicystyka'})]
+    remove_tags    = [{'name': 'meta'}, {'name':'div', 'attrs': {'class': 'zdjecie'} }]
+    extra_css      = '.naglowek {font-size: small}\n .tytul {font-size: x-large; padding-bottom: 10px; padding-top: 30px} \n .external {font-size: small}'
+
+    preprocess_regexps = [
+			(re.compile(u'<a href="index\.php.*>(.*)</a>'),
+			lambda match: match.group(1) ),
+			(re.compile(u'<i>'),
+			lambda match:'<i class="external">' ),
+			(re.compile(u'<p></p>Więcej'),
+			lambda match:'Więcej' ),
+			(re.compile(u'target="_blank"'),
+			lambda match:'target="_blank" class="external"' ),
+			(re.compile(u'<p align="center">\nPoczytaj inne teksty w <a href="http://www.korespondent.pl">Serwisie wolnorynkowym Korespondent.pl</a>.*</body>', re.DOTALL|re.IGNORECASE),
+			lambda match: '</div></body>'),
+			]
+
+    feeds = [(u'Serwis informacyjny', u'http://korespondent.pl/rss.xml')]
+