# calibre/recipes/kopalniawiedzy.recipe

# Module metadata: license, copyright holder and version string of this file.
__license__   = 'GPL v3'
__copyright__ = '2011, Attis <attis@attis.one.pl>'
__version__ = 'v. 0.1'

import re
from calibre.web.feeds.recipes import BasicNewsRecipe

class KopalniaWiedzy(BasicNewsRecipe):
    """Recipe for Kopalnia Wiedzy (kopalniawiedzy.pl).

    Articles are taken from the RSS feeds listed in ``feeds``.  Articles that
    span several pages are stitched together in ``preprocess_html``: the
    ``next`` pager link is followed and the content of each following page is
    inserted into the first page.
    """

    title          = u'Kopalnia Wiedzy'
    publisher      = u'Kopalnia Wiedzy'
    description    = u'Ciekawostki ze świata nauki i techniki'
    encoding       = 'utf-8'
    __author__     = 'Attis'
    language       = 'pl'
    oldest_article = 7
    max_articles_per_feed = 100
    # Base URL; pager hrefs are appended to it in append_page().
    INDEX          = u'http://kopalniawiedzy.pl/'
    remove_javascript     = True
    no_stylesheets        = True

    remove_tags = [
        {'name': 'p', 'attrs': {'class': 'keywords'}},
        {'name': 'div', 'attrs': {'class': 'sexy-bookmarks sexy-bookmarks-bg-caring'}},
    ]
    remove_tags_after = dict(attrs={'class': 'ad-square'})
    keep_only_tags    = [dict(name="div", attrs={'id': 'articleContent'})]
    # Styles the "topimage" class that the first regexp below puts on images.
    extra_css      = '.topimage {margin-top: 30px}'

    preprocess_regexps = [
        # Replace a lightbox anchor wrapping an image with the bare image,
        # tagged with the "topimage" class.
        (re.compile(u'<a .* rel="lightboxText" .*><img (.*)></a>'),
         lambda match: '<img class="topimage" ' + match.group(1) + '>'),
        # Collapse a doubled line break into a single one.  The replacement
        # must not contain a backslash: '<br\/>' would emit it literally.
        (re.compile(u'<br  /><br  />'),
         lambda match: '<br/>'),
    ]

    feeds = [
        (u'Biologia', u'http://kopalniawiedzy.pl/wiadomosci_biologia.rss'),
        (u'Medycyna', u'http://kopalniawiedzy.pl/wiadomosci_medycyna.rss'),
        (u'Psychologia', u'http://kopalniawiedzy.pl/wiadomosci_psychologia.rss'),
        (u'Technologie', u'http://kopalniawiedzy.pl/wiadomosci_technologie.rss'),
        (u'Ciekawostki', u'http://kopalniawiedzy.pl/wiadomosci_ciekawostki.rss'),
        (u'Artykuły', u'http://kopalniawiedzy.pl/artykuly.rss'),
    ]

    def is_link_wanted(self, url, tag):
        """Return True only for links whose ``class`` attribute is ``next``.

        ``tag.get`` is used so that a link without a ``class`` attribute is
        rejected instead of raising ``KeyError``.
        """
        return tag.get('class') == 'next'

    def remove_beyond(self, tag, next):
        """Extract every node lying beyond ``tag`` in the given direction.

        ``next`` is the name of the sibling attribute to follow (this class
        calls it with ``'nextSibling'``).  All siblings of ``tag`` in that
        direction are extracted, then the same is done for each ancestor of
        ``tag`` in turn.  The walk stops at a ``body`` element or when there
        is no parent left.  ``tag`` may be None, in which case nothing
        happens.
        """
        while tag is not None and getattr(tag, 'name', None) != 'body':
            after = getattr(tag, next)
            while after is not None:
                # Read the following sibling from the node being removed,
                # before extracting it, so the walk does not depend on how
                # extract() rewires the sibling links.
                following = getattr(after, next)
                after.extract()
                after = following
            tag = tag.parent

    def append_page(self, soup, appendtag, position):
        """Fetch the page after ``soup`` and insert its content.

        Looks for an ``a`` element with class ``next`` in ``soup``.  If one
        is found, the linked page is downloaded, its ``articleContent`` div
        is trimmed of everything after the element with class ``pages``, any
        further pages are appended to that div recursively, and the div is
        inserted into ``appendtag`` at index ``position``.

        Nothing is done when there is no pager link, the link has no
        ``href``, or the fetched page has no ``articleContent`` div.
        """
        pager = soup.find('a', attrs={'class': 'next'})
        if not pager:
            return

        href = pager.get('href')
        if not href:
            return

        soup2 = self.index_to_soup(self.INDEX + href)
        texttag = soup2.find('div', attrs={'id': 'articleContent'})
        if texttag is None:
            return

        # remove_beyond() tolerates a missing pager (tag is None).
        tag = texttag.find(attrs={'class': 'pages'})
        self.remove_beyond(tag, 'nextSibling')

        # Later pages go at the end of this page's content.
        newpos = len(texttag.contents)
        self.append_page(soup2, texttag, newpos)

        appendtag.insert(position, texttag)

    def preprocess_html(self, soup):
        """Merge follow-up pages into ``soup`` and strip leftover widgets.

        After the pages are appended, every ``div`` with class ``pages`` and
        every ``p`` with class ``wykop`` is removed.  Returns ``soup``.
        """
        self.append_page(soup, soup.body, 3)

        for item in soup.findAll('div', attrs={'class': 'pages'}):
            item.extract()

        for item in soup.findAll('p', attrs={'class': 'wykop'}):
            item.extract()

        return soup