calibre/recipes/newsweek_polska.recipe

#!/usr/bin/env  python

__license__   = 'GPL v3'
__copyright__ = '2010, matek09, matek09@gmail.com'

from calibre.web.feeds.news import BasicNewsRecipe

class Newsweek(BasicNewsRecipe):
	'''
	Recipe for the weekly magazine Newsweek Polska (www.newsweek.pl).

	parse_index() reads the table of contents of one issue from
	http://www.newsweek.pl/wydania/<EDITION> and returns one feed per
	section found on that page. Behaviour is tuned with the upper-case
	class attributes below.
	'''

	# When True, parse_index() calls find_last_full_issue() first, which
	# overwrites EDITION with the newest issue accepted by is_full().
	FIND_LAST_FULL_ISSUE = True
	# Issue identifier appended to http://www.newsweek.pl/wydania/
	EDITION = '0'
	# When True (and FIND_LAST_FULL_ISSUE is False), articles recognised by
	# is_locked() are left out of the feeds.
	EXCLUDE_LOCKED = True
	# URL of the icon the site shows next to locked articles.
	LOCKED_ICO = 'http://www.newsweek.pl/bins/media/static/newsweek/img/ico_locked.gif'

	title = u'Newsweek Polska'
	__author__ = 'matek09'
	description = 'Weekly magazine'
	encoding = 'utf-8'
	no_stylesheets = True
	language = 'pl'
	remove_javascript = True

	# Only <div class="article"> is kept from each downloaded page.
	keep_only_tags = [
		dict(name='div', attrs={'class': 'article'}),
	]

	# <div class="copy"> and <div class="url"> are stripped from the article.
	remove_tags = [
		dict(name='div', attrs={'class': 'copy'}),
		dict(name='div', attrs={'class': 'url'}),
	]

	extra_css = '''
					.body {font-size: small}
					.author {font-size: x-small}
					.lead {font-size: x-small}
					.title{font-size: x-large; font-weight: bold}
					'''

	def print_version(self, url):
		'''
		Return the URL of the printable version of an article.

		The issue-specific prefix .../artykuly/wydanie/<EDITION> is replaced
		by .../artykuly and '/print' is appended.
		'''
		return url.replace("http://www.newsweek.pl/artykuly/wydanie/" + str(self.EDITION), "http://www.newsweek.pl/artykuly") + '/print'

	def is_locked(self, a):
		'''
		Return True if the first <img> following tag `a` is the lock icon.

		Returns False (instead of raising) when no <img> follows `a` or the
		image has no src attribute.
		'''
		img = a.findNext('img')
		return img is not None and img.get('src') == self.LOCKED_ICO

	def is_full(self, issue_soup):
		'''
		Return True if the issue page contains at most one lock icon.

		issue_soup is the parsed table-of-contents page of an issue.
		'''
		locked = issue_soup.findAll('img', attrs={'src': self.LOCKED_ICO})
		return len(locked) <= 1

	def find_last_full_issue(self):
		'''
		Set self.EDITION to the newest issue for which is_full() is True.

		Starts at the IssueCover frame and follows its link to the previous
		issue until a full issue is found. If a frame offers no link to an
		older issue, the search stops and EDITION keeps the last issue
		examined.
		'''
		frame_url = 'http://www.newsweek.pl/Frames/IssueCover.aspx'
		while True:
			frame_soup = self.index_to_soup(frame_url)
			# The cover link has the form /wydania/<edition>.
			self.EDITION = frame_soup.find('a', attrs={'target' : '_parent'})['href'].replace('/wydania/','')
			issue_soup = self.index_to_soup('http://www.newsweek.pl/wydania/' + self.EDITION)
			if self.is_full(issue_soup):
				break
			# The attribute-less <span> wraps the link used to step to the
			# next frame (presumably the previous issue - verify on the site).
			older = frame_soup.find(lambda tag: tag.name == 'span' and not tag.attrs)
			link = older.a if older is not None else None
			if link is None or not link.get('href'):
				# Nothing older to try: settle for the issue already selected
				# rather than crashing on a missing link.
				break
			frame_url = 'http://www.newsweek.pl/Frames/' + link['href']

	def parse_index(self):
		'''
		Build the list of feeds for the selected issue.

		Returns a list of (section title, list of article dicts) tuples,
		one per section that yielded at least one article. Also sets
		self.cover_url when the issue cover image is present on the page.
		'''
		if self.FIND_LAST_FULL_ISSUE:
			self.find_last_full_issue()
		soup = self.index_to_soup('http://www.newsweek.pl/wydania/' + self.EDITION)
		img = soup.find('img', id="ctl00_C1_PaperIsssueView_IssueImage", src=True)
		# A missing cover image must not abort the whole download.
		if img is not None:
			self.cover_url = img['src']
		feeds = []
		parent = soup.find(id='content-left-big')
		for txt in parent.findAll(attrs={'class':'txt_normal_red strong'}):
			articles = list(self.find_articles(txt))
			if articles:
				section = self.tag_to_string(txt).capitalize()
				feeds.append((section, articles))
		return feeds

	def find_articles(self, txt):
		'''
		Yield an article dict for every link that follows section header `txt`.

		Iterates over the following elements whose class is 'strong' or 'hr'
		and stops at the first <div> among them, which ends the section.
		Elements without an href are skipped. Locked articles are skipped
		only when FIND_LAST_FULL_ISSUE is False and EXCLUDE_LOCKED is True.
		'''
		for a in txt.findAllNext(attrs={'class':['strong','hr']}):
			# Exact comparison: the former substring test (a.name in "div")
			# also matched tags such as <i>.
			if a.name == 'div':
				break
			href = a.get('href')
			if not href:
				continue
			# Short-circuit so is_locked() only runs when filtering is enabled.
			if (not self.FIND_LAST_FULL_ISSUE) and self.EXCLUDE_LOCKED and self.is_locked(a):
				continue
			yield {
				'title' : self.tag_to_string(a),
				'url'   : 'http://www.newsweek.pl' + href,
				'date'  : '',
				'description' : ''
				}