calibre/resources/recipes/newsweek_polska.recipe

#!/usr/bin/env  python

__license__   = 'GPL v3'
__copyright__ = '2010, Mateusz Kielar, matek09@gmail.com'

from calibre.web.feeds.news import BasicNewsRecipe

class Newsweek(BasicNewsRecipe):
	"""Calibre recipe that downloads an issue of the weekly Newsweek Polska.

	The issue to fetch is discovered at download time by
	find_last_full_issue(); parse_index() then turns that issue's table of
	contents into calibre feeds, and print_version() redirects every
	article to its print-friendly page.
	"""

	# Identifier of the issue being downloaded. The placeholder is replaced
	# by find_last_full_issue() with the value scraped from the site.
	EDITION = 0

	title = u'Newsweek Polska'
	__author__ = 'Mateusz Kielar'
	description = 'Weekly magazine'
	encoding = 'utf-8'
	no_stylesheets = True
	# The magazine is published in Polish, so the output must be tagged 'pl'.
	language = 'pl'
	remove_javascript = True

	# Keep only the article container of each downloaded page ...
	keep_only_tags = [
		dict(name='div', attrs={'class': 'article'}),
	]

	# ... and drop these blocks from inside it.
	remove_tags = [
		dict(name='div', attrs={'class': 'copy'}),
		dict(name='div', attrs={'class': 'url'}),
	]

	extra_css = '''
					.body {font-size: small}
					.author {font-size: x-small}
					.lead {font-size: x-small}
					.title{font-size: x-large; font-weight: bold}
					'''

	def print_version(self, url):
		"""Return the print-friendly URL for the article at *url*.

		The issue-specific prefix ("/artykuly/wydanie/<EDITION>") is
		collapsed to "/artykuly" and "/print" is appended.
		"""
		issue_prefix = "http://www.newsweek.pl/artykuly/wydanie/" + str(self.EDITION)
		return url.replace(issue_prefix, "http://www.newsweek.pl/artykuly") + '/print'

	def find_last_full_issue(self):
		"""Locate the issue to download and store its id in self.EDITION.

		Starting from the issue-cover frame, the link inside the first
		attribute-less <span> is followed twice; on the page reached that
		way, the anchor with target="_parent" carries the issue id in its
		href ("/wydania/<id>").
		"""
		# NOTE(review): presumably each hop steps to an earlier issue so that
		# a completely published one is picked - confirm against the site.
		frames_root = 'http://www.newsweek.pl/Frames/'
		page = self.index_to_soup(frames_root + 'IssueCover.aspx')
		for _ in range(2):
			bare_span = page.find(lambda tag: tag.name == 'span' and not tag.attrs)
			page = self.index_to_soup(frames_root + bare_span.a['href'])
		issue_link = page.find('a', attrs={'target' : '_parent'})
		self.EDITION = issue_link['href'].replace('/wydania/','')

	def parse_index(self):
		"""Build the feed list for the selected issue.

		Returns a list of (section title, list of article dicts) tuples,
		one per section heading found on the issue page. As a side effect
		self.cover_url is set when the issue cover image is present.
		"""
		self.find_last_full_issue()
		soup = self.index_to_soup('http://www.newsweek.pl/wydania/' + str(self.EDITION))
		img = soup.find('img', id="ctl00_C1_PaperIsssueView_IssueImage", src=True)
		# The cover is optional: a missing image must not abort the download.
		if img is not None:
			self.cover_url = img['src']
		feeds = []
		parent = soup.find(id='content-left-big')
		for txt in parent.findAll(attrs={'class':'txt_normal_red strong'}):
			section = self.tag_to_string(txt).capitalize()
			articles = list(self.find_articles(txt))
			feeds.append((section, articles))
		return feeds

	def find_articles(self, txt):
		"""Yield article dicts for the section whose heading tag is *txt*.

		Elements following the heading whose class is 'strong' or 'hr' are
		scanned in document order; the first <div> among them ends the
		section, every other match is treated as an article link.
		"""
		for a in txt.findAllNext( attrs={'class':['strong','hr']}):
			# Compare the tag name exactly: a substring test against "div"
			# would also stop on tags such as <i> or <d>.
			if a.name == 'div':
				break
			yield {
				'title' : self.tag_to_string(a),
				'url'   : 'http://www.newsweek.pl'+a['href'],
				'date'  : '',
				'description' : ''
				}