# calibre/recipes/cnd.recipe

#!/usr/bin/env  python

__license__   = 'GPL v3'
__copyright__ = '2010, Derek Liang <Derek.liang.ca @@@at@@@ gmail.com>'
'''
cnd.org
'''
import re

from calibre.web.feeds.news import BasicNewsRecipe

class TheCND(BasicNewsRecipe):
	'''
	Recipe that builds a single feed from the front page of cnd.org.

	Article links found on the INDEX page are grouped by the node that
	immediately follows each link (used here as the issue date), and only
	the group with the greatest key is returned as the feed.
	'''

	title = 'CND'
	__author__ = 'Derek Liang'
	description = ''
	INDEX = 'http://cnd.org'
	language = 'zh'
	conversion_options = {'linearize_tables': True}

	# Trim the page to the region between the article head and the
	# copyright element, then drop layout-only tags inside it.
	remove_tags_before = dict(name='div', id='articleHead')
	remove_tags_after = dict(id='copyright')
	remove_tags = [
		dict(name='table', attrs={'align': 'right'}),
		dict(name='img', attrs={'src': 'http://my.cnd.org/images/logo.gif'}),
		dict(name='hr', attrs={}),
		dict(name='small', attrs={}),
	]
	no_stylesheets = True

	# Strip HTML comments, including ones that span several lines.
	preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]

	def print_version(self, url):
		'''
		Map an article URL to its print-friendly URL.

		Everything before the first '=' in `url` is replaced with the
		print.php endpoint of the matching module (news or wfsection), so
		the id value after the '=' is carried over unchanged. A URL with no
		'=' is replaced entirely by the endpoint prefix.
		'''
		if 'news/article.php' in url:
			prefix = 'http://my.cnd.org/modules/news/print.php?storyid'
		else:
			prefix = 'http://my.cnd.org/modules/wfsection/print.php?articleid'
		return re.sub(r'^[^=]*', prefix, url)

	def parse_index(self):
		'''
		Build the feed list from the INDEX page.

		Returns a list holding one (title, articles) tuple for the group
		with the greatest date key, or an empty list when no dated article
		link was found. As a side effect, self.title is set to 'CND '
		followed by that key.
		'''
		soup = self.index_to_soup(self.INDEX)

		articles = {}
		for a in soup.findAll('a', attrs={'target': '_cnd'}):
			url = a.get('href')
			# Skip anchors without an href and links that are not articles.
			if not url or 'article.php' not in url:
				continue
			if url.startswith('/'):
				url = self.INDEX + url
			title = self.tag_to_string(a)
			self.log('\tFound article: ', title, 'at', url)
			# The node right after the link serves as the grouping key;
			# presumably it is the date text -- verify against the page.
			date = a.nextSibling
			if date is not None and len(date) > 2:
				# setdefault works on both Python 2 and 3 (dict.has_key is
				# gone in Python 3).
				articles.setdefault(date, []).append(
					{'title': title, 'url': url, 'description': '', 'date': ''})
				self.log('\t\tAppend to : ', date)

		self.log('log articles', articles)
		if not articles:
			# Nothing to pick a "most current" group from; return an empty
			# index instead of failing on an empty sequence.
			self.log('No dated articles found at', self.INDEX)
			return []

		# Greatest key in sort order; assumes the keys sort chronologically.
		mostCurrent = max(articles)
		self.title = 'CND ' + mostCurrent

		return [(self.title, articles[mostCurrent])]

	def populate_article_metadata(self, article, soup, first):
		'''Log the text of the first <h3> element of the downloaded article.'''
		header = soup.find('h3')
		self.log('header: ' + self.tag_to_string(header))