#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2010, Derek Liang '
'''
cnd.org
'''
import re

from calibre.web.feeds.news import BasicNewsRecipe


class TheCND(BasicNewsRecipe):
    '''
    Calibre recipe for the weekly digest of cnd.org (China News Digest).

    Scrapes the index page for article links, groups them by the date
    string that follows each link, and emits one feed per date.
    '''

    title = 'CND Weekly'
    __author__ = 'Derek Liang'
    description = ''
    INDEX = 'http://cnd.org'
    language = 'zh'
    conversion_options = {'linearize_tables': True}

    remove_tags_before = dict(name='div', id='articleHead')
    remove_tags_after = dict(id='copyright')
    remove_tags = [
        dict(name='table', attrs={'align': 'right'}),
        dict(name='img', attrs={'src': 'http://my.cnd.org/images/logo.gif'}),
        dict(name='hr', attrs={}),
        dict(name='small', attrs={}),
    ]
    no_stylesheets = True

    # NOTE(review): both patterns below are empty strings, so these
    # substitutions are no-ops. The real patterns were presumably lost when
    # this file was transcribed -- confirm against the upstream recipe before
    # relying on them.
    preprocess_regexps = [
        (re.compile(r'', re.DOTALL), lambda m: ''),
        (re.compile(r'', re.DOTALL), lambda m: ''),
    ]

    def print_version(self, url):
        '''Return the printer-friendly URL for *url*.

        Keeps everything from the first '=' onward (the numeric story or
        article id) and swaps the prefix for the matching print.php endpoint.
        '''
        if 'news/article.php' in url:
            return re.sub(r'^[^=]*',
                          'http://my.cnd.org/modules/news/print.php?storyid', url)
        return re.sub(r'^[^=]*',
                      'http://my.cnd.org/modules/wfsection/print.php?articleid', url)

    def parse_index(self):
        '''Build the feed list: one (date, articles) feed per index date.'''
        soup = self.index_to_soup(self.INDEX)

        feeds = []
        articles = {}

        for a in soup.findAll('a', attrs={'target': '_cnd'}):
            url = a['href']
            if url.find('article.php') < 0:
                continue
            if url.startswith('/'):
                url = 'http://cnd.org' + url
            title = self.tag_to_string(a)
            # The date is the text node immediately after the link; it may be
            # absent, so guard for None BEFORE the regex search (the original
            # searched first and would raise TypeError on None).
            date = a.nextSibling
            if date is None or not re.search('cm', date):
                continue
            self.log('\tFound article: ', title, 'at', url, '@', date)
            if len(date) > 2:
                # setdefault replaces the Python-2-only dict.has_key() check,
                # which no longer exists on Python 3.
                articles.setdefault(date, []).append(
                    {'title': title, 'url': url, 'description': '', 'date': ''})
                self.log('\t\tAppend to : ', date)

        # Emit feeds newest-first: keys sort ascending, pop() takes the end.
        sorted_articles = sorted(articles)
        while sorted_articles:
            mostCurrent = sorted_articles.pop()
            self.title = 'CND ' + mostCurrent
            feeds.append((self.title, articles[mostCurrent]))

        return feeds

    def populate_article_metadata(self, article, soup, first):
        # Log the article header for debugging; no metadata is modified.
        header = soup.find('h3')
        self.log('header: ' + self.tag_to_string(header))