From a43274e55a4060bf864ecf1c8f54c64b0c3cee5f Mon Sep 17 00:00:00 2001
From: Hiroshi Miura
Date: Sun, 12 Dec 2010 12:56:52 +0900
Subject: [PATCH] recipe: add paper.li recipes

---
 resources/recipes/paperli.recipe       | 58 +++++++++++++++++++++++++
 resources/recipes/paperli_topic.recipe | 59 ++++++++++++++++++++++++++
 2 files changed, 117 insertions(+)
 create mode 100644 resources/recipes/paperli.recipe
 create mode 100644 resources/recipes/paperli_topic.recipe

diff --git a/resources/recipes/paperli.recipe b/resources/recipes/paperli.recipe
new file mode 100644
index 0000000000..2c99e5dc81
--- /dev/null
+++ b/resources/recipes/paperli.recipe
@@ -0,0 +1,58 @@
+__license__ = 'GPL v3'
+__copyright__ = '2010, Hiroshi Miura '
+'''
+paperli
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+from calibre import strftime
+import re, sys
+
+class paperli(BasicNewsRecipe):
+#------------------- please change here ----------------
+    paperli_tag = 'osm'
+    title       = u'The # osm Daily - paperli'
+#-------------------------------------------------------------
+    base_url = 'http://paper.li'
+    index    = '/tag/' + paperli_tag + '/~list'
+
+    __author__            = 'Hiroshi Miura'
+    oldest_article        = 7
+    max_articles_per_feed = 100
+    description           = 'paper.li page'
+    publisher             = 'paper.li'
+    category              = 'paper.li'
+    language              = 'en'
+    encoding              = 'utf-8'
+    remove_javascript     = True
+    timefmt               = '[%y/%m/%d]'
+
+    def parse_index(self):
+        feeds = []
+        newsarticles = []
+        topic = 'HEADLINE'
+
+        # walk the paginated article listing, one page at a time
+        page = self.index
+        while True:
+            soup = self.index_to_soup(''.join([self.base_url, page]))
+            for itt in soup.findAll('div', attrs={'class':'yui-u'}):
+                itema = itt.find('a', href=True, attrs={'class':'ts'})
+                if itema is not None:
+                    itemd = itt.find('div', text=True, attrs={'class':'text'})
+                    newsarticles.append({
+                         'title'      :itema.string
+                        ,'date'       :strftime(self.timefmt)
+                        ,'url'        :itema['href']
+                        ,'description':itemd.string
+                    })
+
+            nextpage = soup.find('div', attrs={'class':'pagination_top'}).find('li', attrs={'class':'next'})
+            if nextpage is not None:
+                page = nextpage.find('a', href=True)['href']
+            else:
+                break
+
+        feeds.append((topic, newsarticles))
+        return feeds
+
diff --git a/resources/recipes/paperli_topic.recipe b/resources/recipes/paperli_topic.recipe
new file mode 100644
index 0000000000..3906af362f
--- /dev/null
+++ b/resources/recipes/paperli_topic.recipe
@@ -0,0 +1,59 @@
+__license__ = 'GPL v3'
+__copyright__ = '2010, Hiroshi Miura '
+'''
+paperli
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+from calibre import strftime
+import re
+
+class paperli_topics(BasicNewsRecipe):
+#------------------- please change here ----------------
+    paperli_tag = 'wikileaks'
+    title       = u'The # wikileaks Daily - paperli'
+#-------------------------------------------------------------
+    __author__            = 'Hiroshi Miura'
+    oldest_article        = 7
+    max_articles_per_feed = 100
+    description           = 'paper.li page about ' + paperli_tag
+    publisher             = 'paper.li'
+    category              = 'paper.li'
+    language              = 'en'
+    encoding              = 'utf-8'
+    remove_javascript     = True
+    masthead_title        = u'The ' + paperli_tag + ' Daily'
+    timefmt               = '[%y/%m/%d]'
+    base_url              = 'http://paper.li'
+    index                 = base_url + '/tag/' + paperli_tag
+
+
+    def parse_index(self):
+
+        # collect the topic tabs from the page's bottom navigation
+        topics = []
+        soup = self.index_to_soup(self.index)
+        topics_lists = soup.find('div', attrs={'class':'paper-nav-bottom'})
+        for item in topics_lists.findAll('li', attrs={'class':""}):
+            itema = item.find('a', href=True)
+            topics.append({'title': itema.string, 'url': itema['href']})
+
+        # build one feed per topic
+        feeds = []
+        for topic in topics:
+            newsarticles = []
+            soup = self.index_to_soup(''.join([self.base_url, topic['url']]))
+            topstories = soup.findAll('div', attrs={'class':'yui-u'})
+            for itt in topstories:
+                itema = itt.find('a', href=True, attrs={'class':'ts'})
+                if itema is not None:
+                    itemd = itt.find('div', text=True, attrs={'class':'text'})
+                    newsarticles.append({
+                         'title'      :itema.string
+                        ,'date'       :strftime(self.timefmt)
+                        ,'url'        :itema['href']
+                        ,'description':itemd.string
+                    })
+            feeds.append((topic['title'], newsarticles))
+        return feeds
+
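
Note for testers: after editing paperli_tag, a recipe like these can usually be test-built with calibre's ebook-convert tool; the output filename below is arbitrary and not something the patch defines:

    ebook-convert paperli.recipe output.epub --test -vv

The --test flag fetches only a couple of articles per feed and -vv prints verbose progress, which keeps the edit/check cycle short while tweaking the tag.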