From 7a2e4a43e612d1934b8b2e5e50a5d30c65734678 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 9 Jan 2010 10:07:34 -0700 Subject: [PATCH] New recipe for The Escapist by Lorenzo Vigentini --- resources/recipes/freenature.recipe | 17 +------- resources/recipes/nzherald.recipe | 17 ++++---- resources/recipes/the_escapist.recipe | 59 +++++++++++++++++++++++++++ 3 files changed, 68 insertions(+), 25 deletions(-) create mode 100644 resources/recipes/the_escapist.recipe diff --git a/resources/recipes/freenature.recipe b/resources/recipes/freenature.recipe index c797053d0d..52c49d6502 100644 --- a/resources/recipes/freenature.recipe +++ b/resources/recipes/freenature.recipe @@ -1,5 +1,4 @@ from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulSoup import re class NatureNews(BasicNewsRecipe): @@ -18,7 +17,7 @@ class NatureNews(BasicNewsRecipe): #dict(name='div', attrs={'class':['pt-box-title', 'pt-box-content']}), #dict(name='div', attrs={'id':['block-td_search_160', 'block-cam_search_160']}), dict(name='h2', attrs={'id':'comments'}), - dict(name='ul', attrs={'class':'toolsmenu xoxo'}), + dict(name='ul', attrs={'class':'toolsmenu xoxo'}), ] preprocess_regexps = [ @@ -26,19 +25,7 @@ class NatureNews(BasicNewsRecipe): ] feeds = [('Nature News', 'http://feeds.nature.com/news/rss/most_recent')] - + def get_article_url(self, article): return article.get('id') - #def preprocess_html(self, soup): - #story = soup.find(name='div', attrs={'id':'contentColumn'}) - #td = heading.findParent(name='td') - #td.extract() - #soup = BeautifulSoup('t') - #body = soup.find(name='body') - #body.insert(0, story) - #for x in soup.findAll(name='p', text=lambda x:x and '-->' in x): - #p = x.findParent('p') - #if p is not None: - #p.extract() - #return soup diff --git a/resources/recipes/nzherald.recipe b/resources/recipes/nzherald.recipe index 92572a58bc..08f66e2f56 100644 --- a/resources/recipes/nzherald.recipe +++ b/resources/recipes/nzherald.recipe @@ -1,7 +1,4 @@ -import string, re -from calibre import strftime from calibre.web.feeds.recipes import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulSoup class NewZealandHerald(BasicNewsRecipe): @@ -9,7 +6,7 @@ class NewZealandHerald(BasicNewsRecipe): __author__ = 'Krittika Goyal' description = 'Daily news' timefmt = ' [%d %b, %Y]' - + no_stylesheets = True remove_tags_before = dict(name='div', attrs={'class':'contentContainer left eight'}) remove_tags_after = dict(name='div', attrs={'class':'callToAction'}) @@ -18,8 +15,8 @@ class NewZealandHerald(BasicNewsRecipe): dict(name='div', attrs={'class':['sectionHeader', 'tools','callToAction', 'contentContainer right two nopad relatedColumn']}), #dict(name='div', attrs={'id':['shareContainer']}), #dict(name='form', attrs={'onsubmit':"return verifySearch(this.w,'Keyword, citation, or #author')"}), - #dict(name='table', attrs={'cellspacing':'0'}), - ] + #dict(name='table', attrs={'cellspacing':'0'}), + ] def preprocess_html(self, soup): table = soup.find('table') @@ -32,7 +29,7 @@ class NewZealandHerald(BasicNewsRecipe): soup = self.index_to_soup(url) div = soup.find(attrs={'class':'col-300 categoryList'}) date = div.find(attrs={'class':'link-list-heading'}) - + current_articles = [] for x in date.findAllNext(attrs={'class':['linkList', 'link-list-heading']}): if x.get('class') == 'link-list-heading': break @@ -52,8 +49,8 @@ class NewZealandHerald(BasicNewsRecipe): 'description':'', 'date':''}) return current_articles - - + + # To GET SECTIONS def parse_index(self): feeds = [] @@ -67,7 +64,7 @@ class NewZealandHerald(BasicNewsRecipe): ('Crime', 'http://www.nzherald.co.nz/crime/news/headlines.cfm?c_id=30'), ('Environment', - 'http://www.nzherald.co.nz/environment/news/headlines.cfm?c_id=39'), + 'http://www.nzherald.co.nz/environment/news/headlines.cfm?c_id=39'), ]: articles = self.nz_parse_section(url) if articles: diff --git a/resources/recipes/the_escapist.recipe b/resources/recipes/the_escapist.recipe new file mode 100644 index 0000000000..e0bfba1ece --- /dev/null +++ b/resources/recipes/the_escapist.recipe @@ -0,0 +1,59 @@ +#!/usr/bin/env python +__license__ = 'GPL v3' +__author__ = 'Lorenzo Vigentini' +__copyright__ = '2009, Lorenzo Vigentini ' +description = 'the Escapist Magazine - v1.02 (09, January 2010)' + +''' +http://www.escapistmagazine.com/ +''' + +from calibre.web.feeds.news import BasicNewsRecipe + +class al(BasicNewsRecipe): + author = 'Lorenzo Vigentini' + description = 'the Escapist Magazine' + + cover_url = 'http://cdn.themis-media.com/themes/escapistmagazine/default/images/logo.png' + title = u'the Escapist Magazine' + publisher = 'Themis media' + category = 'Video games news, lifestyle, gaming culture' + + language = 'en' + timefmt = '[%a, %d %b, %Y]' + + oldest_article = 1 + max_articles_per_feed = 100 + use_embedded_content = False + recursion = 10 + + remove_javascript = True + no_stylesheets = True + + feeds = [ + (u'Daily News', u'http://www.escapistmagazine.com/rss/news/0.xml'), + (u'Articles', u'http://www.escapistmagazine.com/rss/articles/0.xml') + ] + + def print_version(self,url): + baseURL='http://www.escapistmagazine.com' + segments = url.split('/') + #basename = '/'.join(segments[:3]) + '/' + subPath= '/'+ segments[3] + '/' + articleURL=(segments[len(segments)-1])[0:5] + + if articleURL[4] =='-': + articleURL=articleURL[:4] + + printVerString='print/'+ articleURL + s= baseURL + subPath + printVerString + return s + + keep_only_tags = [ + dict(name='div', attrs={'id':'article'}) + ] + + remove_tags = [ + dict(name='div',attrs={'id':['ad_leaderboard','print_notice','bottom_panel_container']}) + ] +