From 55b84811319ff0bbec7cf8a929e9611db0fb3825 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 26 Dec 2012 08:17:15 +0530 Subject: [PATCH] Fix #1093700 (Update recipe for Harper's magazine articles from printed edition) --- recipes/harpers_full.recipe | 89 +++++++++++++++++++++++-------------- 1 file changed, 55 insertions(+), 34 deletions(-) diff --git a/recipes/harpers_full.recipe b/recipes/harpers_full.recipe index ff558e9c5b..b965bca9b8 100644 --- a/recipes/harpers_full.recipe +++ b/recipes/harpers_full.recipe @@ -1,18 +1,22 @@ __license__ = 'GPL v3' -__copyright__ = '2008-2010, Darko Miletic ' +__copyright__ = '2008-2012, Darko Miletic ' ''' harpers.org - paid subscription/ printed issue articles This recipe only get's article's published in text format images and pdf's are ignored +If you have institutional subscription based on access IP you do not need to enter +anything in username/password fields ''' +import time +import urllib from calibre import strftime from calibre.web.feeds.news import BasicNewsRecipe class Harpers_full(BasicNewsRecipe): title = "Harper's Magazine - articles from printed edition" __author__ = 'Darko Miletic' - description = "Harper's Magazine: Founded June 1850." + description = "Harper's Magazine, the oldest general-interest monthly in America, explores the issues that drive our national conversation, through long-form narrative journalism and essays, and such celebrated features as the iconic Harper's Index." 
class Harpers_full(BasicNewsRecipe):
    """
    Harper's Magazine - articles from the printed edition (harpers.org).

    Only text-format articles are fetched; images and PDFs are ignored.
    Subscription is optional: institutional (IP-based) access needs no
    username/password.
    """
    title = "Harper's Magazine - articles from printed edition"
    __author__ = 'Darko Miletic'
    description = "Harper's Magazine, the oldest general-interest monthly in America, explores the issues that drive our national conversation, through long-form narrative journalism and essays, and such celebrated features as the iconic Harper's Index."
    publisher = "Harpers's"
    category = 'news, politics, USA'
    oldest_article = 30
    # NOTE(review): the source patch omits unchanged lines between hunks
    # (e.g. no_stylesheets / max_articles_per_feed) — restore them from the
    # full recipe file when applying this reconstruction.
    use_embedded_content = False
    delay = 1
    language = 'en'
    encoding = 'utf8'
    needs_subscription = 'optional'
    masthead_url = 'http://harpers.org/wp-content/themes/harpers/images/pheader.gif'
    publication_type = 'magazine'
    # Archive index for the current month, e.g. http://harpers.org/archive/2012/12
    INDEX = strftime('http://harpers.org/archive/%Y/%m')
    # WordPress AJAX login endpoint used by the site's own login form.
    LOGIN = 'http://harpers.org/wp-content/themes/harpers/ajax_login.php'
    extra_css = """
                body{font-family: adobe-caslon-pro,serif}
                .category{font-size: small}
                .articlePost p:first-letter{display: inline; font-size: xx-large; font-weight: bold}
                """

    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    keep_only_tags = [dict(name='div', attrs={'class': ['postdetailFull', 'articlePost']})]
    remove_tags = [
        dict(name='div', attrs={'class': 'fRight rightDivPad'}),
        dict(name=['link', 'meta', 'object', 'embed', 'iframe']),
    ]
    remove_attributes = ['xmlns']

    def get_browser(self):
        """Return a browser, logged in via the site's AJAX endpoint when
        credentials were supplied (optional for IP-based subscribers)."""
        br = BasicNewsRecipe.get_browser()
        # Visit the front page first so the session cookies are set before login.
        br.open('http://harpers.org/')
        if self.username is not None and self.password is not None:
            # The login form's JS sends a millisecond epoch timestamp as 'tt'.
            # BUG FIX: the original 'time.localtime()*1000' sequence-multiplied
            # a struct_time (a 9-tuple) into a 9000-element tuple instead of
            # producing a timestamp.
            tt = int(time.time() * 1000)
            data = urllib.urlencode({
                'm': self.username,
                'p': self.password,
                'rt': 'http://harpers.org/',
                'tt': tt,
            })
            br.open(self.LOGIN, data)
        return br

    def parse_index(self):
        """Scrape the monthly archive page into a single feed of articles.

        The first h2 block on the page carries the issue's PDF/cover link
        (class 'dwpdf'); it sets cover_url and is skipped as an article.
        """
        articles = []
        print('Processing ' + self.INDEX)
        soup = self.index_to_soup(self.INDEX)
        seen_cover = False
        for item in soup.findAll('div', attrs={'class': 'articleData'}):
            for text_link in item.findAll('h2'):
                if not seen_cover:
                    # First entry is the issue cover/PDF link, not an article.
                    lcover_url = item.find(attrs={'class': 'dwpdf'})
                    if lcover_url:
                        self.cover_url = lcover_url.a['href']
                    seen_cover = True
                else:
                    articles.append({
                        'title': text_link.a.contents[0],
                        'date': strftime(' %B %Y'),
                        'url': text_link.a['href'],
                        'description': '',
                    })
        return [(soup.head.title.string, articles)]

    def print_version(self, url):
        """Single-page variant of an article (avoids pagination)."""
        return url + '?single=1'