Fix #1093700 (Update recipe for Harper's magazine articles from printed edition)

Author: Kovid Goyal
Date:   2012-12-26 08:17:15 +05:30
parent abafe5c184
commit 55b8481131


@@ -1,18 +1,22 @@
 __license__   = 'GPL v3'
-__copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2008-2012, Darko Miletic <darko.miletic at gmail.com>'
 '''
 harpers.org - paid subscription/ printed issue articles
 This recipe only get's article's published in text format
 images and pdf's are ignored
+If you have institutional subscription based on access IP you do not need to enter
+anything in username/password fields
 '''
+import time
+import urllib
 from calibre import strftime
 from calibre.web.feeds.news import BasicNewsRecipe
 
 class Harpers_full(BasicNewsRecipe):
     title                 = "Harper's Magazine - articles from printed edition"
     __author__            = 'Darko Miletic'
-    description           = "Harper's Magazine: Founded June 1850."
+    description           = "Harper's Magazine, the oldest general-interest monthly in America, explores the issues that drive our national conversation, through long-form narrative journalism and essays, and such celebrated features as the iconic Harper's Index."
     publisher             = "Harpers's"
     category              = 'news, politics, USA'
     oldest_article        = 30
@@ -21,52 +25,69 @@ class Harpers_full(BasicNewsRecipe):
     use_embedded_content  = False
     delay                 = 1
     language              = 'en'
-    needs_subscription    = True
-    masthead_url          = 'http://www.harpers.org/media/image/Harpers_305x100.gif'
+    encoding              = 'utf8'
+    needs_subscription    = 'optional'
+    masthead_url          = 'http://harpers.org/wp-content/themes/harpers/images/pheader.gif'
     publication_type      = 'magazine'
-    INDEX                 = strftime('http://www.harpers.org/archive/%Y/%m')
-    LOGIN                 = 'http://www.harpers.org'
-    cover_url             = strftime('http://www.harpers.org/media/pages/%Y/%m/gif/0001.gif')
-    extra_css             = ' body{font-family: "Georgia",serif} '
+    INDEX                 = strftime('http://harpers.org/archive/%Y/%m')
+    LOGIN                 = 'http://harpers.org/wp-content/themes/harpers/ajax_login.php'
+    extra_css             = """
+                            body{font-family: adobe-caslon-pro,serif}
+                            .category{font-size: small}
+                            .articlePost p:first-letter{display: inline; font-size: xx-large; font-weight: bold}
+                            """
 
     conversion_options = {
                           'comment'   : description
                         , 'tags'      : category
                         , 'publisher' : publisher
                         , 'language'  : language
                         }
 
-    keep_only_tags = [ dict(name='div', attrs={'id':'cached'}) ]
+    keep_only_tags = [ dict(name='div', attrs={'class':['postdetailFull','articlePost']}) ]
     remove_tags = [
-                     dict(name='table', attrs={'class':['rcnt','rcnt topline']})
-                    ,dict(name='link')
+                     dict(name='div', attrs={'class':'fRight rightDivPad'})
+                    ,dict(name=['link','meta','object','embed','iframe'])
                   ]
     remove_attributes=['xmlns']
 
     def get_browser(self):
         br = BasicNewsRecipe.get_browser()
+        br.open('http://harpers.org/')
         if self.username is not None and self.password is not None:
-            br.open(self.LOGIN)
-            br.select_form(nr=1)
-            br['handle'  ] = self.username
-            br['password'] = self.password
-            br.submit()
+            tt = time.localtime()*1000
+            data = urllib.urlencode({ 'm':self.username
+                                     ,'p':self.password
+                                     ,'rt':'http://harpers.org/'
+                                     ,'tt':tt
+                                    })
+            br.open(self.LOGIN, data)
         return br
 
     def parse_index(self):
         articles = []
         print 'Processing ' + self.INDEX
         soup = self.index_to_soup(self.INDEX)
-        for item in soup.findAll('div', attrs={'class':'title'}):
-            text_link = item.parent.find('img',attrs={'alt':'Text'})
-            if text_link:
-                url   = self.LOGIN + item.a['href']
-                title = item.a.contents[0]
-                date  = strftime(' %B %Y')
-                articles.append({
-                                  'title'      :title
-                                 ,'date'       :date
-                                 ,'url'        :url
-                                 ,'description':''
-                                })
+        count = 0
+        for item in soup.findAll('div', attrs={'class':'articleData'}):
+            text_links = item.findAll('h2')
+            for text_link in text_links:
+                if count == 0:
+                    lcover_url = item.find(attrs={'class':'dwpdf'})
+                    if lcover_url:
+                        self.cover_url = lcover_url.a['href']
+                    count = 1
+                else:
+                    url   = text_link.a['href']
+                    title = text_link.a.contents[0]
+                    date  = strftime(' %B %Y')
+                    articles.append({
+                                      'title'      :title
+                                     ,'date'       :date
+                                     ,'url'        :url
+                                     ,'description':''
+                                    })
         return [(soup.head.title.string, articles)]
+
+    def print_version(self, url):
+        return url + '?single=1'
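
A quick way to exercise an updated recipe like this one is calibre's ebook-convert tool; the recipe file name, output name and credentials below are placeholders, not part of the commit, and since needs_subscription is now 'optional' the --username/--password switches can be left out when access comes from an institutional IP:

    ebook-convert harpers_full.recipe harpers.epub --test -vv --username=user@example.com --password=secret

--test limits the fetch to a couple of articles per feed, which keeps the edit/run loop short while checking login, index parsing and the ?single=1 print version.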