Fix #1093700 (Update recipe for Harper's magazine articles from printed edition)

Author: Kovid Goyal
Date:   2012-12-26 08:17:15 +05:30
parent abafe5c184
commit 55b8481131


@@ -1,18 +1,22 @@
 __license__   = 'GPL v3'
-__copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
+__copyright__ = '2008-2012, Darko Miletic <darko.miletic at gmail.com>'
 '''
 harpers.org - paid subscription/ printed issue articles
 This recipe only get's article's published in text format
 images and pdf's are ignored
+If you have institutional subscription based on access IP you do not need to enter
+anything in username/password fields
 '''
+import time
+import urllib
 from calibre import strftime
 from calibre.web.feeds.news import BasicNewsRecipe
 
 class Harpers_full(BasicNewsRecipe):
     title                 = "Harper's Magazine - articles from printed edition"
     __author__            = 'Darko Miletic'
-    description           = "Harper's Magazine: Founded June 1850."
+    description           = "Harper's Magazine, the oldest general-interest monthly in America, explores the issues that drive our national conversation, through long-form narrative journalism and essays, and such celebrated features as the iconic Harper's Index."
     publisher             = "Harpers's"
     category              = 'news, politics, USA'
     oldest_article        = 30
@@ -21,52 +25,69 @@ class Harpers_full(BasicNewsRecipe):
     use_embedded_content  = False
     delay                 = 1
     language              = 'en'
-    needs_subscription    = True
-    masthead_url          = 'http://www.harpers.org/media/image/Harpers_305x100.gif'
+    encoding              = 'utf8'
+    needs_subscription    = 'optional'
+    masthead_url          = 'http://harpers.org/wp-content/themes/harpers/images/pheader.gif'
     publication_type      = 'magazine'
-    INDEX                 = strftime('http://www.harpers.org/archive/%Y/%m')
-    LOGIN                 = 'http://www.harpers.org'
-    cover_url             = strftime('http://www.harpers.org/media/pages/%Y/%m/gif/0001.gif')
-    extra_css             = ' body{font-family: "Georgia",serif} '
+    INDEX                 = strftime('http://harpers.org/archive/%Y/%m')
+    LOGIN                 = 'http://harpers.org/wp-content/themes/harpers/ajax_login.php'
+    extra_css             = """
+                            body{font-family: adobe-caslon-pro,serif}
+                            .category{font-size: small}
+                            .articlePost p:first-letter{display: inline; font-size: xx-large; font-weight: bold}
+                            """
 
     conversion_options = {
                           'comment'   : description
                         , 'tags'      : category
                         , 'publisher' : publisher
                         , 'language'  : language
                         }
 
-    keep_only_tags = [ dict(name='div', attrs={'id':'cached'}) ]
+    keep_only_tags = [ dict(name='div', attrs={'class':['postdetailFull','articlePost']}) ]
     remove_tags = [
-                     dict(name='table', attrs={'class':['rcnt','rcnt topline']})
-                    ,dict(name='link')
+                     dict(name='div', attrs={'class':'fRight rightDivPad'})
+                    ,dict(name=['link','meta','object','embed','iframe'])
                   ]
     remove_attributes=['xmlns']
 
     def get_browser(self):
         br = BasicNewsRecipe.get_browser()
+        br.open('http://harpers.org/')
         if self.username is not None and self.password is not None:
-            br.open(self.LOGIN)
-            br.select_form(nr=1)
-            br['handle'  ] = self.username
-            br['password'] = self.password
-            br.submit()
+            tt = time.localtime()*1000
+            data = urllib.urlencode({ 'm':self.username
+                                     ,'p':self.password
+                                     ,'rt':'http://harpers.org/'
+                                     ,'tt':tt
+                                    })
+            br.open(self.LOGIN, data)
         return br
 
     def parse_index(self):
         articles = []
         print 'Processing ' + self.INDEX
         soup = self.index_to_soup(self.INDEX)
-        for item in soup.findAll('div', attrs={'class':'title'}):
-            text_link = item.parent.find('img',attrs={'alt':'Text'})
-            if text_link:
-                url   = self.LOGIN + item.a['href']
-                title = item.a.contents[0]
-                date  = strftime(' %B %Y')
-                articles.append({
-                                  'title'      :title
-                                 ,'date'       :date
-                                 ,'url'        :url
-                                 ,'description':''
-                                })
+        count = 0
+        for item in soup.findAll('div', attrs={'class':'articleData'}):
+            text_links = item.findAll('h2')
+            for text_link in text_links:
+                if count == 0:
+                    lcover_url = item.find(attrs={'class':'dwpdf'})
+                    if lcover_url:
+                        self.cover_url = lcover_url.a['href']
+                    count = 1
+                else:
+                    url   = text_link.a['href']
+                    title = text_link.a.contents[0]
+                    date  = strftime(' %B %Y')
+                    articles.append({
+                                      'title'      :title
+                                     ,'date'       :date
+                                     ,'url'        :url
+                                     ,'description':''
+                                    })
         return [(soup.head.title.string, articles)]
+
+    def print_version(self, url):
+        return url + '?single=1'
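
A quick way to exercise an updated recipe like this one is calibre's ebook-convert tool; the recipe file name, output name and credentials below are placeholders, not part of the commit, and since needs_subscription is now 'optional' the --username/--password switches can be left out when access comes from an institutional IP:

    ebook-convert harpers_full.recipe harpers.epub --test -vv --username=user@example.com --password=secret

--test limits the fetch to a couple of articles per feed, which keeps the edit/run loop short while checking login, index parsing and the ?single=1 print version.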