De Volksrant (subscriber version) by Selcal

This commit is contained in:
parent a36e9c4243
commit 92de7e1807

recipes/volksrant_sub.recipe (new file, 115 lines)

@@ -0,0 +1,115 @@
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe


class Volkskrant_full(BasicNewsRecipe):
    # This recipe downloads the Volkskrant newspaper from the
    # subscribers' site. It requires a password.
    # Known issues: articles that are spread out over multiple
    # pages will appear multiple times, and pages that contain
    # only adverts will appear, but empty. The supplement
    # 'Volkskrant Magazine' on Saturday is currently not downloaded.
    # You can set a manual date to download an archived newspaper;
    # Volkskrant keeps over a month of editions online at the
    # moment of writing. To do so, insert the date in the title
    # below and follow the instructions marked further below.

    title = 'De Volkskrant (subscription)'  # [za, 13 nov 2010]'
    __author__ = u'Selcal'
    description = u"Volkskrant"
    oldest_article = 30
    max_articles_per_feed = 100
    no_stylesheets = True
    language = 'nl'
    use_embedded_content = False
    simultaneous_downloads = 1
    delay = 1
    needs_subscription = True

    # Set RETRIEVEDATE to 'yyyymmdd' to load an older edition;
    # otherwise keep '%Y%m%d'. When setting a manual date, add
    # the date to the title above and uncomment the timefmt line
    # below to stop calibre from appending today's date as well.

    # timefmt = ''
    RETRIEVEDATE = strftime('%Y%m%d')
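    # A minimal sketch of the manual-date setup described above, using
    # the archived edition hinted at in the title comment (Saturday
    # 13 November 2010) purely as an example date:
    #
    #     title = 'De Volkskrant (subscription) [za, 13 nov 2010]'
    #     timefmt = ''
    #     RETRIEVEDATE = '20101113'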
    INDEX_MAIN = 'http://www.volkskrant.nl/vk-online/VK/' + RETRIEVEDATE + '___/VKN01_001/#text'
    INDEX_ARTICLE = 'http://www.volkskrant.nl/vk-online/VK/' + RETRIEVEDATE + '___/VKN01_001/'
    LOGIN = 'http://www.volkskrant.nl/vk/user/loggedIn.do'
    remove_tags = [dict(name='address')]
    cover_url = 'http://www.volkskrant.nl/vk-online/VK/' + RETRIEVEDATE + '___/VKN01_001/page.jpg'

    def get_browser(self):
        br = BasicNewsRecipe.get_browser()

        if self.username is not None and self.password is not None:
            # Log in to the subscriber site with the credentials
            # supplied to calibre before any pages are fetched.
            br.open(self.LOGIN)
            br.select_form(nr=0)
            br['username'] = self.username
            br['password'] = self.password
            br.submit()
        return br
    def parse_index(self):
        krant = []

        def strip_title(_title):
            # Return the part of the section title before the first ':'.
            i = 0
            while i < len(_title) and _title[i] != ':':
                i = i + 1
            return _title[0:i]

        # The index pages load unreliably, so retry up to five times.
        for temp in range(5):
            try:
                soup = self.index_to_soup(self.INDEX_MAIN)
                break
            except:
                # print '(Retrying main index load)'
                continue
        mainsoup = soup.find('td', attrs={'id': 'select_page_top'})
        for option in mainsoup.findAll('option'):
            # Each option in the page selector is one section (page) of the paper.
            articles = []
            _INDEX = 'http://www.volkskrant.nl/vk-online/VK/' + self.RETRIEVEDATE + '___/' + option['value'] + '/#text'
            _INDEX_ARTICLE = 'http://www.volkskrant.nl/vk-online/VK/' + self.RETRIEVEDATE + '___/' + option['value'] + '/'
            # print ''
            # print '<------- Processing section: ' + _INDEX + ' ------------------------->'
            for temp in range(5):
                try:
                    soup = self.index_to_soup(_INDEX)
                    break
                except:
                    # print '(Retrying index load)'
                    continue
            for item in soup.findAll('area'):
                # Each 'area' element on the section page corresponds to one article.
                art_nr = item['class']
                attrname = art_nr[0:12] + '_section' + option['value'][0:5] + '_' + art_nr[26:len(art_nr)]
                # print '==> Found: ' + attrname
                index_title = soup.find('div', attrs={'class': attrname})
                get_title = index_title['title']
                _ARTICLE = _INDEX_ARTICLE + attrname + '.html#text'
                title = get_title
                # print '--> Title: ' + title
                # print '--> URL: ' + _ARTICLE
                for temp in range(5):
                    try:
                        souparticle = self.index_to_soup(_ARTICLE)
                        break
                    except:
                        print('(Retrying URL load)')
                        continue
                # The article page is a frameset; rewrite the frame URL so it
                # points at the '_text.html' (text) version of the article.
                headerurl = souparticle.findAll('frame')[0]['src']
                # print '--> Read frame name for header: ' + headerurl
                url = _INDEX_ARTICLE + headerurl[0:len(headerurl) - 12] + '_text.html'
                # print '--> Corrected URL: ' + url
                if get_title != '':
                    title = strip_title(get_title)
                date = strftime(' %B %Y')
                if title != '':
                    articles.append({
                        'title': title,
                        'date': date,
                        'url': url,
                        'description': ''
                    })
            krant.append((option.string, articles))
        return krant
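A note on usage: because the recipe sets needs_subscription = True, calibre asks for the Volkskrant credentials when the recipe is added and scheduled in the GUI. For testing from the command line, something along the lines of the following should work (the placeholders stand for your own account details):

ebook-convert volksrant_sub.recipe .epub --username <username> --password <password>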