De Volksrant (subscriber version) by Selcal

This commit is contained in:
parent a36e9c4243
commit 92de7e1807

recipes/volksrant_sub.recipe (new file, 115 lines)

@@ -0,0 +1,115 @@
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe


class Volkskrant_full(BasicNewsRecipe):
    # This recipe downloads the Volkskrant newspaper from the
    # subscribers' site. It requires a password.
    # Known issues: articles that are spread out over multiple
    # pages will appear multiple times, and pages that contain
    # only adverts will appear, but empty. The supplement
    # 'Volkskrant Magazine' on Saturday is currently not downloaded.
    # You can set a manual date to download an archived newspaper;
    # Volkskrant keeps over a month of editions online at the
    # moment of writing. To do so, insert the date in the title
    # below and follow the instructions marked further below.

    title = 'De Volkskrant (subscription)'  # [za, 13 nov 2010]'
    __author__ = u'Selcal'
    description = u"Volkskrant"
    oldest_article = 30
    max_articles_per_feed = 100
    no_stylesheets = True
    language = 'nl'
    use_embedded_content = False
    simultaneous_downloads = 1
    delay = 1
    needs_subscription = True

    # Set RETRIEVEDATE to 'yyyymmdd' to load an older edition;
    # otherwise keep '%Y%m%d'. When setting a manual date, add
    # the date to the title above and uncomment the timefmt line
    # below to stop calibre from appending today's date as well.

    # timefmt = ''
    RETRIEVEDATE = strftime('%Y%m%d')
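    # A minimal sketch of the manual-date setup described above, using
    # the archived edition hinted at in the title comment (Saturday
    # 13 November 2010) purely as an example date:
    #
    #     title = 'De Volkskrant (subscription) [za, 13 nov 2010]'
    #     timefmt = ''
    #     RETRIEVEDATE = '20101113'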
    INDEX_MAIN = 'http://www.volkskrant.nl/vk-online/VK/' + RETRIEVEDATE + '___/VKN01_001/#text'
    INDEX_ARTICLE = 'http://www.volkskrant.nl/vk-online/VK/' + RETRIEVEDATE + '___/VKN01_001/'
    LOGIN = 'http://www.volkskrant.nl/vk/user/loggedIn.do'
    remove_tags = [dict(name='address')]
    cover_url = 'http://www.volkskrant.nl/vk-online/VK/' + RETRIEVEDATE + '___/VKN01_001/page.jpg'

    def get_browser(self):
        br = BasicNewsRecipe.get_browser()

        if self.username is not None and self.password is not None:
            # Log in to the subscriber site with the credentials
            # supplied to calibre before any pages are fetched.
            br.open(self.LOGIN)
            br.select_form(nr=0)
            br['username'] = self.username
            br['password'] = self.password
            br.submit()
        return br
    def parse_index(self):
        krant = []

        def strip_title(_title):
            # Return the part of the section title before the first ':'.
            i = 0
            while i < len(_title) and _title[i] != ':':
                i = i + 1
            return _title[0:i]

        # The index pages load unreliably, so retry up to five times.
        for temp in range(5):
            try:
                soup = self.index_to_soup(self.INDEX_MAIN)
                break
            except:
                # print '(Retrying main index load)'
                continue
        mainsoup = soup.find('td', attrs={'id': 'select_page_top'})
        for option in mainsoup.findAll('option'):
            # Each option in the page selector is one section (page) of the paper.
            articles = []
            _INDEX = 'http://www.volkskrant.nl/vk-online/VK/' + self.RETRIEVEDATE + '___/' + option['value'] + '/#text'
            _INDEX_ARTICLE = 'http://www.volkskrant.nl/vk-online/VK/' + self.RETRIEVEDATE + '___/' + option['value'] + '/'
            # print ''
            # print '<------- Processing section: ' + _INDEX + ' ------------------------->'
            for temp in range(5):
                try:
                    soup = self.index_to_soup(_INDEX)
                    break
                except:
                    # print '(Retrying index load)'
                    continue
            for item in soup.findAll('area'):
                # Each 'area' element on the section page corresponds to one article.
                art_nr = item['class']
                attrname = art_nr[0:12] + '_section' + option['value'][0:5] + '_' + art_nr[26:len(art_nr)]
                # print '==> Found: ' + attrname
                index_title = soup.find('div', attrs={'class': attrname})
                get_title = index_title['title']
                _ARTICLE = _INDEX_ARTICLE + attrname + '.html#text'
                title = get_title
                # print '--> Title: ' + title
                # print '--> URL: ' + _ARTICLE
                for temp in range(5):
                    try:
                        souparticle = self.index_to_soup(_ARTICLE)
                        break
                    except:
                        print('(Retrying URL load)')
                        continue
                # The article page is a frameset; rewrite the frame URL so it
                # points at the '_text.html' (text) version of the article.
                headerurl = souparticle.findAll('frame')[0]['src']
                # print '--> Read frame name for header: ' + headerurl
                url = _INDEX_ARTICLE + headerurl[0:len(headerurl) - 12] + '_text.html'
                # print '--> Corrected URL: ' + url
                if get_title != '':
                    title = strip_title(get_title)
                date = strftime(' %B %Y')
                if title != '':
                    articles.append({
                        'title': title,
                        'date': date,
                        'url': url,
                        'description': ''
                    })
            krant.append((option.string, articles))
        return krant
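A note on usage: because the recipe sets needs_subscription = True, calibre asks for the Volkskrant credentials when the recipe is added and scheduled in the GUI. For testing from the command line, something along the lines of the following should work (the placeholders stand for your own account details):

ebook-convert volksrant_sub.recipe .epub --username <username> --password <password>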