De Volksrant (subscriber version) by Selcal
This commit is contained in:
parent a36e9c4243
commit 92de7e1807
recipes/volksrant_sub.recipe (new file, 115 lines)
@@ -0,0 +1,115 @@
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe

class Volkskrant_full(BasicNewsRecipe):
    # This recipe downloads the Volkskrant newspaper from the
    # subscribers' site. It requires a password.
    # Known issues: articles that are spread out over multiple
    # pages will appear multiple times, and pages that contain
    # only adverts will appear, but empty.
    # The supplement 'Volkskrant Magazine' on Saturday is
    # currently not downloaded.
    # You can set a manual date to download an archived newspaper;
    # Volkskrant stores over a month of issues at the time of
    # writing. To do so, uncomment the date in the title below and
    # adjust it, then follow the instructions marked further below.

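    # For example (illustrative date only), with a manual date the
    # title line below would become:
    #   title = 'De Volkskrant (subscription) [za, 13 nov 2010]'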
    title = 'De Volkskrant (subscription)'  # [za, 13 nov 2010]'
    __author__ = u'Selcal'
    description = u"Volkskrant"
    oldest_article = 30
    max_articles_per_feed = 100
    no_stylesheets = True
    language = 'nl'
    use_embedded_content = False
    simultaneous_downloads = 1
    delay = 1
    needs_subscription = True
    # Set RETRIEVEDATE to 'yyyymmdd' to load an older edition;
    # otherwise keep the '%Y%m%d' default.
    # When setting a manual date, also add the date to the title
    # above and uncomment the timefmt line below to stop calibre
    # from adding today's date as well.

    # timefmt = ''
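    # For example (illustrative date only), to pull the archived
    # edition of 13 November 2010 you would replace the strftime()
    # call below with a hard-coded date:
    #   RETRIEVEDATE = '20101113'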
    RETRIEVEDATE = strftime('%Y%m%d')
    INDEX_MAIN = 'http://www.volkskrant.nl/vk-online/VK/' + RETRIEVEDATE + '___/VKN01_001/#text'
    INDEX_ARTICLE = 'http://www.volkskrant.nl/vk-online/VK/' + RETRIEVEDATE + '___/VKN01_001/'
    LOGIN = 'http://www.volkskrant.nl/vk/user/loggedIn.do'
    remove_tags = [dict(name='address')]
    cover_url = 'http://www.volkskrant.nl/vk-online/VK/' + RETRIEVEDATE + '___/VKN01_001/page.jpg'

    def get_browser(self):
        br = BasicNewsRecipe.get_browser()

        # Log in to the subscriber site when credentials have been supplied.
        if self.username is not None and self.password is not None:
            br.open(self.LOGIN)
            br.select_form(nr=0)
            br['username'] = self.username
            br['password'] = self.password
            br.submit()
        return br

    def parse_index(self):
        krant = []

        def strip_title(_title):
            # Return the part of the title up to the first colon.
            i = 0
            while i < len(_title) and _title[i] != ':':
                i = i + 1
            return _title[0:i]
        # Retry loading the main index up to five times.
        for temp in range(5):
            try:
                soup = self.index_to_soup(self.INDEX_MAIN)
                break
            except:
                # print '(Retrying main index load)'
                continue
        mainsoup = soup.find('td', attrs={'id': 'select_page_top'})
        # Every option in the page selector is one page/section of the paper.
        for option in mainsoup.findAll('option'):
            articles = []
            _INDEX = 'http://www.volkskrant.nl/vk-online/VK/' + self.RETRIEVEDATE + '___/' + option['value'] + '/#text'
            _INDEX_ARTICLE = 'http://www.volkskrant.nl/vk-online/VK/' + self.RETRIEVEDATE + '___/' + option['value'] + '/'
            # print ''
            # print '<------- Processing section: ' + _INDEX + ' ------------------------->'
            for temp in range(5):
                try:
                    soup = self.index_to_soup(_INDEX)
                    break
                except:
                    # print '(Retrying index load)'
                    continue
            for item in soup.findAll('area'):
                art_nr = item['class']
                # Build the class name of the div holding this article's
                # title from the image-map area class and the page id.
                attrname = art_nr[0:12] + '_section' + option['value'][0:5] + '_' + art_nr[26:len(art_nr)]
                # print '==> Found: ' + attrname
                index_title = soup.find('div', attrs={'class': attrname})
                get_title = index_title['title']
                _ARTICLE = _INDEX_ARTICLE + attrname + '.html#text'
                title = get_title
                # print '--> Title: ' + title
                # print '--> URL: ' + _ARTICLE
                for temp in range(5):
                    try:
                        souparticle = self.index_to_soup(_ARTICLE)
                        break
                    except:
                        print '(Retrying URL load)'
                        continue
                headerurl = souparticle.findAll('frame')[0]['src']
                # print '--> Read frame name for header: ' + headerurl
                # Strip the header-frame suffix and point at the matching
                # '_text' frame instead.
                url = _INDEX_ARTICLE + headerurl[0:len(headerurl) - 12] + '_text.html'
                # print '--> Corrected URL: ' + url
                if get_title != '':
                    title = strip_title(get_title)
                    date = strftime(' %B %Y')
                if title != '':
                    articles.append({
                        'title': title,
                        'date': date,
                        'url': url,
                        'description': ''
                    })
            krant.append((option.string, articles))
        return krant