mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
149 lines
6.3 KiB
Plaintext
149 lines
6.3 KiB
Plaintext
from calibre.web.feeds.news import BasicNewsRecipe
|
|
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
|
|
|
|
|
|
def new_tag(soup, name, attrs=()):
    """Create a new tag called *name* that belongs to *soup*.

    If looking up ``new_tag`` on *soup* yields something other than ``None``,
    that callable builds the tag and receives *attrs* converted to a dict.
    Otherwise the tag is built by calling ``Tag`` directly, with an empty
    *attrs* passed on as ``None``.

    :param soup: the document object the tag is created for
    :param name: tag name, e.g. ``'h2'``
    :param attrs: iterable of ``(attribute, value)`` pairs
    :return: the newly created tag (not yet inserted anywhere)
    """
    factory = getattr(soup, 'new_tag', None)
    if factory is None:
        # No usable factory on the soup object: construct the tag by hand.
        return Tag(soup, name, attrs=attrs or None)
    return factory(name, attrs=dict(attrs))
|
|
|
|
|
|
class NrcNextRecipe(BasicNewsRecipe):
    """Recipe that collects articles from the nrcnext website.

    ``parse_index`` scrapes a fixed set of section pages for article links and
    ``preprocess_html`` reduces every article page to its ``div.post``
    element, placed in an otherwise empty document.
    """

    __license__ = 'GPL v3'
    __author__ = 'kwetal'
    language = 'nl'
    country = 'NL'
    version = 2

    title = u'nrcnext'
    publisher = u'NRC Media'
    category = u'News, Opinion, the Netherlands'
    description = u'Dutch newsblog from the Dutch daily newspaper nrcnext.'

    conversion_options = {'comments': description,
                          'language': language, 'publisher': publisher}

    no_stylesheets = True
    remove_javascript = True

    keep_only_tags = [dict(name='div', attrs={'id': 'main'})]

    # Page furniture that is dropped from every article: metadata, sharing
    # and navigation widgets, reader comments and category lists.
    remove_tags = [
        dict(name='div', attrs={'class': 'meta'}),
        dict(name='p', attrs={'class': 'meta'}),
        dict(name='div', attrs={'class': 'datumlabel'}),
        dict(name='div', attrs={'class': 'sharing-is-caring'}),
        dict(name='div', attrs={'class': 'navigation'}),
        dict(name='div', attrs={'class': 'reageer'}),
        dict(name='div', attrs={
            'class': 'comment odd alt thread-odd thread-alt depth-1 reactie '}),
        dict(name='div', attrs={
            'class': 'comment even thread-even depth-1 reactie '}),
        dict(name='ul', attrs={'class': 'cats single'}),
        dict(name='ul', attrs={'class': 'cats onderwerpen'}),
        dict(name='ul', attrs={'class': 'cats rubrieken'}),
        dict(name='h3', attrs={'class': 'reacties'}),
    ]

    extra_css = '''
        body {font-family: verdana, arial, helvetica, geneva, sans-serif; text-align: left;}
        p.wp-caption-text {font-size: x-small; color: #666666;}
        h2.sub_title {font-size: medium; color: #696969;}
        h2.vlag {font-size: small; font-weight: bold;}
        '''

    def parse_index(self):
        """Build the list of sections and their articles from the website.

        :return: a list of ``(section title, articles)`` tuples, where
            ``articles`` is a list of dicts with the keys ``title``,
            ``date``, ``url`` and ``description``. Sections for which no
            article was found are left out.
        """
        # Use the website as an index. Their RSS feeds can be out of date.
        # (The 'city life in afrika' section used to be listed here too, but
        # is disabled.)
        feeds = {
            u'columnisten': u'http://www.nrcnext.nl/columnisten/',
            u'koken': u'http://www.nrcnext.nl/koken/',
            u'geld & werk': u'http://www.nrcnext.nl/geld-en-werk/',
            u'vandaag': u'http://www.nrcnext.nl',
        }
        # Position of each section in the final result, lowest first.
        section_order = {u'columnisten': 1, u'koken': 3,
                         u'geld & werk': 2, u'vandaag': 0}

        articles = {}
        indices = []

        for index, feed in feeds.items():
            soup = self.index_to_soup(feed)

            for post in soup.findAll(True, attrs={'class': 'post '}):
                # Find the link to the actual article and remember the
                # location it points to and its title.
                a = post.find('a', attrs={'rel': 'bookmark'})
                href = a.get('href') if a is not None else None
                if not href:
                    # A post without a usable permalink cannot be
                    # downloaded; skip it instead of aborting the whole run.
                    continue
                title = self.tag_to_string(a)

                article = {'title': self._complete_title(index, post, title),
                           'date': u'', 'url': href,
                           'description': '<p> </p>'}
                articles.setdefault(index, []).append(article)

            # Remember the section title, even when it has no articles yet.
            indices.append(index)

        # Now, sort the section titles in the order they appear on the
        # website and apply this order to the actual list of sections and
        # articles.
        indices = self.sort_index_by(indices, section_order)
        return [(key, articles[key]) for key in indices if key in articles]

    def _complete_title(self, index, post, title):
        """Return *title*, prefixed with the author for columnist posts."""
        if index != 'columnisten':
            return title
        # In this feed/page articles can be written by more than one author.
        # It is nice to see their names in the titles.
        flag = post.find('h2', attrs={'class': 'vlag'})
        if flag is None or not flag.contents:
            # No author header on this post: use the bare title.
            return title
        author = flag.contents[0].renderContents().decode('utf-8')
        return u''.join([author, u': ', title])

    def _replace_heading(self, soup, old, name, attrs=()):
        """Replace *old* by a new *name* tag that holds only its text."""
        fresh = new_tag(soup, name, attrs=attrs)
        fresh.append(self.tag_to_string(old))
        old.replaceWith(fresh)

    def preprocess_html(self, soup):
        """Reduce an article page to its post, with simplified headings.

        :param soup: parsed article page
        :return: a new, minimal document whose body holds the cleaned
            ``div.post`` element; the unmodified *soup* when the page does
            not have the expected single-article layout.
        """
        if soup.find('div', attrs={'id': 'main', 'class': 'single'}) is None:
            # This should never happen and other famous last words...
            return soup

        tag = soup.find('div', attrs={'class': 'post'})
        if tag is None:
            # Always hand back a document: returning None here would leave
            # the caller without anything to work on.
            return soup

        # The author header (h2.vlag) wins; any other h2 becomes a subtitle.
        h2 = tag.find('h2', 'vlag')
        if h2 is not None:
            self._replace_heading(soup, h2, 'h2', attrs=[('class', 'vlag')])
        else:
            h2 = tag.find('h2')
            if h2 is not None:
                self._replace_heading(
                    soup, h2, 'h2', attrs=[('class', 'sub_title')])

        h1 = tag.find('h1')
        if h1 is not None:
            self._replace_heading(soup, h1, 'h1')

        # Embedded movies and iframes slow down my reader.
        unwanted = (
            ('span', {'class': 'vvqbox vvqvimeo'}),
            ('span', {'class': 'vvqbox vvqyoutube'}),
            ('iframe', {}),
        )
        for name, attrs in unwanted:
            for element in tag.findAll(name, attrs=attrs):
                element.extract()

        fresh_soup = self.getFreshSoup(soup)
        fresh_soup.body.append(tag)
        return fresh_soup

    def getFreshSoup(self, oldSoup):
        """Return an empty HTML document that carries over the old title.

        :param oldSoup: document whose ``<title>`` text is copied, if it has
            one
        :return: a new document with an empty body
        """
        freshSoup = BeautifulSoup(
            '<html><head><title></title></head><body></body></html>')
        # A page without <head> must not raise; it simply has no title.
        old_head = oldSoup.head
        old_title = old_head.title if old_head is not None else None
        if old_title is not None:
            freshSoup.head.title.append(self.tag_to_string(old_title))
        return freshSoup
|