mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
109 lines
4.5 KiB
Plaintext
109 lines
4.5 KiB
Plaintext
from calibre.web.feeds.news import BasicNewsRecipe
|
|
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
|
|
|
|
|
|
def new_tag(soup, name, attrs=()):
    """Create a new tag called *name* that belongs to *soup*.

    When *soup* exposes its own ``new_tag`` factory, that factory is used and
    *attrs* is handed over as a ``dict``. Otherwise the tag is built directly
    with the ``Tag`` constructor (presumably for BeautifulSoup versions that
    lack the factory), passing ``None`` when *attrs* is empty.

    *attrs* is an iterable of ``(key, value)`` pairs or a mapping.
    """
    factory = getattr(soup, 'new_tag', None)
    if factory is None:
        # No factory on the soup object: construct the tag ourselves.
        return Tag(soup, name, attrs=attrs or None)
    return factory(name, attrs=dict(attrs))
|
|
|
|
|
|
class FokkeEnSukkeRecipe(BasicNewsRecipe):
    """Recipe that collects the Fokke en Sukke cartoons listed on the index page.

    ``parse_index`` reads the cartoon selector on ``INDEX`` and produces one
    article per cartoon; ``preprocess_html`` reduces every downloaded page to
    the cartoon itself, preceded by a title taken from an image's ``alt`` text.
    """

    __license__ = 'GPL v3'
    __author__ = 'kwetal'
    language = 'nl'
    country = 'NL'
    version = 2

    title = u'Fokke en Sukke'
    publisher = u'Reid, Geleijnse & Van Tol'
    category = u'News, Cartoons'
    description = u'Popular Dutch daily cartoon Fokke en Sukke'

    conversion_options = {'comments': description,
                          'language': language, 'publisher': publisher}

    no_stylesheets = True
    extra_css = '''
        body{font-family: verdana, arial, helvetica, geneva, sans-serif ; margin: 0em; padding: 0em;}
        div.title {text-align: center; margin-bottom: 1em;}
    '''

    # Base URL; link hrefs found in the index are appended to it verbatim.
    INDEX = u'http://foksuk.nl'
    cover_url = 'http://foksuk.nl/content/wysiwyg/simpleimages/image350.gif'

    keep_only_tags = [dict(name='div', attrs={'class': 'cartoon'})]

    def parse_index(self):
        """Build the article list from the cartoon selector on the index page.

        Returns a one-element list ``[[feed_title, articles]]``. The feed
        title is the text of the ``span.week`` element inside the selector
        (falling back to the recipe title when that span is absent). Each
        article is a dict with ``title``, ``date``, ``url`` and
        ``description`` keys.

        Raises ``ValueError`` when the index page has no
        ``div.selectcartoon`` element.
        """
        # A list with daynames as they _can_ appear in the index
        day_names = ('maandag', 'dinsdag', 'woensdag',
                     'donderdag', 'vrijdag', 'zaterdag & zondag')
        soup = self.index_to_soup(self.INDEX)

        # Find the links for the various cartoons for this week
        index = soup.find('div', attrs={'class': 'selectcartoon'})
        if index is None:
            # Fail with a message that names the problem instead of an
            # AttributeError on None further down.
            raise ValueError(
                'Could not find the cartoon selector (div.selectcartoon) on '
                + self.INDEX)

        links = index.findAll('a')
        # Render and decode every link text once; the loop below looks at
        # each label up to three times.
        # NOTE(review): labels are compared against day_names as rendered
        # markup; confirm how '&' in 'zaterdag & zondag' is rendered.
        labels = [link.renderContents().decode('utf-8') for link in links]
        count = len(links)

        def make_article(article_title, link):
            return {'title': article_title, 'date': u'',
                    'url': self.INDEX + link['href'], 'description': ''}

        articles = []
        # The first link is deliberately skipped, exactly as before
        # (presumably it is not a cartoon link -- TODO confirm).
        for i in range(1, count):
            day = labels[i]
            # There can be more than one cartoon for a given day (currently
            # either one or two). If there's only one, there is just a link
            # with the dayname. If there are two, there are three links in
            # sequence: <a>dayname</a> <a>1</a> <a>2</a>. In that case we're
            # interested in the last two.
            if day not in day_names:
                # Not a dayname: a numbered link that was already handled
                # together with its preceding dayname link.
                continue

            if i + 1 < count and labels[i + 1] == '1':
                articles.append(make_article(day + ' 1', links[i + 1]))
                # If there is a '1', there should be a '2' as well, but
                # better safe than sorry
                if i + 2 < count and labels[i + 2] == '2':
                    articles.append(make_article(day + ' 2', links[i + 2]))
            else:
                # There is only one cartoon for this day.
                articles.append(make_article(day, links[i]))

        # Might as well use the weeknumber as title
        week_span = index.find('span', attrs={'class': 'week'})
        if week_span is not None:
            week = week_span.renderContents().decode('utf-8')
        else:
            # Do not throw away the collected articles just because the
            # week label is missing.
            week = self.title

        return [[week, articles]]

    def preprocess_html(self, soup):
        """Reduce a cartoon page to a fresh document holding only the cartoon.

        A ``div.title`` containing the ``alt`` text of the first image that
        has one (or an empty string) is inserted at the top of the cartoon
        div, the embedded ``div.selectcartoon`` index is removed, and the
        cartoon div is moved into a new, otherwise empty soup, which is
        returned. When the page has no ``div.cartoon`` the soup is returned
        unchanged.
        """
        cartoon = soup.find('div', attrs={'class': 'cartoon'})
        if cartoon is None:
            # Nothing to restructure; avoid crashing on None below.
            return soup

        title = ''
        img = soup.find('img', attrs={'alt': True})
        if img is not None:
            title = img['alt']

        tag = new_tag(soup, 'div', [('class', 'title')])
        tag.insert(0, title)
        cartoon.insert(0, tag)

        # We only want the cartoon, so throw out the index
        select = cartoon.find('div', attrs={'class': 'selectcartoon'})
        if select is not None:
            select.extract()

        freshSoup = self.getFreshSoup(soup)
        freshSoup.body.append(cartoon)

        return freshSoup

    def getFreshSoup(self, oldSoup):
        """Return an empty HTML document that carries over *oldSoup*'s title.

        The new soup has an empty ``<title>`` and ``<body>``; the title text
        is copied only when *oldSoup* has both a ``<head>`` and a ``<title>``.
        """
        freshSoup = BeautifulSoup(
            '<html><head><title></title></head><body></body></html>')
        old_head = oldSoup.head
        # A document without <head> yields None here; guard before .title.
        if old_head is not None and old_head.title is not None:
            freshSoup.head.title.append(self.tag_to_string(old_head.title))
        return freshSoup
|