diff --git a/resources/images/news/joop.png b/resources/images/news/joop.png
new file mode 100644
index 0000000000..0ea5e422e1
Binary files /dev/null and b/resources/images/news/joop.png differ
diff --git a/resources/images/news/nrcnext.png b/resources/images/news/nrcnext.png
new file mode 100644
index 0000000000..1349755925
Binary files /dev/null and b/resources/images/news/nrcnext.png differ
diff --git a/resources/quick_start.epub b/resources/quick_start.epub
new file mode 100644
index 0000000000..a70f9f13ec
Binary files /dev/null and b/resources/quick_start.epub differ
diff --git a/resources/recipes/fokkeensukke.recipe b/resources/recipes/fokkeensukke.recipe
index 3ddbe1cfe5..76a4aa39b9 100644
--- a/resources/recipes/fokkeensukke.recipe
+++ b/resources/recipes/fokkeensukke.recipe
@@ -1,23 +1,29 @@
-#!/usr/bin/python
from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import Tag
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class FokkeEnSukkeRecipe(BasicNewsRecipe) :
__license__ = 'GPL v3'
__author__ = 'kwetal'
language = 'nl'
- description = u'Popular Dutch daily cartoon Fokke en Sukke'
+ country = 'NL'
+ version = 2
title = u'Fokke en Sukke'
- no_stylesheets = True
- # For reasons unknown to me the extra css is, on the cartoon pages, inserted in the <body> and not in the <head>. My reader (Sony PRS-600) has a serious issue
- # with that: it treats it as content and displays it as is. Setting this property to empty solves this for me.
- template_css = ''
- INDEX = u'http://foksuk.nl'
+ publisher = u'Reid, Geleijnse & Van Tol'
+ category = u'News, Cartoons'
+ description = u'Popular Dutch daily cartoon Fokke en Sukke'
- # This cover is not as nice as it could be, needs some work
- #cover_url = 'http://foksuk.nl/content/wysiwyg/simpleimages/image350.gif'
+ conversion_options = {'comments': description, 'language': language, 'publisher': publisher}
+
+ no_stylesheets = True
+ extra_css = '''
+ body{font-family: verdana, arial, helvetica, geneva, sans-serif ; margin: 0em; padding: 0em;}
+ div.title {text-align: center; margin-bottom: 1em;}
+ '''
+
+ INDEX = u'http://foksuk.nl'
+ cover_url = 'http://foksuk.nl/content/wysiwyg/simpleimages/image350.gif'
keep_only_tags = [dict(name='div', attrs={'class' : 'cartoon'})]
@@ -31,15 +37,14 @@ class FokkeEnSukkeRecipe(BasicNewsRecipe) :
links = index.findAll('a')
maxIndex = len(links) - 1
articles = []
- for i in range(len(links)) :
- # The first link does not interest us, as it points to no cartoon. A begin_at parameter in the range() function would be nice.
- if i == 0 :
- continue
-
- # There can be more than one cartoon for a given day (currently either one or two). If there's only one, there is just a link with the dayname.
- # If there are two, there are three links in sequence: dayname 1 2. In that case we're interested in the last two.
+ for i in range(1, len(links)) :
+ # There can be more than one cartoon for a given day (currently either one or two).
+ # If there's only one, there is just a link with the dayname.
+ # If there are two, there are three links in sequence: dayname 1 2.
+ # In that case we're interested in the last two.
if links[i].renderContents() in dayNames :
- # If the link is not in daynames, we processed it already, but if it is, let's see if the next one has '1' as content
+ # If the link is not in daynames, we processed it already, but if it is, let's see
+ # if the next one has '1' as content
if (i + 1 <= maxIndex) and (links[i + 1].renderContents() == '1') :
# Got you! Add it to the list
article = {'title' : links[i].renderContents() + ' 1', 'date' : u'', 'url' : self.INDEX + links[i + 1]['href'], 'description' : ''}
@@ -59,29 +64,31 @@ class FokkeEnSukkeRecipe(BasicNewsRecipe) :
return [[week, articles]]
def preprocess_html(self, soup) :
- # This method is called for every page, be it cartoon or TOC. We need to process each in their own way
cartoon = soup.find('div', attrs={'class' : 'cartoon'})
- if cartoon :
- # It is a cartoon. Extract the title.
- title = ''
- img = soup.find('img', attrs = {'alt' : True})
- if img :
- title = img['alt']
- # Using the 'extra_css' displays it in the <body> and not in the <head>. See comment at the top of this class. Setting the style this way solves that.
- tag = Tag(soup, 'div', [('style', 'text-align: center; margin-bottom: 8px')])
- tag.insert(0, title)
- cartoon.insert(0, tag)
+ title = ''  # fallback when no image with an alt attribute is found
+ img = soup.find('img', attrs = {'alt' : True})  # first <img> in the page that carries an alt attribute
+ if img :
+ title = img['alt']  # the alt text is used as the cartoon's title
- # I have not quite worked out why, but we have to throw out this part of the page. It contains the very same index we processed earlier,
- # and Calibre does not like that too much. As far as I can tell it goes into recursion and the result is an empty eBook.
- select = cartoon.find('div', attrs={'class' : 'selectcartoon'})
- if select :
- select.extract()
+ tag = Tag(soup, 'div', [('class', 'title')])  # styled by the div.title rule in extra_css
+ tag.insert(0, title)
+ cartoon.insert(0, tag)  # NOTE(review): the old `if cartoon` guard is removed; assumes every fetched page contains div.cartoon - confirm
- return cartoon
- else :
- # It is a TOC. Just return the whole lot.
- return soup
+ # We only want the cartoon, so throw out the index
+ select = cartoon.find('div', attrs={'class' : 'selectcartoon'})
+ if select :
+ select.extract()
+
+ freshSoup = self.getFreshSoup(soup)  # new document built by getFreshSoup() from the old page
+ freshSoup.body.append(cartoon)  # only the cartoon div is moved into it
+
+ return freshSoup
+
+    def getFreshSoup(self, oldSoup):  # Build an empty page that keeps only the <title> text of oldSoup.
+        freshSoup = BeautifulSoup('<html><head><title></title></head><body></body></html>')  # skeleton: .head.title is filled here, .body by the caller
+        if oldSoup.head is not None and oldSoup.head.title is not None:  # tolerate pages without <head> or <title>
+            freshSoup.head.title.append(self.tag_to_string(oldSoup.head.title))
+        return freshSoup
diff --git a/resources/recipes/joop.recipe b/resources/recipes/joop.recipe
new file mode 100644
index 0000000000..a913328b9b
--- /dev/null
+++ b/resources/recipes/joop.recipe
@@ -0,0 +1,91 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import Tag
+import re
+
+class JoopRecipe(BasicNewsRecipe):  # Recipe whose article list is scraped from the footer links of the INDEX page.
+ __license__ = 'GPL v3'
+ __author__ = 'kwetal'
+ language = 'nl'
+ country = 'NL'
+ version = 1
+
+ title = u'Joop'
+ publisher = u'Vara'
+ category = u'News, Politics, Discussion'
+ description = u'Political blog from the Netherlands'
+
+ oldest_article = 7
+ max_articles_per_feed = 100
+ use_embedded_content = False
+
+ no_stylesheets = True
+ remove_javascript = True
+
+ keep_only_tags = []  # the first two selectors below are looked up again in preprocess_html()
+ keep_only_tags.append(dict(name = 'div', attrs = {'class': 'author_head clearfix photo'}))
+ keep_only_tags.append(dict(name = 'h2', attrs = {'class': 'columnhead smallline'}))
+ keep_only_tags.append(dict(name = 'div', attrs = {'class': re.compile('article.*')}))
+
+ extra_css = '''
+ body {font-family: verdana, arial, helvetica, geneva, sans-serif;}
+ img {margin-right: 0.4em;}
+ h3 {font-size: medium; font-style: italic; font-weight: normal;}
+ h2 {font-size: xx-large; font-weight: bold}
+ sub {color: #666666; font-size: x-small; font-weight: normal;}
+ div.joop_byline {font-size: large}
+ div.joop_byline_job {font-size: small; color: #696969;}
+ div.joop_date {font-size: x-small; font-style: italic; margin-top: 0.6em}
+ '''
+
+ INDEX = 'http://www.joop.nl'  # fetched as the index page and prefixed to every article href
+
+ conversion_options = {'comments': description, 'tags': category, 'language': language,
+ 'publisher': publisher}
+
+ def parse_index(self):  # Return a list of (section name, list of article dicts) built from the INDEX page.
+ sections = ['Politiek', 'Wereld', 'Economie', 'Groen', 'Media', 'Leven', 'Show', 'Opinies']
+ soup = self.index_to_soup(self.INDEX)
+ answer = []
+
+ div = soup.find('div', attrs = {'id': 'footer'})  # NOTE(review): used unchecked below; a page without div#footer would fail - confirm
+ for section in sections:
+ articles = []
+ h2 = div.find(lambda tag: tag.name == 'h2' and tag.renderContents() == section)  # heading whose rendered content equals the section name exactly
+ if h2:
+ ul = h2.findNextSibling('ul', 'linklist')  # the link list following that heading
+ if ul:
+ for li in ul.findAll('li'):
+ title = self.tag_to_string(li.a)
+ url = self.INDEX + li.a['href']  # presumably hrefs are site-relative, hence the INDEX prefix - verify
+ articles.append({'title': title, 'date': None, 'url': url, 'description': ''})
+
+ answer.append((section, articles))
+
+ return answer
+
+ def preprocess_html(self, soup):  # Turn the author heading and the date heading into div.joop_* elements styled by extra_css.
+ div = soup.find('div', 'author_head clearfix photo')
+ if div:
+ h2 = soup.find('h2')  # first <h2> of the whole document, not only inside div
+ if h2:
+ h2.name = 'div'  # heading becomes div.joop_byline
+ h2['class'] = 'joop_byline'
+ span = h2.find('span')
+ if span:
+ span.name = 'div'  # nested span becomes div.joop_byline_job
+ span['class'] = 'joop_byline_job'
+ div.replaceWith(h2)  # the author block is replaced by the rewritten heading
+
+ h2 = soup.find('h2', attrs = {'class': 'columnhead smallline'})
+ if h2:
+ txt = None  # NOTE(review): stays None when span.info is missing; check that div.append(txt) below cannot receive None
+ span = h2.find('span', 'info')
+ if span:
+ txt = span.find(text = True)  # first text node inside span.info
+ div = Tag(soup, 'div', attrs = [('class', 'joop_date')])
+ div.append(txt)
+ h2.replaceWith(div)  # heading is replaced by the div.joop_date element
+
+ return soup
+
+
diff --git a/resources/recipes/ncrnext.recipe b/resources/recipes/ncrnext.recipe
index d8a51e62c8..e03da301fa 100644
--- a/resources/recipes/ncrnext.recipe
+++ b/resources/recipes/ncrnext.recipe
@@ -1,29 +1,38 @@
from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class NrcNextRecipe(BasicNewsRecipe):
- __license__ = 'GPL v3'
+ __license__ = 'GPL v3'
__author__ = 'kwetal'
- version = 1
language = 'nl'
+ country = 'NL'
+ version = 2
+
+ title = u'nrcnext'
+ publisher = u'NRC Media'
+ category = u'News, Opinion, the Netherlands'
description = u'Dutch newsblog from the Dutch daily newspaper nrcnext.'
- title = u'nrcnext'
+
+ conversion_options = {'comments': description, 'language': language, 'publisher': publisher}
no_stylesheets = True
- template_css = ''
+ remove_javascript = True
- # I want to do some special processing on the articles. I could not solve it with the 'extra_css' property . So we do it the hard way.
keep_only_tags = [dict(name='div', attrs={'id' : 'main'})]
- # If that's overkill for you comment out the previous line and uncomment the next. Then get rid of the preprocess_html() method.
- #keep_only_tags = [dict(name='div', attrs={'class' : 'post'}), dict(name='div', attrs={'class' : 'vlag'}) ]
- remove_tags = [dict(name = 'div', attrs = {'class' : 'meta'}),
- dict(name = 'div', attrs = {'class' : 'datumlabel'}),
- dict(name = 'ul', attrs = {'class' : 'cats single'}),
- dict(name = 'ul', attrs = {'class' : 'cats onderwerpen'}),
- dict(name = 'ul', attrs = {'class' : 'cats rubrieken'})]
+ remove_tags = []
+ remove_tags.append(dict(name = 'div', attrs = {'class' : 'meta'}))
+ remove_tags.append(dict(name = 'div', attrs = {'class' : 'datumlabel'}))
+ remove_tags.append(dict(name = 'ul', attrs = {'class' : 'cats single'}))
+ remove_tags.append(dict(name = 'ul', attrs = {'class' : 'cats onderwerpen'}))
+ remove_tags.append(dict(name = 'ul', attrs = {'class' : 'cats rubrieken'}))
- use_embedded_content = False
+ extra_css = '''
+ body {font-family: verdana, arial, helvetica, geneva, sans-serif; text-align: left;}
+ p.wp-caption-text {font-size: x-small; color: #666666;}
+ h2.sub_title {font-size: medium; color: #696969;}
+ h2.vlag {font-size: small; font-weight: bold;}
+ '''
def parse_index(self) :
# Use the wesbite as an index. Their RSS feeds can be out of date.
@@ -44,10 +53,11 @@ class NrcNextRecipe(BasicNewsRecipe):
# Find the links to the actual articles and rember the location they're pointing to and the title
a = post.find('a', attrs={'rel' : 'bookmark'})
href = a['href']
- title = a.renderContents()
+ title = self.tag_to_string(a)
if index == 'columnisten' :
- # In this feed/page articles can be written by more than one author. It is nice to see their names in the titles.
+ # In this feed/page articles can be written by more than one author.
+ # It is nice to see their names in the titles.
flag = post.find('h2', attrs = {'class' : 'vlag'})
author = flag.contents[0].renderContents()
completeTitle = u''.join([author, u': ', title])
@@ -71,44 +81,46 @@ class NrcNextRecipe(BasicNewsRecipe):
return answer
def preprocess_html(self, soup) :
- # This method is called for every page, be it cartoon or TOC. We need to process each in their own way
- if soup.find('div', attrs = {'id' : 'main', 'class' : 'single'}) :
- # It's an article, find the interesting part
+ if soup.find('div', attrs = {'id' : 'main', 'class' : 'single'}):  # NOTE(review): the outer else is removed by this patch; when this test fails the method appears to return None - confirm
tag = soup.find('div', attrs = {'class' : 'post'})
- if tag :
- # And replace any links with their text, so they don't show up underlined on my reader.
- for link in tag.findAll('a') :
- link.replaceWith(link.renderContents())
+ if tag:
+ h2 = tag.find('h2', 'vlag')
+ if h2:
+ new_h2 = Tag(soup, 'h2', attrs = [('class', 'vlag')])  # fresh <h2 class="vlag"> holding only tag_to_string(h2)
+ new_h2.append(self.tag_to_string(h2))
+ h2.replaceWith(new_h2)
+ else:
+ h2 = tag.find('h2')  # no h2.vlag: take the first <h2> of the post instead
+ if h2:
+ new_h2 = Tag(soup, 'h2', attrs = [('class', 'sub_title')])  # styled by the h2.sub_title rule in extra_css
+ new_h2.append(self.tag_to_string(h2))
+ h2.replaceWith(new_h2)
- # Slows down my Sony reader; feel free to comment out
- for movie in tag.findAll('span', attrs = {'class' : 'vvqbox vvqvimeo'}) :
+ h1 = tag.find('h1')
+ if h1:
+ new_h1 = Tag(soup, 'h1')  # same rebuild for the <h1>, without a class
+ new_h1.append(self.tag_to_string(h1))
+ h1.replaceWith(new_h1)
+
+ # Slows down my reader.
+ for movie in tag.findAll('span', attrs = {'class' : 'vvqbox vvqvimeo'}):
movie.extract()
- for movie in tag.findAll('span', attrs = {'class' : 'vvqbox vvqyoutube'}) :
+ for movie in tag.findAll('span', attrs = {'class' : 'vvqbox vvqyoutube'}):
movie.extract()
+ for iframe in tag.findAll('iframe') :  # every iframe inside the post is removed as well
+ iframe.extract()
- homeMadeSoup = BeautifulSoup('')
- body = homeMadeSoup.find('body')
- body.append(tag)
+ fresh_soup = self.getFreshSoup(soup)  # new document built by getFreshSoup() from the old page
+ fresh_soup.body.append(tag)  # only the post div is moved into it
- return homeMadeSoup
- else :
+ return fresh_soup
+ else:
# This should never happen and other famous last words...
return soup
- else :
- # It's a TOC, return the whole lot.
- return soup
-
- def postproces_html(self, soup) :
- # Should not happen, but it does. Slows down my Sony eReader
- for img in soup.findAll('img') :
- if img['src'].startswith('http://') :
- img.extract()
-
- # Happens for some movies which we are not able to view anyway
- for iframe in soup.findAll('iframe') :
- if iframe['src'].startswith('http://') :
- iframe.extract()
-
-
+    def getFreshSoup(self, oldSoup):  # Build an empty page that keeps only the <title> text of oldSoup.
+        freshSoup = BeautifulSoup('<html><head><title></title></head><body></body></html>')  # skeleton: .head.title is filled here, .body by the caller
+        if oldSoup.head is not None and oldSoup.head.title is not None:  # tolerate pages without <head> or <title>
+            freshSoup.head.title.append(self.tag_to_string(oldSoup.head.title))
+        return freshSoup