New recipe for Joop by kwetal

2025-11-02 10:37:01 -05:00 · 2010-01-16 21:15:23 -07:00 · 2010-01-16 21:15:23 -07:00 · 5bce3d10d3
commit 5bce3d10d3
parent 3715fd26b2
6 changed files with 195 additions and 85 deletions
--- a/resources/images/news/joop.png
+++ b/resources/images/news/joop.png
--- a/resources/images/news/nrcnext.png
+++ b/resources/images/news/nrcnext.png
--- a/resources/quick_start.epub
+++ b/resources/quick_start.epub
--- a/resources/recipes/fokkeensukke.recipe
+++ b/resources/recipes/fokkeensukke.recipe
@ -1,23 +1,29 @@
-#!/usr/bin/python
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import Tag
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag


 class FokkeEnSukkeRecipe(BasicNewsRecipe) :
    __license__   = 'GPL v3'
    __author__ = 'kwetal'
    language = 'nl'
-    description = u'Popular Dutch daily cartoon Fokke en Sukke'
+    country = 'NL'
+    version = 2

    title = u'Fokke en Sukke'
-    no_stylesheets = True
-    # For reasons unknown to me the extra css is, on the cartoon pages, inserted in the <body> and not in the <head>. My reader (Sony PRS-600) has a serious issue
-    # with that: it treats it as content and displays it as is. Setting this property to empty solves this for me.
-    template_css = ''
-    INDEX = u'http://foksuk.nl'
+    publisher = u'Reid, Geleijnse & Van Tol'
+    category = u'News, Cartoons'
+    description = u'Popular Dutch daily cartoon Fokke en Sukke'

-    # This cover is not as nice as it could be, needs some work
-    #cover_url = 'http://foksuk.nl/content/wysiwyg/simpleimages/image350.gif'
+    conversion_options = {'comments': description, 'language': language, 'publisher': publisher}
+
+    no_stylesheets = True
+    extra_css = '''
+                    body{font-family: verdana, arial, helvetica, geneva, sans-serif ; margin: 0em; padding: 0em;}
+                    div.title {text-align: center; margin-bottom: 1em;}
+                    '''
+
+    INDEX = u'http://foksuk.nl'
+    cover_url = 'http://foksuk.nl/content/wysiwyg/simpleimages/image350.gif'

    keep_only_tags = [dict(name='div', attrs={'class' : 'cartoon'})]

@ -31,15 +37,14 @@ class FokkeEnSukkeRecipe(BasicNewsRecipe) :
        links = index.findAll('a')
        maxIndex = len(links) - 1
        articles = []
-        for i in range(len(links)) :
-            # The first link does not interest us, as it points to no cartoon. A begin_at parameter in the range() function would be nice.
-            if i == 0 :
-                continue
-
-            # There can be more than one cartoon for a given day (currently either one or two). If there's only one, there is just a link with the dayname.
-            # If there are two, there are three links in sequence: <a>dayname</a> <a>1</a> <a>2</a>. In that case we're interested in the last two.
+        for i in range(1, len(links)) :
+            # There can be more than one cartoon for a given day (currently either one or two).
+            # If there's only one, there is just a link with the dayname.
+            # If there are two, there are three links in sequence: <a>dayname</a> <a>1</a> <a>2</a>.
+            # In that case we're interested in the last two.
            if links[i].renderContents() in dayNames :
-                # If the link is not in daynames, we processed it already, but if it is, let's see if the next one has '1' as content
+                # If the link is not in daynames, we processed it already, but if it is, let's see
+                # if the next one has '1' as content
                if (i + 1 <= maxIndex) and (links[i + 1].renderContents() == '1') :
                    # Got you! Add it to the list
                    article = {'title' : links[i].renderContents() + ' 1', 'date' : u'', 'url'  : self.INDEX + links[i + 1]['href'], 'description' : ''}
@ -59,29 +64,31 @@ class FokkeEnSukkeRecipe(BasicNewsRecipe) :
        return [[week, articles]]

    def preprocess_html(self, soup) :
+        '''
+        Return a fresh document holding only the cartoon and its title.
+
+        The title is the alt text of the first image that has one. A page
+        without a div.cartoon is returned unchanged.
+        '''
-        # This method is called for every page, be it cartoon or TOC. We need to process each in their own way
        cartoon = soup.find('div', attrs={'class' : 'cartoon'})
-        if cartoon :
-            # It is a cartoon. Extract the title.
+        if cartoon is None :
+            # Not a cartoon page: there is nothing to extract, so hand it back untouched
+            # instead of failing on cartoon.insert() below.
+            return soup
+
            title = ''
        img = soup.find('img', attrs = {'alt' : True})
        if img :
            title = img['alt']

-            # Using the 'extra_css' displays it in the <body> and not in the <head>. See comment at the top of this class. Setting the style this way solves that.
-            tag = Tag(soup, 'div', [('style', 'text-align: center; margin-bottom: 8px')])
+        tag = Tag(soup, 'div', [('class', 'title')])
        tag.insert(0, title)
        cartoon.insert(0, tag)

-            # I have not quite worked out why, but we have to throw out this part of the page. It contains the very same index we processed earlier,
-            # and Calibre does not like that too much. As far as I can tell it goes into recursion and the result is an empty eBook.
+        # We only want the cartoon, so throw out the index
        select = cartoon.find('div', attrs={'class' : 'selectcartoon'})
        if select :
            select.extract()

-            return cartoon
-        else :
-            # It is a TOC. Just return the whole lot.
-            return soup
+        freshSoup = self.getFreshSoup(soup)
+        freshSoup.body.append(cartoon)
+
+        return freshSoup
+
+    def getFreshSoup(self, oldSoup):
+        '''
+        Return an empty html document that carries over the title of oldSoup.
+
+        The new document has an empty <body>; its <title> stays empty when
+        oldSoup has no <head> or no <title>.
+        '''
+        freshSoup = BeautifulSoup('<html><head><title></title></head><body></body></html>')
+        # A page without <head> would otherwise fail on the attribute access.
+        oldHead = oldSoup.head
+        if oldHead is not None and oldHead.title:
+            freshSoup.head.title.append(self.tag_to_string(oldHead.title))
+        return freshSoup


--- a/resources/recipes/joop.recipe
+++ b/resources/recipes/joop.recipe
@ -0,0 +1,91 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import Tag
+import re
+
+class JoopRecipe(BasicNewsRecipe):
+    '''
+    Recipe for Joop (http://www.joop.nl).
+
+    The article list is read from the footer of the front page, and the
+    byline and date markup of each article page is simplified in
+    preprocess_html() so that it can be styled through extra_css.
+    '''
+    __license__  = 'GPL v3'
+    __author__ = 'kwetal'
+    language = 'nl'
+    country = 'NL'
+    version = 1
+
+    title = u'Joop'
+    publisher = u'Vara'
+    category = u'News, Politics, Discussion'
+    description = u'Political blog from the Netherlands'
+
+    oldest_article = 7
+    max_articles_per_feed = 100
+    use_embedded_content = False
+
+    no_stylesheets = True
+    remove_javascript = True
+
+    # The author block, the heading that preprocess_html() turns into the date
+    # line, and any div whose class matches 'article.*'.
+    keep_only_tags = [
+        dict(name = 'div', attrs = {'class': 'author_head clearfix photo'}),
+        dict(name = 'h2', attrs = {'class': 'columnhead smallline'}),
+        dict(name = 'div', attrs = {'class': re.compile('article.*')}),
+    ]
+
+    extra_css = '''
+                body {font-family: verdana, arial, helvetica, geneva, sans-serif;}
+                img {margin-right: 0.4em;}
+                h3 {font-size: medium; font-style: italic; font-weight: normal;}
+                h2 {font-size: xx-large; font-weight: bold}
+                sub {color: #666666; font-size: x-small; font-weight: normal;}
+                div.joop_byline {font-size: large}
+                div.joop_byline_job {font-size: small; color: #696969;}
+                div.joop_date {font-size: x-small; font-style: italic; margin-top: 0.6em}
+                '''
+
+    INDEX = 'http://www.joop.nl'
+
+    conversion_options = {'comments': description, 'tags': category, 'language': language,
+                          'publisher': publisher}
+
+    def parse_index(self):
+        '''
+        Build the article index from the footer of the front page.
+
+        Returns a list of (section, articles) tuples, one for every name in
+        the local sections list and in that order. A section whose heading or
+        link list cannot be found gets an empty article list.
+        '''
+        sections = ['Politiek', 'Wereld', 'Economie', 'Groen', 'Media', 'Leven', 'Show', 'Opinies']
+        soup = self.index_to_soup(self.INDEX)
+        answer = []
+
+        footer = soup.find('div', attrs = {'id': 'footer'})
+        for section in sections:
+            articles = []
+            h2 = None
+            # Without a footer there is nothing to search; treat it like a missing heading.
+            if footer is not None:
+                h2 = footer.find(lambda tag: tag.name == 'h2' and tag.renderContents() == section)
+            if h2:
+                ul = h2.findNextSibling('ul', 'linklist')
+                if ul:
+                    for li in ul.findAll('li'):
+                        a = li.a
+                        # Skip list items that carry no usable link instead of failing on them.
+                        if a is None or not a.get('href'):
+                            continue
+                        title = self.tag_to_string(a)
+                        url = self.INDEX + a['href']
+                        articles.append({'title': title, 'date': None, 'url': url, 'description': ''})
+
+            answer.append((section, articles))
+
+        return answer
+
+    def preprocess_html(self, soup):
+        '''
+        Replace the author block and the date heading by plain divs.
+
+        The author block becomes div.joop_byline (with an optional nested
+        div.joop_byline_job); the 'columnhead smallline' heading becomes
+        div.joop_date. The soup is changed in place and returned.
+        '''
+        div = soup.find('div', 'author_head clearfix photo')
+        if div:
+            # NOTE(review): this picks the first h2 of the whole page, not of the
+            # author block - confirm that this is intended.
+            h2 = soup.find('h2')
+            if h2:
+                h2.name = 'div'
+                h2['class'] = 'joop_byline'
+                span = h2.find('span')
+                if span:
+                    span.name = 'div'
+                    span['class'] = 'joop_byline_job'
+                div.replaceWith(h2)
+
+        h2 = soup.find('h2', attrs = {'class': 'columnhead smallline'})
+        if h2:
+            txt = None
+            span = h2.find('span', 'info')
+            if span:
+                txt = span.find(text = True)
+            div = Tag(soup, 'div', attrs = [('class', 'joop_date')])
+            # There may be no span.info or no text in it; appending None would
+            # raise, so the date div is left empty in that case.
+            if txt is not None:
+                div.append(txt)
+            h2.replaceWith(div)
+
+        return soup
+
+
--- a/resources/recipes/ncrnext.recipe
+++ b/resources/recipes/ncrnext.recipe
@ -1,29 +1,38 @@
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag

 class NrcNextRecipe(BasicNewsRecipe):
    __license__  = 'GPL v3'
    __author__ = 'kwetal'
-    version = 1
    language = 'nl'
-    description = u'Dutch newsblog from the Dutch daily newspaper nrcnext.'
+    country = 'NL'
+    version = 2
+
    title = u'nrcnext'
+    publisher = u'NRC Media'
+    category = u'News, Opinion, the Netherlands'
+    description = u'Dutch newsblog from the Dutch daily newspaper nrcnext.'
+
+    conversion_options = {'comments': description, 'language': language, 'publisher': publisher}

    no_stylesheets = True
-    template_css = ''
+    remove_javascript = True

-    # I want to do some special processing on the articles. I could not solve it with the 'extra_css' property . So we do it the hard way.
    keep_only_tags = [dict(name='div', attrs={'id' : 'main'})]
-    # If that's overkill for you comment out the previous line and uncomment the next. Then get rid of the preprocess_html() method.
-    #keep_only_tags = [dict(name='div', attrs={'class' : 'post'}), dict(name='div', attrs={'class' : 'vlag'}) ]

-    remove_tags = [dict(name = 'div', attrs = {'class' : 'meta'}),
-                          dict(name = 'div', attrs = {'class' : 'datumlabel'}),
-                          dict(name = 'ul', attrs = {'class' : 'cats single'}),
-                          dict(name = 'ul', attrs = {'class' : 'cats onderwerpen'}),
-                          dict(name = 'ul', attrs = {'class' : 'cats rubrieken'})]
+    remove_tags = []
+    remove_tags.append(dict(name = 'div', attrs = {'class' : 'meta'}))
+    remove_tags.append(dict(name = 'div', attrs = {'class' : 'datumlabel'}))
+    remove_tags.append(dict(name = 'ul', attrs = {'class' : 'cats single'}))
+    remove_tags.append(dict(name = 'ul', attrs = {'class' : 'cats onderwerpen'}))
+    remove_tags.append(dict(name = 'ul', attrs = {'class' : 'cats rubrieken'}))

-    use_embedded_content = False
+    extra_css = '''
+                body {font-family: verdana, arial, helvetica, geneva, sans-serif; text-align: left;}
+                p.wp-caption-text {font-size: x-small; color: #666666;}
+                h2.sub_title {font-size: medium; color: #696969;}
+                h2.vlag {font-size: small; font-weight: bold;}
+                '''

    def parse_index(self) :
        # Use the website as an index. Their RSS feeds can be out of date.
@ -44,10 +53,11 @@ class NrcNextRecipe(BasicNewsRecipe):
                # Find the links to the actual articles and remember the location they're pointing to and the title
                a = post.find('a', attrs={'rel' : 'bookmark'})
                href = a['href']
-                title = a.renderContents()
+                title = self.tag_to_string(a)

                if index == 'columnisten' :
-                    # In this feed/page articles can be written by more than one author. It is nice to see their names in the titles.
+                    # In this feed/page articles can be written by more than one author.
+                    # It is nice to see their names in the titles.
                    flag = post.find('h2', attrs = {'class' : 'vlag'})
                    author = flag.contents[0].renderContents()
                    completeTitle = u''.join([author, u': ', title])
@ -71,44 +81,46 @@ class NrcNextRecipe(BasicNewsRecipe):
        return answer

    def preprocess_html(self, soup) :
+        '''
+        Reduce a single-article page to its div.post in a fresh document.
+
+        Headings are replaced by plain-text copies and embedded movies and
+        iframes are removed. Any other page is returned unchanged.
+        '''
-        # This method is called for every page, be it cartoon or TOC. We need to process each in their own way
-        if soup.find('div', attrs = {'id' : 'main', 'class' : 'single'}) :
-            # It's an article, find the interesting part
+        if soup.find('div', attrs = {'id' : 'main', 'class' : 'single'}):
            tag = soup.find('div', attrs = {'class' : 'post'})
-            if tag :
-                # And replace any links with their text, so they don't show up underlined on my reader.
-                for link in tag.findAll('a') :
-                    link.replaceWith(link.renderContents())
+            if tag:
+                h2 = tag.find('h2', 'vlag')
+                if h2:
+                    new_h2 = Tag(soup, 'h2', attrs = [('class', 'vlag')])
+                    new_h2.append(self.tag_to_string(h2))
+                    h2.replaceWith(new_h2)
+                else:
+                    h2 = tag.find('h2')
+                    if h2:
+                        new_h2 = Tag(soup, 'h2', attrs = [('class', 'sub_title')])
+                        new_h2.append(self.tag_to_string(h2))
+                        h2.replaceWith(new_h2)

-                # Slows down my Sony reader; feel free to comment out
-                for movie in tag.findAll('span', attrs = {'class' : 'vvqbox vvqvimeo'}) :
+                h1 = tag.find('h1')
+                if h1:
+                    new_h1 = Tag(soup, 'h1')
+                    new_h1.append(self.tag_to_string(h1))
+                    h1.replaceWith(new_h1)
+
+                # Slows down my reader.
+                for movie in tag.findAll('span', attrs = {'class' : 'vvqbox vvqvimeo'}):
                    movie.extract()
-                for movie in tag.findAll('span', attrs = {'class' : 'vvqbox vvqyoutube'}) :
+                for movie in tag.findAll('span', attrs = {'class' : 'vvqbox vvqyoutube'}):
                    movie.extract()
-
-                homeMadeSoup = BeautifulSoup('<html><head></head><body></body></html>')
-                body = homeMadeSoup.find('body')
-                body.append(tag)
-
-                return homeMadeSoup
-            else :
-                # This should never happen and other famous last words...
-                return soup
-        else :
-            # It's a TOC, return the whole lot.
-            return soup
-
-    def postproces_html(self, soup) :
-        # Should not happen, but it does. Slows down my Sony eReader
-        for img in soup.findAll('img') :
-            if img['src'].startswith('http://') :
-                img.extract()
-
-        # Happens for some movies which we are not able to view anyway
-        for iframe in soup.findAll('iframe') :
-            if iframe['src'].startswith('http://') :
+                for iframe in tag.findAll('iframe') :
                    iframe.extract()

+                fresh_soup = self.getFreshSoup(soup)
+                fresh_soup.body.append(tag)

+                return fresh_soup
+            else:
+                # This should never happen and other famous last words...
+                return soup
+
+        # Not a single-article page: hand it back untouched instead of
+        # implicitly returning None.
+        return soup

+    def getFreshSoup(self, oldSoup):
+        '''
+        Return an empty html document that carries over the title of oldSoup.
+
+        The new document has an empty <body>; its <title> stays empty when
+        oldSoup has no <head> or no <title>.
+        '''
+        freshSoup = BeautifulSoup('<html><head><title></title></head><body></body></html>')
+        # A page without <head> would otherwise fail on the attribute access.
+        oldHead = oldSoup.head
+        if oldHead is not None and oldHead.title:
+            freshSoup.head.title.append(self.tag_to_string(oldHead.title))
+        return freshSoup