mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 18:54:09 -04:00
New recipe for Joop by kwetal
This commit is contained in:
parent
3715fd26b2
commit
5bce3d10d3
BIN
resources/images/news/joop.png
Normal file
BIN
resources/images/news/joop.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 395 B |
BIN
resources/images/news/nrcnext.png
Normal file
BIN
resources/images/news/nrcnext.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 1.7 KiB |
BIN
resources/quick_start.epub
Normal file
BIN
resources/quick_start.epub
Normal file
Binary file not shown.
@ -1,23 +1,29 @@
|
||||
#!/usr/bin/python
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
from calibre.ebooks.BeautifulSoup import Tag
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
|
||||
|
||||
|
||||
class FokkeEnSukkeRecipe(BasicNewsRecipe) :
|
||||
__license__ = 'GPL v3'
|
||||
__author__ = 'kwetal'
|
||||
language = 'nl'
|
||||
description = u'Popular Dutch daily cartoon Fokke en Sukke'
|
||||
country = 'NL'
|
||||
version = 2
|
||||
|
||||
title = u'Fokke en Sukke'
|
||||
no_stylesheets = True
|
||||
# For reasons unknown to me the extra css is, on the cartoon pages, inserted in the <body> and not in the <head>. My reader (Sony PRS-600) has a serious issue
|
||||
# with that: it treats it as content and displays it as is. Setting this property to empty solves this for me.
|
||||
template_css = ''
|
||||
INDEX = u'http://foksuk.nl'
|
||||
publisher = u'Reid, Geleijnse & Van Tol'
|
||||
category = u'News, Cartoons'
|
||||
description = u'Popular Dutch daily cartoon Fokke en Sukke'
|
||||
|
||||
# This cover is not as nice as it could be, needs some work
|
||||
#cover_url = 'http://foksuk.nl/content/wysiwyg/simpleimages/image350.gif'
|
||||
conversion_options = {'comments': description, 'language': language, 'publisher': publisher}
|
||||
|
||||
no_stylesheets = True
|
||||
extra_css = '''
|
||||
body{font-family: verdana, arial, helvetica, geneva, sans-serif ; margin: 0em; padding: 0em;}
|
||||
div.title {text-align: center; margin-bottom: 1em;}
|
||||
'''
|
||||
|
||||
INDEX = u'http://foksuk.nl'
|
||||
cover_url = 'http://foksuk.nl/content/wysiwyg/simpleimages/image350.gif'
|
||||
|
||||
keep_only_tags = [dict(name='div', attrs={'class' : 'cartoon'})]
|
||||
|
||||
@ -31,15 +37,14 @@ class FokkeEnSukkeRecipe(BasicNewsRecipe) :
|
||||
links = index.findAll('a')
|
||||
maxIndex = len(links) - 1
|
||||
articles = []
|
||||
for i in range(len(links)) :
|
||||
# The first link does not interest us, as it points to no cartoon. A begin_at parameter in the range() function would be nice.
|
||||
if i == 0 :
|
||||
continue
|
||||
|
||||
# There can be more than one cartoon for a given day (currently either one or two). If there's only one, there is just a link with the dayname.
|
||||
# If there are two, there are three links in sequence: <a>dayname</a> <a>1</a> <a>2</a>. In that case we're interested in the last two.
|
||||
for i in range(1, len(links)) :
|
||||
# There can be more than one cartoon for a given day (currently either one or two).
|
||||
# If there's only one, there is just a link with the dayname.
|
||||
# If there are two, there are three links in sequence: <a>dayname</a> <a>1</a> <a>2</a>.
|
||||
# In that case we're interested in the last two.
|
||||
if links[i].renderContents() in dayNames :
|
||||
# If the link is not in daynames, we processed it already, but if it is, let's see if the next one has '1' as content
|
||||
# If the link is not in daynames, we processed it already, but if it is, let's see
|
||||
# if the next one has '1' as content
|
||||
if (i + 1 <= maxIndex) and (links[i + 1].renderContents() == '1') :
|
||||
# Got you! Add it to the list
|
||||
article = {'title' : links[i].renderContents() + ' 1', 'date' : u'', 'url' : self.INDEX + links[i + 1]['href'], 'description' : ''}
|
||||
@ -59,29 +64,31 @@ class FokkeEnSukkeRecipe(BasicNewsRecipe) :
|
||||
return [[week, articles]]
|
||||
|
||||
def preprocess_html(self, soup) :
|
||||
# This method is called for every page, be it cartoon or TOC. We need to process each in their own way
|
||||
cartoon = soup.find('div', attrs={'class' : 'cartoon'})
|
||||
if cartoon :
|
||||
# It is a cartoon. Extract the title.
|
||||
title = ''
|
||||
img = soup.find('img', attrs = {'alt' : True})
|
||||
if img :
|
||||
title = img['alt']
|
||||
|
||||
# Using the 'extra_css' displays it in the <body> and not in the <head>. See comment at the top of this class. Setting the style this way solves that.
|
||||
tag = Tag(soup, 'div', [('style', 'text-align: center; margin-bottom: 8px')])
|
||||
tag.insert(0, title)
|
||||
cartoon.insert(0, tag)
|
||||
title = ''
|
||||
img = soup.find('img', attrs = {'alt' : True})
|
||||
if img :
|
||||
title = img['alt']
|
||||
|
||||
# I have not quite worked out why, but we have to throw out this part of the page. It contains the very same index we processed earlier,
|
||||
# and Calibre does not like that too much. As far as I can tell it goes into recursion and the result is an empty eBook.
|
||||
select = cartoon.find('div', attrs={'class' : 'selectcartoon'})
|
||||
if select :
|
||||
select.extract()
|
||||
tag = Tag(soup, 'div', [('class', 'title')])
|
||||
tag.insert(0, title)
|
||||
cartoon.insert(0, tag)
|
||||
|
||||
return cartoon
|
||||
else :
|
||||
# It is a TOC. Just return the whole lot.
|
||||
return soup
|
||||
# We only want the cartoon, so throw out the index
|
||||
select = cartoon.find('div', attrs={'class' : 'selectcartoon'})
|
||||
if select :
|
||||
select.extract()
|
||||
|
||||
freshSoup = self.getFreshSoup(soup)
|
||||
freshSoup.body.append(cartoon)
|
||||
|
||||
return freshSoup
|
||||
|
||||
def getFreshSoup(self, oldSoup):
    """Return a new, empty HTML document that carries over oldSoup's title.

    The result has an empty <body>; the caller appends the content it
    wants to keep. When oldSoup has a <head> with a <title>, the text of
    that title is copied into the new document's <title>.
    """
    freshSoup = BeautifulSoup('<html><head><title></title></head><body></body></html>')
    # A page without a <head> yields None here; guard so that such a page
    # gets an empty title instead of raising AttributeError.
    oldHead = oldSoup.head
    if oldHead is not None and oldHead.title:
        freshSoup.head.title.append(self.tag_to_string(oldHead.title))
    return freshSoup
|
||||
|
||||
|
||||
|
91
resources/recipes/joop.recipe
Normal file
91
resources/recipes/joop.recipe
Normal file
@ -0,0 +1,91 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
from calibre.ebooks.BeautifulSoup import Tag
|
||||
import re
|
||||
|
||||
class JoopRecipe(BasicNewsRecipe):
    """Recipe for the Dutch political blog Joop (http://www.joop.nl).

    The section and article index is scraped from the footer of the front
    page by parse_index(); preprocess_html() then simplifies the author
    block and the column header of every downloaded page.
    """
    __license__ = 'GPL v3'
    __author__ = 'kwetal'
    language = 'nl'
    country = 'NL'
    version = 1

    title = u'Joop'
    publisher = u'Vara'
    category = u'News, Politics, Discussion'
    description = u'Political blog from the Netherlands'

    oldest_article = 7
    max_articles_per_feed = 100
    use_embedded_content = False

    no_stylesheets = True
    remove_javascript = True

    # Keep the author block, the column header and every div whose class
    # matches 'article.*'.
    keep_only_tags = [
        dict(name='div', attrs={'class': 'author_head clearfix photo'}),
        dict(name='h2', attrs={'class': 'columnhead smallline'}),
        dict(name='div', attrs={'class': re.compile('article.*')}),
    ]

    extra_css = '''
        body {font-family: verdana, arial, helvetica, geneva, sans-serif;}
        img {margin-right: 0.4em;}
        h3 {font-size: medium; font-style: italic; font-weight: normal;}
        h2 {font-size: xx-large; font-weight: bold}
        sub {color: #666666; font-size: x-small; font-weight: normal;}
        div.joop_byline {font-size: large}
        div.joop_byline_job {font-size: small; color: #696969;}
        div.joop_date {font-size: x-small; font-style: italic; margin-top: 0.6em}
        '''

    INDEX = 'http://www.joop.nl'

    conversion_options = {'comments': description, 'tags': category, 'language': language,
                          'publisher': publisher}

    def parse_index(self):
        """Build the section index from the footer of the front page.

        For every known section name, the footer is searched for an <h2>
        whose content equals that name; the links in the 'linklist' <ul>
        that follows it become the section's articles.

        Returns a list of (section, articles) tuples, one per section name
        and in a fixed order. Each article is a dict with the keys
        'title', 'date', 'url' and 'description'. A section that cannot be
        found is returned with an empty article list.
        """
        sections = ['Politiek', 'Wereld', 'Economie', 'Groen', 'Media', 'Leven', 'Show', 'Opinies']
        soup = self.index_to_soup(self.INDEX)
        answer = []

        footer = soup.find('div', attrs={'id': 'footer'})
        for section in sections:
            articles = []
            heading = None
            # A missing footer is treated like a missing heading or list:
            # the section simply ends up without articles.
            if footer is not None:
                heading = footer.find(lambda tag: tag.name == 'h2' and tag.renderContents() == section)
            if heading:
                ul = heading.findNextSibling('ul', 'linklist')
                if ul:
                    for li in ul.findAll('li'):
                        link = li.a
                        # Skip list items that carry no usable link instead
                        # of failing on them.
                        if link is None or not link.get('href'):
                            continue
                        articles.append({'title': self.tag_to_string(link), 'date': None,
                                         'url': self.INDEX + link['href'], 'description': ''})

            answer.append((section, articles))

        return answer

    def preprocess_html(self, soup):
        """Simplify the author block and the column header of a page.

        If the page has an author block, the first <h2> is turned into a
        'joop_byline' div (and its first <span> into a 'joop_byline_job'
        div) and the author block is replaced by it. If the page has a
        'columnhead smallline' <h2>, it is replaced by a 'joop_date' div
        holding the first text found in its 'info' span.

        Returns the modified soup.
        """
        author = soup.find('div', 'author_head clearfix photo')
        if author:
            # NOTE(review): this looks for the first <h2> in the whole
            # document, not only inside the author block - confirm that
            # this is intended.
            byline = soup.find('h2')
            if byline:
                byline.name = 'div'
                byline['class'] = 'joop_byline'
                job = byline.find('span')
                if job:
                    job.name = 'div'
                    job['class'] = 'joop_byline_job'
                author.replaceWith(byline)

        header = soup.find('h2', attrs={'class': 'columnhead smallline'})
        if header:
            txt = None
            info = header.find('span', 'info')
            if info:
                txt = info.find(text=True)
            date = Tag(soup, 'div', attrs=[('class', 'joop_date')])
            # txt stays None when no date text was found; appending None
            # to a tag raises, so the date div is then left empty.
            if txt is not None:
                date.append(txt)
            header.replaceWith(date)

        return soup
|
||||
|
||||
|
@ -1,29 +1,38 @@
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
|
||||
|
||||
class NrcNextRecipe(BasicNewsRecipe):
|
||||
__license__ = 'GPL v3'
|
||||
__license__ = 'GPL v3'
|
||||
__author__ = 'kwetal'
|
||||
version = 1
|
||||
language = 'nl'
|
||||
country = 'NL'
|
||||
version = 2
|
||||
|
||||
title = u'nrcnext'
|
||||
publisher = u'NRC Media'
|
||||
category = u'News, Opinion, the Netherlands'
|
||||
description = u'Dutch newsblog from the Dutch daily newspaper nrcnext.'
|
||||
title = u'nrcnext'
|
||||
|
||||
conversion_options = {'comments': description, 'language': language, 'publisher': publisher}
|
||||
|
||||
no_stylesheets = True
|
||||
template_css = ''
|
||||
remove_javascript = True
|
||||
|
||||
# I want to do some special processing on the articles. I could not solve it with the 'extra_css' property . So we do it the hard way.
|
||||
keep_only_tags = [dict(name='div', attrs={'id' : 'main'})]
|
||||
# If that's overkill for you comment out the previous line and uncomment the next. Then get rid of the preprocess_html() method.
|
||||
#keep_only_tags = [dict(name='div', attrs={'class' : 'post'}), dict(name='div', attrs={'class' : 'vlag'}) ]
|
||||
|
||||
remove_tags = [dict(name = 'div', attrs = {'class' : 'meta'}),
|
||||
dict(name = 'div', attrs = {'class' : 'datumlabel'}),
|
||||
dict(name = 'ul', attrs = {'class' : 'cats single'}),
|
||||
dict(name = 'ul', attrs = {'class' : 'cats onderwerpen'}),
|
||||
dict(name = 'ul', attrs = {'class' : 'cats rubrieken'})]
|
||||
remove_tags = []
|
||||
remove_tags.append(dict(name = 'div', attrs = {'class' : 'meta'}))
|
||||
remove_tags.append(dict(name = 'div', attrs = {'class' : 'datumlabel'}))
|
||||
remove_tags.append(dict(name = 'ul', attrs = {'class' : 'cats single'}))
|
||||
remove_tags.append(dict(name = 'ul', attrs = {'class' : 'cats onderwerpen'}))
|
||||
remove_tags.append(dict(name = 'ul', attrs = {'class' : 'cats rubrieken'}))
|
||||
|
||||
use_embedded_content = False
|
||||
extra_css = '''
|
||||
body {font-family: verdana, arial, helvetica, geneva, sans-serif; text-align: left;}
|
||||
p.wp-caption-text {font-size: x-small; color: #666666;}
|
||||
h2.sub_title {font-size: medium; color: #696969;}
|
||||
h2.vlag {font-size: small; font-weight: bold;}
|
||||
'''
|
||||
|
||||
def parse_index(self) :
|
||||
# Use the website as an index. Their RSS feeds can be out of date.
|
||||
@ -44,10 +53,11 @@ class NrcNextRecipe(BasicNewsRecipe):
|
||||
# Find the links to the actual articles and remember the location they're pointing to and the title
|
||||
a = post.find('a', attrs={'rel' : 'bookmark'})
|
||||
href = a['href']
|
||||
title = a.renderContents()
|
||||
title = self.tag_to_string(a)
|
||||
|
||||
if index == 'columnisten' :
|
||||
# In this feed/page articles can be written by more than one author. It is nice to see their names in the titles.
|
||||
# In this feed/page articles can be written by more than one author.
|
||||
# It is nice to see their names in the titles.
|
||||
flag = post.find('h2', attrs = {'class' : 'vlag'})
|
||||
author = flag.contents[0].renderContents()
|
||||
completeTitle = u''.join([author, u': ', title])
|
||||
@ -71,44 +81,46 @@ class NrcNextRecipe(BasicNewsRecipe):
|
||||
return answer
|
||||
|
||||
def preprocess_html(self, soup) :
|
||||
# This method is called for every page, be it cartoon or TOC. We need to process each in their own way
|
||||
if soup.find('div', attrs = {'id' : 'main', 'class' : 'single'}) :
|
||||
# It's an article, find the interesting part
|
||||
if soup.find('div', attrs = {'id' : 'main', 'class' : 'single'}):
|
||||
tag = soup.find('div', attrs = {'class' : 'post'})
|
||||
if tag :
|
||||
# And replace any links with their text, so they don't show up underlined on my reader.
|
||||
for link in tag.findAll('a') :
|
||||
link.replaceWith(link.renderContents())
|
||||
if tag:
|
||||
h2 = tag.find('h2', 'vlag')
|
||||
if h2:
|
||||
new_h2 = Tag(soup, 'h2', attrs = [('class', 'vlag')])
|
||||
new_h2.append(self.tag_to_string(h2))
|
||||
h2.replaceWith(new_h2)
|
||||
else:
|
||||
h2 = tag.find('h2')
|
||||
if h2:
|
||||
new_h2 = Tag(soup, 'h2', attrs = [('class', 'sub_title')])
|
||||
new_h2.append(self.tag_to_string(h2))
|
||||
h2.replaceWith(new_h2)
|
||||
|
||||
# Slows down my Sony reader; feel free to comment out
|
||||
for movie in tag.findAll('span', attrs = {'class' : 'vvqbox vvqvimeo'}) :
|
||||
h1 = tag.find('h1')
|
||||
if h1:
|
||||
new_h1 = Tag(soup, 'h1')
|
||||
new_h1.append(self.tag_to_string(h1))
|
||||
h1.replaceWith(new_h1)
|
||||
|
||||
# Slows down my reader.
|
||||
for movie in tag.findAll('span', attrs = {'class' : 'vvqbox vvqvimeo'}):
|
||||
movie.extract()
|
||||
for movie in tag.findAll('span', attrs = {'class' : 'vvqbox vvqyoutube'}) :
|
||||
for movie in tag.findAll('span', attrs = {'class' : 'vvqbox vvqyoutube'}):
|
||||
movie.extract()
|
||||
for iframe in tag.findAll('iframe') :
|
||||
iframe.extract()
|
||||
|
||||
homeMadeSoup = BeautifulSoup('<html><head></head><body></body></html>')
|
||||
body = homeMadeSoup.find('body')
|
||||
body.append(tag)
|
||||
fresh_soup = self.getFreshSoup(soup)
|
||||
fresh_soup.body.append(tag)
|
||||
|
||||
return homeMadeSoup
|
||||
else :
|
||||
return fresh_soup
|
||||
else:
|
||||
# This should never happen and other famous last words...
|
||||
return soup
|
||||
else :
|
||||
# It's a TOC, return the whole lot.
|
||||
return soup
|
||||
|
||||
def postproces_html(self, soup) :
|
||||
# Should not happen, but it does. Slows down my Sony eReader
|
||||
for img in soup.findAll('img') :
|
||||
if img['src'].startswith('http://') :
|
||||
img.extract()
|
||||
|
||||
# Happens for some movies which we are not able to view anyway
|
||||
for iframe in soup.findAll('iframe') :
|
||||
if iframe['src'].startswith('http://') :
|
||||
iframe.extract()
|
||||
|
||||
|
||||
|
||||
def getFreshSoup(self, oldSoup):
    """Return a new, empty HTML document that carries over oldSoup's title.

    The result has an empty <body>; the caller appends the content it
    wants to keep. When oldSoup has a <head> with a <title>, the text of
    that title is copied into the new document's <title>.
    """
    freshSoup = BeautifulSoup('<html><head><title></title></head><body></body></html>')
    # A page without a <head> yields None here; guard so that such a page
    # gets an empty title instead of raising AttributeError.
    oldHead = oldSoup.head
    if oldHead is not None and oldHead.title:
        freshSoup.head.title.append(self.tag_to_string(oldHead.title))
    return freshSoup
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user