mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
New recipe for ncrnext by kwetal. Updated recipe for Harpers.
This commit is contained in:
parent
c7834a5fc1
commit
df73dd322b
@ -29,7 +29,13 @@ class Harpers(BasicNewsRecipe):
|
|||||||
|
|
||||||
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0em; margin-top: 0em; margin-bottom: 0.5em} img {margin-top: 0em; margin-bottom: 0.4em}"'
|
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\noverride_css=" p {text-indent: 0em; margin-top: 0em; margin-bottom: 0.5em} img {margin-top: 0em; margin-bottom: 0.4em}"'
|
||||||
|
|
||||||
|
extra_css = '''
|
||||||
|
h1{ font-family:georgia ; color:#111111; font-size:large;}
|
||||||
|
.box-of-helpful{ font-family:arial ; font-size:x-small;}
|
||||||
|
p{font-family:georgia ;}
|
||||||
|
.caption{font-family:Verdana,sans-serif;font-size:x-small;color:#666666;}
|
||||||
|
'''
|
||||||
|
|
||||||
keep_only_tags = [ dict(name='div', attrs={'id':'cached'}) ]
|
keep_only_tags = [ dict(name='div', attrs={'id':'cached'}) ]
|
||||||
remove_tags = [
|
remove_tags = [
|
||||||
dict(name='table', attrs={'class':['rcnt','rcnt topline']})
|
dict(name='table', attrs={'class':['rcnt','rcnt topline']})
|
||||||
@ -38,6 +44,17 @@ class Harpers(BasicNewsRecipe):
|
|||||||
|
|
||||||
feeds = [(u"Harper's Magazine", u'http://www.harpers.org/rss/frontpage-rss20.xml')]
|
feeds = [(u"Harper's Magazine", u'http://www.harpers.org/rss/frontpage-rss20.xml')]
|
||||||
|
|
||||||
|
def get_cover_url(self):
    """Return the URL of the cover image on the Harper's front page.

    The front page is fetched through ``self.index_to_soup`` and searched
    for an ``<img class="cover">`` element. Its ``src`` is appended to the
    site root, so it is expected to be a site-relative path.

    Returns:
        The cover URL as a string, or ``None`` when the page has no cover
        image or the image carries no ``src`` attribute.
    """
    soup = self.index_to_soup('http://harpers.org/')
    link_item = soup.find(name='img', attrs={'class': 'cover'})
    if link_item is None:
        return None

    # Use .get() so a cover tag without a src yields "no cover" instead of
    # raising KeyError in the middle of a download.
    src = link_item.get('src')
    if not src:
        return None
    return 'http://harpers.org' + src
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
def preprocess_html(self, soup):
|
||||||
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
|
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
|
||||||
soup.head.insert(1,mcharset)
|
soup.head.insert(1,mcharset)
|
||||||
@ -47,3 +64,5 @@ class Harpers(BasicNewsRecipe):
|
|||||||
del item['xmlns']
|
del item['xmlns']
|
||||||
return soup
|
return soup
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
114
resources/recipes/ncrnext.recipe
Normal file
114
resources/recipes/ncrnext.recipe
Normal file
@ -0,0 +1,114 @@
|
|||||||
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
||||||
|
|
||||||
|
class NrcNextRecipe(BasicNewsRecipe):
    """Recipe for nrcnext, the news blog of a Dutch daily newspaper.

    The section pages of the website are scraped to build the index (see
    ``parse_index``). Each article page is then reduced to its ``post`` div
    before conversion (see ``preprocess_html``).
    """

    __license__ = 'GPL v3'
    __author__ = 'kwetal'
    version = 1
    language = 'nl'
    description = u'Dutch newsblog from the Dutch daily newspaper nrcnext.'
    title = u'nrcnext'

    no_stylesheets = True
    template_css = ''

    # Special processing is done on the articles; this could not be solved
    # with the 'extra_css' property, so it is done the hard way.
    keep_only_tags = [dict(name='div', attrs={'id': 'main'})]
    # If that is overkill for you, comment out the previous line and
    # uncomment the next. Then get rid of the preprocess_html() method.
    #keep_only_tags = [dict(name='div', attrs={'class' : 'post'}), dict(name='div', attrs={'class' : 'vlag'}) ]

    remove_tags = [
        dict(name='div', attrs={'class': 'meta'}),
        dict(name='div', attrs={'class': 'datumlabel'}),
        dict(name='ul', attrs={'class': 'cats single'}),
        dict(name='ul', attrs={'class': 'cats onderwerpen'}),
        dict(name='ul', attrs={'class': 'cats rubrieken'}),
    ]

    use_embedded_content = False

    def parse_index(self):
        """Build the list of sections and their articles from the website.

        Returns:
            A list of ``(section title, [article dict, ...])`` tuples in the
            order given by the weights below. Sections in which no article
            was found are left out.
        """
        # Use the website as an index. Their RSS feeds can be out of date.
        feeds = {
            u'columnisten': u'http://www.nrcnext.nl/columnisten/',
            u'koken': u'http://www.nrcnext.nl/koken/',
            u'geld & werk': u'http://www.nrcnext.nl/geld-en-werk/',
            u'vandaag': u'http://www.nrcnext.nl',
            u'city life in afrika': u'http://www.nrcnext.nl/city-life-in-afrika/',
        }
        articles = {}
        indices = []

        for index, feed in feeds.items():
            soup = self.index_to_soup(feed)

            for post in soup.findAll(True, attrs={'class': 'post'}):
                # Find the link to the actual article and remember the
                # location it points to and its title.
                a = post.find('a', attrs={'rel': 'bookmark'})
                if a is None or not a.get('href'):
                    # A post without a usable permalink cannot be fetched.
                    continue
                href = a['href']
                # tag_to_string() yields text rather than the encoded bytes
                # of renderContents(), so joining with the unicode literals
                # below cannot fail on non-ASCII titles.
                title = self.tag_to_string(a)

                completeTitle = title
                if index == u'columnisten':
                    # In this section articles can be written by more than
                    # one author. It is nice to see their names in the
                    # titles.
                    flag = post.find('h2', attrs={'class': 'vlag'})
                    if flag is not None and flag.contents:
                        author = self.tag_to_string(flag.contents[0])
                        if author:
                            completeTitle = u''.join([author, u': ', title])

                # Add the article to a temporary list.
                article = {'title': completeTitle, 'date': u'', 'url': href, 'description': '<p> </p>'}
                articles.setdefault(index, []).append(article)

            # Add the section title to a temporary list.
            indices.append(index)

        # Now sort the temporary list of sections in the order they appear
        # on the website.
        indices = self.sort_index_by(indices, {u'columnisten': 1, u'koken': 3, u'geld & werk': 2, u'vandaag': 0, u'city life in afrika': 4})
        # Apply this sort order to the actual list of sections and articles.
        return [(key, articles[key]) for key in indices if key in articles]

    def preprocess_html(self, soup):
        """Reduce an article page to its ``post`` div; pass other pages through.

        This method is called for every page, be it cartoon or TOC, and each
        needs to be processed in its own way.

        Returns:
            A new soup holding only the cleaned ``post`` div for article
            pages, otherwise the ``soup`` that was passed in.
        """
        if not soup.find('div', attrs={'id': 'main', 'class': 'single'}):
            # It's a TOC, return the whole lot.
            return soup

        # It's an article, find the interesting part.
        tag = soup.find('div', attrs={'class': 'post'})
        if not tag:
            # This should never happen and other famous last words...
            return soup

        # Replace any links with their contents, so they don't show up
        # underlined on the reader.
        for link in tag.findAll('a'):
            link.replaceWith(link.renderContents())

        # Embedded movies slow down a Sony reader; feel free to comment out.
        for movie_class in ('vvqbox vvqvimeo', 'vvqbox vvqyoutube'):
            for movie in tag.findAll('span', attrs={'class': movie_class}):
                movie.extract()

        homeMadeSoup = BeautifulSoup('<html><head></head><body></body></html>')
        body = homeMadeSoup.find('body')
        body.append(tag)

        return homeMadeSoup

    def postprocess_html(self, soup, first_fetch=False):
        """Drop images and iframes whose ``src`` is still an http:// URL.

        The method used to be spelled ``postproces_html``. Under that name
        it did not override the ``postprocess_html`` hook and so was never
        run. It also has to accept ``first_fetch`` and hand the soup back.

        Args:
            soup: The parsed page.
            first_fetch: Accepted for compatibility with the hook signature;
                not used here.

        Returns:
            The same ``soup``, modified in place.
        """
        # Should not happen, but it does. Slows down a Sony eReader.
        for img in soup.findAll('img'):
            if (img.get('src') or '').startswith('http://'):
                img.extract()

        # Happens for some movies which cannot be viewed anyway.
        for iframe in soup.findAll('iframe'):
            if (iframe.get('src') or '').startswith('http://'):
                iframe.extract()

        return soup

    # Keep the original (misspelled) name available for any existing caller.
    postproces_html = postprocess_html
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -16,6 +16,7 @@ class Time(BasicNewsRecipe):
|
|||||||
encoding = 'utf-8'
|
encoding = 'utf-8'
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
language = 'en'
|
language = 'en'
|
||||||
|
remove_javascript = True
|
||||||
|
|
||||||
extra_css = ''' h1 {font-family:Arial,Sans-serif;}
|
extra_css = ''' h1 {font-family:Arial,Sans-serif;}
|
||||||
h2 {font-family:Arial,Sans-serif;}
|
h2 {font-family:Arial,Sans-serif;}
|
||||||
@ -31,14 +32,8 @@ class Time(BasicNewsRecipe):
|
|||||||
.credit{font-family:georgia,serif; font-size:x-small;color:#999999;}
|
.credit{font-family:georgia,serif; font-size:x-small;color:#999999;}
|
||||||
a:link{color:#CC0000;}
|
a:link{color:#CC0000;}
|
||||||
'''
|
'''
|
||||||
|
|
||||||
# remove_tags_before = dict(id="artHd")
|
|
||||||
# remove_tags_after = {'class':"ltCol"}
|
|
||||||
# remove_tags = [
|
|
||||||
# {'class':['articleTools', 'enlarge', 'search','socialtools','blogtools','moretools','page','nextUp','next','subnav','RSS','line2','first','ybuzz','articlePagination','chiclets','imgcont','createListLink','rlinks','tabsWrap','pagination']},
|
|
||||||
# {'id':['quigoArticle', 'contentTools', 'articleSideBar', 'header', 'navTop','articleTools','feedmodule','feedmodule3','promos','footer','linksFooter','timeArchive','belt','relatedStories','packages','Features']},
|
|
||||||
# {'target':'_blank'},
|
|
||||||
# ]
|
|
||||||
|
|
||||||
keep_only_tags = [ dict(name ="div",attrs = {"id" :["article",]}) ,
|
keep_only_tags = [ dict(name ="div",attrs = {"id" :["article",]}) ,
|
||||||
dict(name ="div",attrs = {"class" :["artHd","artTxt","photoBkt","vertPhoto","image","copy"]}) ,]
|
dict(name ="div",attrs = {"class" :["artHd","artTxt","photoBkt","vertPhoto","image","copy"]}) ,]
|
||||||
@ -50,6 +45,8 @@ class Time(BasicNewsRecipe):
|
|||||||
recursions = 1
|
recursions = 1
|
||||||
match_regexps = [r'/[0-9,]+-(2|3|4|5|6|7|8|9)(,\d+){0,1}.html']
|
match_regexps = [r'/[0-9,]+-(2|3|4|5|6|7|8|9)(,\d+){0,1}.html']
|
||||||
|
|
||||||
|
# Strip self-closing <meta .../> tags from the raw HTML before it is parsed.
# [^>]+ keeps every match inside a single tag. The previous greedy ".+" ran
# on to the last "/>" on the same line and could delete the markup between.
preprocess_regexps = [(re.compile(r'<meta [^>]+/>'), lambda m: '')]
|
||||||
|
|
||||||
def parse_index(self):
|
def parse_index(self):
|
||||||
soup = self.index_to_soup('http://www.time.com/time/magazine')
|
soup = self.index_to_soup('http://www.time.com/time/magazine')
|
||||||
@ -75,13 +72,19 @@ class Time(BasicNewsRecipe):
|
|||||||
return feeds
|
return feeds
|
||||||
|
|
||||||
def find_articles(self, seched):
    """Yield one article dict per headline that follows a section heading.

    The siblings after ``seched`` with class ``toc_hed`` or ``rule2`` are
    walked in order. A ``div`` among them ends the scan.

    Args:
        seched: The section-heading tag whose following siblings are read.

    Yields:
        Dicts with the keys ``title``, ``url``, ``date`` and
        ``description`` for each headline link.
    """
    for a in seched.findNextSiblings(attrs={'class': ['toc_hed', 'rule2']}):
        # Compare for equality: `a.name in "div"` was a substring test and
        # also stopped the scan at tags named "i", "d" or "v".
        if a.name == 'div':
            break
        href = a.get('href')
        if not href:
            # Nothing to download for an entry without a link.
            continue
        yield {
            'title': self.tag_to_string(a),
            'url': 'http://www.time.com' + href,
            'date': '',
            'description': self.article_description(a)
        }
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def article_description(self, a):
|
def article_description(self, a):
|
||||||
ans = []
|
ans = []
|
||||||
|
Loading…
x
Reference in New Issue
Block a user