New recipe for Smashing Magazine by Darko Miletic and improved recipe for Die Zeit

This commit is contained in:
Kovid Goyal 2009-09-22 09:13:15 -06:00
parent d0865b25fb
commit 4be28fb1fa
4 changed files with 96 additions and 9 deletions

Binary file not shown.

After

Width:  |  Height:  |  Size: 843 B

View File

@ -57,7 +57,7 @@ recipe_modules = ['recipe_' + r for r in (
'monitor', 'republika', 'beta', 'beta_en', 'glasjavnosti', 'monitor', 'republika', 'beta', 'beta_en', 'glasjavnosti',
'esquire', 'livemint', 'thedgesingapore', 'darknet', 'rga', 'esquire', 'livemint', 'thedgesingapore', 'darknet', 'rga',
'intelligencer', 'theoldfoodie', 'hln_be', 'honvedelem', 'intelligencer', 'theoldfoodie', 'hln_be', 'honvedelem',
'the_new_republic', 'philly', 'salon', 'tweakers', 'the_new_republic', 'philly', 'salon', 'tweakers', 'smashing',
)] )]

View File

@ -0,0 +1,51 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
www.smashingmagazine.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class SmashingMagazine(BasicNewsRecipe):
    """News recipe for www.smashingmagazine.com, fed from its articles RSS feed."""

    title = 'Smashing Magazine'
    __author__ = 'Darko Miletic'
    description = 'We smash you with the information that will make your life easier, really'
    oldest_article = 20
    language = 'en'
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    publisher = 'Smashing Magazine'
    category = 'news, web, IT, css, javascript, html'
    encoding = 'utf-8'

    # Metadata handed to the conversion step, reusing the values declared above.
    conversion_options = {
        'comments': description,
        'tags': category,
        'publisher': publisher,
    }

    # Page trimming rules: keep the left column, cut everything after the
    # social links list, and strip the listed tags from what remains.
    keep_only_tags = [dict(name='div', attrs={'id': 'leftcolumn'})]
    remove_tags_after = dict(name='ul', attrs={'class': 'social'})
    remove_tags = [
        dict(name=['link', 'object']),
        dict(name='h1', attrs={'class': 'logo'}),
        dict(name='div', attrs={'id': 'booklogosec'}),
        dict(attrs={'src': 'http://media2.smashingmagazine.com/wp-content/uploads/images/the-smashing-book/smbook6.gif'}),
    ]

    feeds = [(u'Articles', u'http://rss1.smashingmagazine.com/feed/')]

    def preprocess_html(self, soup):
        """Clean up a downloaded page before further processing.

        Removes every ``div.leftframe`` that contains no ``h1``, and renames
        each ``a`` element that is the direct parent of an ``img`` to ``div``.

        :param soup: parsed page (a BeautifulSoup tree).
        :return: the same ``soup`` object, modified in place.
        """
        for frame in soup.findAll('div', attrs={'class': 'leftframe'}):
            # Identity test: the intent is "nothing found", not tag equality.
            if frame.find('h1') is None:
                frame.extract()
        for image in soup.findAll('img'):
            wrapper = image.parent
            # NOTE(review): presumably done so images are no longer links -- confirm.
            if wrapper.name == 'a':
                wrapper.name = 'div'
        return soup

View File

@ -13,18 +13,17 @@ class ZeitDe(BasicNewsRecipe):
title = 'Die Zeit Nachrichten' title = 'Die Zeit Nachrichten'
description = 'Die Zeit - Online Nachrichten' description = 'Die Zeit - Online Nachrichten'
language = 'de' language = 'de'
lang = 'de_DE'
__author__ = 'Kovid Goyal and Martin Pitt' __author__ = 'Martin Pitt and Suajta Raman'
use_embedded_content = False use_embedded_content = False
timefmt = ' [%d %b %Y]'
max_articles_per_feed = 40 max_articles_per_feed = 40
remove_empty_feeds = True
no_stylesheets = True no_stylesheets = True
encoding = 'utf8' encoding = 'utf-8'
remove_tags = [{'class': 'adwrap'}]
keep_only_tags = [{'name': 'div', 'class': 'content'}]
feeds = [ ('Kurznachrichten', 'http://newsfeed.zeit.de/index'), feeds = [
('Politik', 'http://newsfeed.zeit.de/politik/index'), ('Politik', 'http://newsfeed.zeit.de/politik/index'),
('Wirtschaft', 'http://newsfeed.zeit.de/wirtschaft/index'), ('Wirtschaft', 'http://newsfeed.zeit.de/wirtschaft/index'),
('Meinung', 'http://newsfeed.zeit.de/meinung/index'), ('Meinung', 'http://newsfeed.zeit.de/meinung/index'),
@ -33,6 +32,43 @@ class ZeitDe(BasicNewsRecipe):
('Wissen', 'http://newsfeed.zeit.de/wissen/index'), ('Wissen', 'http://newsfeed.zeit.de/wissen/index'),
] ]
def print_version(self,url): extra_css = '''
return url.replace('http://www.zeit.de/', 'http://mobil.zeit.de/') .supertitle{color:#990000; font-family:Arial,Helvetica,sans-serif;font-size:xx-small;}
.excerpt{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:large;}
.title{font-family:Arial,Helvetica,sans-serif;font-size:large}
.caption{color:#666666; font-family:Arial,Helvetica,sans-serif;font-size:xx-small;}
.copyright{color:#666666; font-family:Arial,Helvetica,sans-serif;font-size:xx-small;}
.article{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:x-small}
.headline iconportrait_inline{font-family:Arial,Helvetica,sans-serif;font-size:x-small}
'''
filter_regexps = [r'ad.de.doubleclick.net/']
keep_only_tags = [
dict(name='div', attrs={'class':["article"]}) ,
]
remove_tags = [
dict(name='link'), dict(name='iframe'),dict(name='style'),
dict(name='div', attrs={'class':["pagination block","pagenav","inline link"] }),
dict(name='div', attrs={'id':["place_5","place_4"]})
]
def get_article_url(self, article):
    """Return the article URL taken from a feed entry's ``guid`` field.

    :param article: feed entry; only ``article.get('guid', None)`` is read.
    :return: the guid, or ``None`` when the entry has no guid or when the
        guid contains ``'video'`` or ``'quiz'``.
    """
    url = article.get('guid', None)
    # Without the None guard a guid-less entry raised TypeError in the
    # substring checks instead of being rejected like video/quiz entries.
    # NOTE(review): presumably the caller skips entries that yield None -- confirm.
    if url is None or 'video' in url or 'quiz' in url:
        return None
    return url
def preprocess_html(self, soup):
    """Stamp the page with the recipe's language and a charset declaration.

    Sets the ``xml:lang`` and ``lang`` attributes of ``soup.html`` to
    ``self.lang``, then inserts a Content-Type ``<meta>`` string naming
    ``self.encoding`` at position 0 of ``soup.head``.

    :param soup: parsed page exposing ``html`` and ``head``.
    :return: the same ``soup`` object, modified in place.
    """
    for attribute in ('xml:lang', 'lang'):
        soup.html[attribute] = self.lang
    charset_meta = ''.join((
        '<meta http-equiv="Content-Type" content="text/html; charset=',
        self.encoding,
        '">',
    ))
    soup.head.insert(0, charset_meta)
    return soup
#def print_version(self,url):
# return url.replace('http://www.zeit.de/', 'http://images.zeit.de/text/').replace('?from=rss', '')