New recipe for US News by Darko Miletic

Kovid Goyal 2009-04-30 17:44:23 -07:00
parent e869684a29
commit 2ee77796fb
4 changed files with 125 additions and 48 deletions

Binary file not shown (new image, 466 B).


@@ -41,7 +41,7 @@ recipe_modules = ['recipe_' + r for r in (
 'corriere_della_sera_it', 'corriere_della_sera_en', 'msdnmag_en',
 'moneynews', 'der_standard', 'diepresse', 'nzz_ger', 'hna',
 'seattle_times', 'scott_hanselman', 'coding_horror',
-'stackoverflow', 'telepolis_artikel', 'zaobao',
+'stackoverflow', 'telepolis_artikel', 'zaobao', 'usnews',
 )]
 import re, imp, inspect, time, os
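The 'usnews' entry above is the only registration the new recipe needs: every name in recipe_modules maps to a recipe_<name> module that the recipes package imports and scans for recipe classes. A rough sketch of that idea, not calibre's actual loader, and assuming a hypothetical flat module layout:

    # Sketch only: collect the classes defined in each recipe_<name> module.
    # calibre's real loader differs in detail (it looks for its recipe base classes).
    import importlib, inspect

    def load_recipe_classes(names):
        classes = []
        for name in names:
            mod = importlib.import_module('recipe_' + name)  # e.g. recipe_usnews
            for _, obj in inspect.getmembers(mod, inspect.isclass):
                if obj.__module__ == mod.__name__:           # keep classes defined in the module itself
                    classes.append(obj)
        return classes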


@@ -1,47 +1,64 @@
 #!/usr/bin/env python
 __license__ = 'GPL v3'
-__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
+__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
 '''
-Fetch Spiegel Online.
+spiegel.de
 '''
-import re
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
-class SpeigelOnline(BasicNewsRecipe):
-    title = 'Spiegel Online'
-    description = 'Nachrichten des Magazins Der Spiegel'
-    __author__ = 'Kovid Goyal'
-    use_embedded_content = False
+class Spiegel_ger(BasicNewsRecipe):
+    title = 'Spiegel Online - German'
+    __author__ = 'Darko Miletic'
+    description = "Immer die neueste Meldung auf dem Schirm, sekundenaktuell und uebersichtlich: Mit dem RSS-Angebot von SPIEGEL ONLINE entgeht Ihnen keine wichtige Meldung mehr, selbst wenn Sie keinen Internet-Browser geoeffnet haben. Sie koennen unsere Nachrichten-Feeds ganz einfach abonnieren - unkompliziert, kostenlos und nach Ihren persoenlichen Themen-Vorlieben."
+    publisher = 'SPIEGEL ONLINE Gmbh'
+    category = 'SPIEGEL ONLINE, DER SPIEGEL, Nachrichten, News,Dienste, RSS, RSS, Feedreader, Newsfeed, iGoogle, Netvibes, Widget'
+    oldest_article = 7
+    max_articles_per_feed = 100
+    language = _('German')
-    timefmt = ' [ %Y-%m-%d %a]'
-    max_articles_per_feed = 40
+    lang = 'de-DE'
     no_stylesheets = True
+    use_embedded_content = False
+    encoding = 'cp1252'
-    preprocess_regexps = \
-        [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
-          [
-            # Remove Zum Thema footer
-            (r'<div class="spArticleCredit.*?</body>', lambda match: '</body>'),
-          ]
+    html2lrf_options = [
+                          '--comment', description
+                        , '--category', category
+                        , '--publisher', publisher
+                       ]
-    feeds= [ ('Spiegel Online', 'http://www.spiegel.de/schlagzeilen/rss/0,5291,,00.xml') ]
+    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
+    keep_only_tags = [dict(name='div', attrs={'id':'spMainContent'})]
-    def print_version(self,url):
-        tokens = url.split(',')
-        tokens[-2:-2] = ['druck|']
-        return ','.join(tokens).replace('|,','-')
+    remove_tags = [dict(name=['object','link','base'])]
-    def postprocess_html(self, soup, first_fetch):
-        if soup.contents[0].name == 'head':
-            x = BeautifulSoup('<html></html>')
-            for y in reversed(soup.contents):
-                x.contents[0].insert(0, y)
-            soup = x
+    remove_tags_after = dict(name='div', attrs={'id':'spArticleBody'})
+    feeds = [(u'Spiegel Online', u'http://www.spiegel.de/schlagzeilen/index.rss')]
+    def print_version(self, url):
+        main, sep, rest = url.rpartition(',')
+        rmain, rsep, rrest = main.rpartition(',')
+        return rmain + ',druck-' + rrest + ',' + rest
+    def preprocess_html(self, soup):
+        mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
+        mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=UTF-8")])
+        soup.head.insert(0,mlang)
+        soup.head.insert(1,mcharset)
+        for item in soup.findAll(style=True):
+            del item['style']
+        htmltag = soup.find('html')
+        if not htmltag:
+            thtml = Tag(soup,'html',[("lang",self.lang),("xml:lang",self.lang),("dir","ltr")])
+            soup.insert(0,thtml)
+            thead = soup.head
+            tbody = soup.body
+            thead.extract()
+            tbody.extract()
+            soup.html.insert(0,tbody)
+            soup.html.insert(0,thead)
         return soup
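The reworked Spiegel print_version derives the print page by splitting the article URL at its last two commas and inserting the 'druck-' marker. A minimal illustration, using a made-up article URL of the usual Spiegel shape:

    url = 'http://www.spiegel.de/politik/ausland/0,1518,621345,00.html'  # hypothetical example

    main, sep, rest = url.rpartition(',')       # main = '...0,1518,621345', rest = '00.html'
    rmain, rsep, rrest = main.rpartition(',')   # rmain = '...0,1518', rrest = '621345'
    print(rmain + ',druck-' + rrest + ',' + rest)
    # http://www.spiegel.de/politik/ausland/0,1518,druck-621345,00.html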


@@ -0,0 +1,60 @@
+#!/usr/bin/env python
+__license__ = 'GPL v3'
+__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
+'''
+www.usnews.com
+'''
+from calibre.web.feeds.news import BasicNewsRecipe
+class LaPrensa(BasicNewsRecipe):
+    title = 'US & World Report news'
+    __author__ = 'Darko Miletic'
+    description = 'News from USA and world'
+    publisher = 'U.S.News & World Report, L.P.'
+    category = 'news, politics, USA'
+    oldest_article = 2
+    max_articles_per_feed = 100
+    no_stylesheets = True
+    use_embedded_content = False
+    encoding = 'utf-8'
+    language = _('English')
+    html2lrf_options = [
+                          '--comment', description
+                        , '--category', category
+                        , '--publisher', publisher
+                       ]
+    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
+    keep_only_tags = [
+                        dict(name='h1')
+                       ,dict(name='div', attrs={'id':['dateline']})
+                       ,dict(name='div', attrs={'class':['blogCredit','body']})
+                     ]
+    feeds = [
+              (u'Homepage'         , u'http://www.usnews.com/rss/usnews.rss'          )
+             ,(u'Health'           , u'http://www.usnews.com/rss/health/index.rss'    )
+             ,(u'Nation & World'   , u'http://www.usnews.com/rss/news/index.rss'      )
+             ,(u'Money & Business' , u'http://www.usnews.com/rss/business/index.rss'  )
+             ,(u'Education'        , u'http://www.usnews.com/rss/education/index.rss' )
+             ,(u'Opinion'          , u'http://www.usnews.com/rss/opinion/index.rss'   )
+             ,(u'Science'          , u'http://www.usnews.com/rss/science/index.rss'   )
+            ]
+    def print_version(self, url):
+        return url.replace('.html','_print.html')
+    def get_article_url(self, article):
+        raw = article.get('link', None)
+        artcl, sep, unneeded = raw.rpartition('?')
+        return artcl
+    def preprocess_html(self, soup):
+        del soup.body['onload']
+        for item in soup.findAll(style=True):
+            del item['style']
+        return soup
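For US News, get_article_url strips the feed link's query string and print_version swaps the page for its print variant. A small sketch with a hypothetical feed link (note that str.rpartition returns two empty strings plus the original string when '?' is absent, so a link without a query string would come back empty):

    link = 'http://www.usnews.com/articles/news/2009/04/30/example-story.html?s_cid=rss'  # hypothetical

    artcl, sep, unneeded = link.rpartition('?')    # drops the '?s_cid=rss' tracking suffix
    print(artcl)                                   # .../example-story.html
    print(artcl.replace('.html', '_print.html'))   # .../example-story_print.html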