Fix #3416 (Recipe Spiegel Online - German => no articles)

This commit is contained in:
Kovid Goyal 2009-09-19 20:53:04 -06:00
parent ebfc8ec40f
commit 4efa4d7bb1
2 changed files with 17 additions and 36 deletions

View File

@ -24,7 +24,6 @@ class DerStandardRecipe(BasicNewsRecipe):
oldest_article = 1
max_articles_per_feed = 100
extra_css = '''
.artikelBody{font-family:Arial,Helvetica,sans-serif;}
.artikelLeft{font-family:Arial,Helvetica,sans-serif;font-size:x-small;}
@ -59,14 +58,15 @@ class DerStandardRecipe(BasicNewsRecipe):
# Pattern list: '/r' followed by zero or more digits 1-9 (so it matches any
# '/r' occurrence). NOTE(review): presumably links matching an entry are
# skipped by the framework -- confirm against BasicNewsRecipe.
filter_regexps = [r'/r[1-9]*']
# Disabled alternative that would rewrite '?id=' to 'txt/?id=' in the URL.
#def print_version(self, url):
# return url.replace('?id=', 'txt/?id=')
def get_article_url(self, article):
    '''Return the link of *article*, or None if the article must be skipped.

    An article is skipped when its link points to an index page (ressort),
    when its title marks it as a picture gallery (ansichtssache), or when
    its link matches the pattern /r[1-9]*.
    '''
    link = article.link
    # The title is only inspected when the link is not an index page.
    if 'ressort' in link or 'ansichtssache' in article.title.lower():
        return None
    # NOTE(review): '[1-9]*' also accepts zero digits, so every link that
    # contains '/r' is rejected here.
    if re.search(r'/r[1-9]*', link) is not None:
        return None
    return link

View File

@ -7,7 +7,6 @@ spiegel.de
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class Spiegel_ger(BasicNewsRecipe):
title = 'Spiegel Online - German'
@ -17,49 +16,31 @@ class Spiegel_ger(BasicNewsRecipe):
# Recipe settings. 'description' and 'publisher' used below are not defined
# in the visible lines; they are expected to be set earlier in the class.
category = 'SPIEGEL ONLINE, DER SPIEGEL, Nachrichten, News,Dienste, RSS, RSS, Feedreader, Newsfeed, iGoogle, Netvibes, Widget'
oldest_article = 7
max_articles_per_feed = 100
# NOTE(review): 'language' is assigned twice with the same value; the second
# assignment is redundant.
language = 'de'
language = 'de'
# Language tag stored in conversion_options['language'] below.
lang = 'de-DE'
no_stylesheets = True
use_embedded_content = False
encoding = 'cp1252'
# Command-line style option list carrying comment, category and publisher.
html2lrf_options = [
'--comment', description
, '--category', category
, '--publisher', publisher
]
# The same metadata as a dict, plus the language tag.
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : lang
}
# The same metadata again as newline-separated key="value" pairs.
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
# NOTE(review): keep_only_tags is assigned twice; the later value
# (div#spArticleContent) overrides the earlier one (div#spMainContent).
keep_only_tags = [dict(name='div', attrs={'id':'spMainContent'})]
keep_only_tags = [dict(name='div', attrs={'id':'spArticleContent'})]
# NOTE(review): remove_tags is assigned twice; the later value wins and
# additionally lists 'iframe'.
remove_tags = [dict(name=['object','link','base'])]
remove_tags = [dict(name=['object','link','base','iframe'])]
remove_tags_after = dict(name='div', attrs={'id':'spArticleBody'})
# Single feed given as a (title, url) pair.
feeds = [(u'Spiegel Online', u'http://www.spiegel.de/schlagzeilen/index.rss')]
def print_version(self, url):
    '''Return the print ("druck") variant of an article URL.

    Any '#fragment' is dropped first. The remaining URL is split at its
    last two commas and 'druck-' is inserted in front of the
    second-to-last comma-separated component, e.g.
    '.../0,1518,650000,00.html#ref=rss' -> '.../0,1518,druck-650000,00.html'.

    The URL is expected to contain at least two commas.
    '''
    # partition() keeps the whole URL when there is no '#'. The previous
    # rpartition('#')[0] yielded '' in that case, which turned every
    # fragment-less URL into ',druck-,'.
    base = url.partition('#')[0]
    head, _, tail = base.rpartition(',')
    prefix, _, second_last = head.rpartition(',')
    return prefix + ',druck-' + second_last + ',' + tail
def preprocess_html(self, soup):
    '''Normalise the parsed article before conversion and return the soup.

    Adds Content-Language and Content-Type <meta> tags at the start of
    <head>, removes every inline style attribute and, when the document has
    no <html> element, creates one and moves <head> and <body> into it.

    The statements that used to follow the return referenced names that are
    never defined in this method (rmain, rrest, rest) and could not run;
    they have been removed.
    '''
    lang_meta = Tag(soup, 'meta', [("http-equiv", "Content-Language"), ("content", self.lang)])
    charset_meta = Tag(soup, 'meta', [("http-equiv", "Content-Type"), ("content", "text/html; charset=UTF-8")])
    soup.head.insert(0, lang_meta)
    soup.head.insert(1, charset_meta)

    # Strip inline styling from every element that carries a style attribute.
    for styled in soup.findAll(style=True):
        del styled['style']

    if not soup.find('html'):
        # No <html> wrapper: create one and re-parent head and body under it.
        wrapper = Tag(soup, 'html', [("lang", self.lang), ("xml:lang", self.lang), ("dir", "ltr")])
        soup.insert(0, wrapper)
        head = soup.head
        body = soup.body
        head.extract()
        body.extract()
        # Body is inserted first so that head ends up in front of it.
        soup.html.insert(0, body)
        soup.html.insert(0, head)
    return soup