From 4efa4d7bb1f497485edb29826996853037bb405d Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 19 Sep 2009 20:53:04 -0600 Subject: [PATCH] Fix #3416 (Receipt Spiegel Online - German => no articles) --- .../web/feeds/recipes/recipe_der_standard.py | 8 ++-- .../web/feeds/recipes/recipe_spiegelde.py | 45 ++++++------------- 2 files changed, 17 insertions(+), 36 deletions(-) diff --git a/src/calibre/web/feeds/recipes/recipe_der_standard.py b/src/calibre/web/feeds/recipes/recipe_der_standard.py index effda75b47..65bf9a9746 100644 --- a/src/calibre/web/feeds/recipes/recipe_der_standard.py +++ b/src/calibre/web/feeds/recipes/recipe_der_standard.py @@ -24,7 +24,6 @@ class DerStandardRecipe(BasicNewsRecipe): oldest_article = 1 max_articles_per_feed = 100 - extra_css = ''' .artikelBody{font-family:Arial,Helvetica,sans-serif;} .artikelLeft{font-family:Arial,Helvetica,sans-serif;font-size:x-small;} @@ -59,14 +58,15 @@ class DerStandardRecipe(BasicNewsRecipe): filter_regexps = [r'/r[1-9]*'] - #def print_version(self, url): - # return url.replace('?id=', 'txt/?id=') - def get_article_url(self, article): '''if the article links to a index page (ressort) or a picture gallery (ansichtssache), don't add it''' if ( article.link.count('ressort') > 0 or article.title.lower().count('ansichtssache') > 0 ): return None + matchObj = re.search( re.compile(r'/r'+'[1-9]*',flags=0), article.link,flags=0) + + if matchObj: + return None return article.link diff --git a/src/calibre/web/feeds/recipes/recipe_spiegelde.py b/src/calibre/web/feeds/recipes/recipe_spiegelde.py index 2f90bb073e..705ffd0f7a 100644 --- a/src/calibre/web/feeds/recipes/recipe_spiegelde.py +++ b/src/calibre/web/feeds/recipes/recipe_spiegelde.py @@ -7,7 +7,6 @@ spiegel.de ''' from calibre.web.feeds.news import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import Tag class Spiegel_ger(BasicNewsRecipe): title = 'Spiegel Online - German' @@ -17,49 +16,31 @@ class Spiegel_ger(BasicNewsRecipe): category = 'SPIEGEL ONLINE, DER SPIEGEL, Nachrichten, News,Dienste, RSS, RSS, Feedreader, Newsfeed, iGoogle, Netvibes, Widget' oldest_article = 7 max_articles_per_feed = 100 - language = 'de' - + language = 'de' lang = 'de-DE' no_stylesheets = True use_embedded_content = False encoding = 'cp1252' - html2lrf_options = [ - '--comment', description - , '--category', category - , '--publisher', publisher - ] + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : lang + } - html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"' - keep_only_tags = [dict(name='div', attrs={'id':'spMainContent'})] + keep_only_tags = [dict(name='div', attrs={'id':'spArticleContent'})] - remove_tags = [dict(name=['object','link','base'])] + remove_tags = [dict(name=['object','link','base','iframe'])] remove_tags_after = dict(name='div', attrs={'id':'spArticleBody'}) feeds = [(u'Spiegel Online', u'http://www.spiegel.de/schlagzeilen/index.rss')] def print_version(self, url): - main, sep, rest = url.rpartition(',') + rmt = url.rpartition('#')[0] + main, sep, rest = rmt.rpartition(',') rmain, rsep, rrest = main.rpartition(',') - return rmain + ',druck-' + rrest + ',' + rest - - def preprocess_html(self, soup): - mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)]) - mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=UTF-8")]) - soup.head.insert(0,mlang) - soup.head.insert(1,mcharset) - for item in soup.findAll(style=True): - del item['style'] - htmltag = soup.find('html') - if not htmltag: - thtml = Tag(soup,'html',[("lang",self.lang),("xml:lang",self.lang),("dir","ltr")]) - soup.insert(0,thtml) - thead = soup.head - tbody = soup.body - thead.extract() - tbody.extract() - soup.html.insert(0,tbody) - soup.html.insert(0,thead) - return soup + purl = rmain + ',druck-' + rrest + ',' + rest + return purl