# Provenance: content of resources/recipes/tagesspiegel.recipe as introduced by
# git commit 6de89762ee42194e564d0c9fa546f20305c1b4c7
#   From:    Kovid Goyal
#   Date:    Fri, 21 May 2010 09:47:48 -0600
#   Subject: [PATCH] Der Tagesspiegel by ipaschke
# The patch created this file (new file mode 100644).

__license__ = 'GPL v3'
__copyright__ = '2010 Ingo Paschke '

'''
Fetch Tagesspiegel.
'''
import re
import string

from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe


class TagesspiegelRSS(BasicNewsRecipe):
    '''
    Recipe for the German daily "Der Tagesspiegel".

    The feeds are not read from RSS; parse_index() scrapes the section
    overview page and groups the teasers found there by section header.
    '''

    title = u'Der Tagesspiegel'
    __author__ = 'ipaschke'
    language = 'de'
    oldest_article = 7
    max_articles_per_feed = 100

    # Site root, prepended to the site-relative links found on the index page.
    BASE_URL = 'http://www.tagesspiegel.de'

    extra_css = '''
        .hcf-overline{color:#990000; font-family:Arial,Helvetica,sans-serif;font-size:xx-small;display:block}
        .hcf-teaser{font-family:Verdana,Arial,Helvetica;font-size:x-small;margin-top:0}
        h1{font-family:Arial,Helvetica,sans-serif;font-size:large;clear:right;}
        .hcf-caption{color:#666666; font-family:Arial,Helvetica,sans-serif;font-size:xx-small;}
        .hcf-copyright{color:#666666; font-family:Arial,Helvetica,sans-serif;font-size:xx-small;}
        .hcf-article{font-family:Arial,Helvetica;font-size:x-small}
        .quote{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:x-small}
        .quote .cite{font-family:Georgia,Palatino,Palatino Linotype,FreeSerif,serif;font-size:xx-small}
        .hcf-inline-left{float:left;margin-right:15px;position:relative;}
        .hcf-inline-right{float:right;margin-right:15px;position:relative;}
        .hcf-smart-box{font-family: Arial, Helvetica, sans-serif; font-size: xx-small; margin: 0px 15px 8px 0px; width: 300px;}
    '''

    no_stylesheets = True
    # NOTE(review): the documented BasicNewsRecipe attribute appears to be
    # `remove_javascript`; confirm that `no_javascript` has any effect.
    no_javascript = True
    remove_empty_feeds = True
    encoding = 'utf-8'

    # A list of tag specifications, in the same form as remove_tags below.
    keep_only_tags = [dict(name='div', attrs={'class': ["hcf-article"]})]
    remove_tags = [
        dict(name='link'),
        dict(name='iframe'),
        dict(name='style'),
        dict(name='meta'),
        dict(name='button'),
        dict(name='div', attrs={'class': ["hcf-jump-to-comments", "hcf-clear", "hcf-magnify hcf-media-control"]}),
        dict(name='span', attrs={'class': ["hcf-mainsearch", ]}),
        dict(name='ul', attrs={'class': ["hcf-tools"]}),
    ]

    def parse_index(self):
        '''
        Scrape the section overview page and group its teasers into feeds.

        Elements with class ``hcf-header`` start a new section; elements with
        class ``hcf-teaser`` whose first child is an ``h2`` are articles of
        the section currently open. Teasers that appear before any header are
        collected under ``'Uncategorized'``. Links containing ``podcasts``
        are skipped.

        Returns a list of ``(section title, articles)`` tuples in page order,
        where each article is a dict with the keys ``title``, ``url``,
        ``date``, ``description`` and ``content``.
        '''
        soup = self.index_to_soup(self.BASE_URL + '/zeitung/')

        def feed_title(tag):
            # Join only the tag's own text nodes; nested markup is ignored.
            return ''.join(tag.findAll(text=True, recursive=False)).strip()

        articles = {}      # section title -> list of article dicts
        sections = []      # section titles in order of first appearance
        current = None     # title of the section currently being filled
        pubdate = strftime('%a, %d %b')  # same for every article, so compute once

        wanted = ['hcf-teaser', 'hcf-header', 'story headline']
        for div in soup.findAll(True, attrs={'class': wanted}):
            css_class = div['class']

            if css_class == 'hcf-header':
                # A header without <em><a> cannot name a section; skip it
                # instead of failing the whole download.
                link = getattr(div.em, 'a', None)
                if link is None:
                    continue
                current = string.capwords(feed_title(link))
                # Register each section once, so a repeated header neither
                # wipes articles already collected nor duplicates the feed.
                if current not in articles:
                    articles[current] = []
                    sections.append(current)

            elif css_class == 'hcf-teaser':
                # The summary <p class="hcf-teaser"> matches the search too
                # and may be empty, so guard the first-child lookup.
                first = div.contents[0] if div.contents else None
                if getattr(first, 'name', '') != 'h2':
                    continue

                a = div.find('a', href=True)
                if a is None:
                    continue

                href = a['href']
                if href.startswith(('http://', 'https://')):
                    url = href  # already absolute, do not prepend the site root
                else:
                    url = self.BASE_URL + href
                if 'podcasts' in url:
                    continue

                title = self.tag_to_string(a, use_alt=True).strip()
                description = ''
                summary = div.find('p', attrs={'class': 'hcf-teaser'})
                if summary is not None:
                    description = self.tag_to_string(summary, use_alt=False)

                feed = current if current is not None else 'Uncategorized'
                if feed not in articles:
                    # Also record the fallback section, otherwise its
                    # articles would never be returned.
                    articles[feed] = []
                    sections.append(feed)

                articles[feed].append(
                    dict(title=title, url=url, date=pubdate,
                         # drop the trailing "mehr" ("more") link text
                         description=re.sub('mehr$', '', description),
                         content=''))

        return [(name, articles[name]) for name in sections]