From b13b7f8a504d9d95b42091c5f5faf0c71d0db3f9 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 23 Apr 2012 13:11:24 +0530 Subject: [PATCH] Fix Der Tagesspiegel --- recipes/tagesspiegel.recipe | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/recipes/tagesspiegel.recipe b/recipes/tagesspiegel.recipe index 92d88d56ae..71191065f1 100644 --- a/recipes/tagesspiegel.recipe +++ b/recipes/tagesspiegel.recipe @@ -34,7 +34,7 @@ class TagesspiegelRSS(BasicNewsRecipe): no_javascript = True remove_empty_feeds = True encoding = 'utf-8' - remove_tags = [{'class':'hcf-header'}, {'class':'hcf-atlas'}, {'class':'hcf-date hcf-separate'}] + remove_tags = [{'class':'hcf-header'}, {'class':'hcf-atlas'}, {'class':'hcf-colon'}, {'class':'hcf-date hcf-separate'}] def print_version(self, url): url = url.split('/') @@ -51,6 +51,7 @@ class TagesspiegelRSS(BasicNewsRecipe): return ''.join(div.findAll(text=True, recursive=False)).strip() if div is not None else None articles = {} + links = set() key = None ans = [] maincol = soup.find('div', attrs={'class':re.compile('hcf-main-col')}) @@ -59,7 +60,7 @@ class TagesspiegelRSS(BasicNewsRecipe): if div['class'] == 'hcf-header': try: - key = string.capwords(feed_title(div.em.a)) + key = string.capwords(feed_title(div.em)) articles[key] = [] ans.append(key) except: @@ -70,6 +71,12 @@ class TagesspiegelRSS(BasicNewsRecipe): if not a: continue url = 'http://www.tagesspiegel.de' + a['href'] + + # check for duplicates + if url in links: + continue + links.add(url) + title = self.tag_to_string(a, use_alt=True).strip() description = '' pubdate = strftime('%a, %d %b')