Update Brand Eins

2025-08-30 23:00:21 -04:00 · 2014-11-04 09:32:49 +05:30 · 2014-11-04 09:32:49 +05:30 · 3710858a65
commit 3710858a65
parent 0cf985b850
1 changed file with 17 additions and 29 deletions
--- a/recipes/brand_eins.recipe
+++ b/recipes/brand_eins.recipe
@@ -7,6 +7,7 @@ __copyright__ = '2014, Nikolas Mangold-Takao <nmangold at gmail.com>'
 __version__   = '0.10'

 ''' http://brandeins.de - Wirtschaftsmagazin '''
+from collections import OrderedDict
 from calibre.web.feeds.recipes import BasicNewsRecipe

 class BrandEins(BasicNewsRecipe):
@@ -80,37 +81,24 @@ class BrandEins(BasicNewsRecipe):

    def parse_issue(self, url):
        soup = self.index_to_soup(url)
-        index = soup.find('div', attrs={'class': 'ihv_list'})
+        feeds = OrderedDict()

-        feeds = []
-        sections = index.findAll('section')
+        for item in soup.findAll(attrs={'class':lambda x:'ihv_item' in (x or '').split()}):
+            a = item.findParent('a', href=True)
+            if a is None:
+                continue
+            url = self.PREFIX + a['href']
+            title = self.tag_to_string(item.find(attrs={'class':'ihv_title'}))
+            sec = self.tag_to_string(item.find(attrs={'class':'ihv_page_category'}).findAll('span')[-1])
+            if sec not in feeds:
+                feeds[sec] = []
+            desc = ''
+            for p in item.findAll('p'):
+                desc += self.tag_to_string(p) + '\n'
+            feeds[sec].append({'title':title, 'url':url, 'description':desc})
+            self.log('Found article:', title, 'at', url)

-        # special treatment for 'editorial'. It is not grouped in <section> and title is not in <h3>
-        inhalt_section = index.find('h1', attrs={'class': 'reset'})
-        section_ttl = self.tag_to_string(inhalt_section)
-        #self.log('+++ Found section', section_ttl)
-        editorial_article = inhalt_section.parent.findNextSibling('a')
-        ttl = self.tag_to_string(editorial_article.find('h2', attrs={'class': 'ihv_title'}))
-        url = self.PREFIX + editorial_article['href']
-        #self.log('--- Found article', ttl, url)
-        feeds.append((section_ttl, [{'title': ttl, 'url': url}]))
-
-        #self.log('NUMBER OF SECTIONS', len(sections))
-        for section in sections:
-            section_ttl = self.tag_to_string(section.find('h3'))
-            #self.log('+++ Found section', section_ttl)
-
-            articles = []
-            for article in section.findNextSiblings(['a', 'section']):
-                if (article.name == 'section'):
-                    break
-
-                ttl = self.tag_to_string(article.find('h2', attrs={'class': 'ihv_title'}))
-                url = self.PREFIX + article['href']
-                #self.log('--- Found article', ttl, url)
-                articles.append({'title' : ttl, 'url' : url})
-            feeds.append((section_ttl, articles))
-        return feeds
+        return [(st, articles) for st, articles in feeds.iteritems() if articles]

    def get_cover_url(self):
        # the index does not contain a usable cover, but the "Welt in Zahlen"-article contains it