From 941d5dbda5428bb4a181aa8b486bcaf632897ff1 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 1 Jan 2011 10:45:27 -0700
Subject: [PATCH] Updated Brand Eins

---
 resources/recipes/brand_eins.recipe | 42 ++++++++++++++++++++---------
 1 file changed, 29 insertions(+), 13 deletions(-)

diff --git a/resources/recipes/brand_eins.recipe b/resources/recipes/brand_eins.recipe
index 3d62079716..9b77c7f279 100644
--- a/resources/recipes/brand_eins.recipe
+++ b/resources/recipes/brand_eins.recipe
@@ -1,19 +1,16 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 mode: python -*-
 
-# Find the newest version of this recipe here:
-# https://github.com/consti/BrandEins-Recipe/raw/master/brandeins.recipe
-
 __license__ = 'GPL v3'
 __copyright__ = '2010, Constantin Hofstetter <consti at consti.de>, Steffen Siebert <calibre at steffensiebert.de>'
-__version__ = '0.96'
+__version__ = '0.97'
 
 ''' http://brandeins.de - Wirtschaftsmagazin '''
 import re
 import string
+from calibre.ebooks.BeautifulSoup import Tag
 from calibre.web.feeds.recipes import BasicNewsRecipe
 
-
 class BrandEins(BasicNewsRecipe):
 
     title = u'brand eins'
@@ -28,6 +25,8 @@ class BrandEins(BasicNewsRecipe):
     language = 'de'
     publication_type = 'magazine'
     needs_subscription = 'optional'
+    # Prevent the conversion date from being appended to the title
+    timefmt = ''
 
     # 2 is the last full magazine (default)
     # 1 is the newest (but not full)
@@ -66,6 +65,13 @@ class BrandEins(BasicNewsRecipe):
             new_p = "<p><i>"+ content +"</i></p>"
             p.replaceWith(new_p)
 
+        # Change <h3> to <h1>
+        header = soup.find("h3")
+        if header:
+            tag = Tag(soup, "h1")
+            tag.insert(0, header.contents[0])
+            header.replaceWith(tag)
+
         return soup
 
     def get_cover(self, soup):
@@ -77,6 +83,7 @@ class BrandEins(BasicNewsRecipe):
 
     def parse_index(self):
        feeds = []
+        issue_map = {}
 
         archive = "http://www.brandeins.de/archiv.html"
 
@@ -88,21 +95,31 @@ class BrandEins(BasicNewsRecipe):
                 pass
 
         soup = self.index_to_soup(archive)
-        latest_jahrgang = soup.findAll('div', attrs={'class': re.compile(r'\bjahrgang-latest\b') })[0].findAll('ul')[0]
-        pre_latest_issue = latest_jahrgang.findAll('a')[len(latest_jahrgang.findAll('a'))-issue]
-        url = pre_latest_issue.get('href', False)
-        # Get month and year of the magazine issue - build it out of the title of the cover
-        self.timefmt = " " + re.search(r"(?P<date>\d\d\/\d\d\d\d)", pre_latest_issue.find('img').get('title', False)).group('date')
+        issue_list = soup.findAll('div', attrs={'class': 'tx-brandeinsmagazine-pi1'})[0].findAll('a')
+        issue_list = [i for i in issue_list if i.get('onmouseover', False)]
+        for i in issue_list:
+            issue_number_string = i.get('onmouseover', False)
+            if issue_number_string:
+                match = re.match("^switch_magazine\(([0-9]+), ([0-9]+)\)$", issue_number_string)
+                issue_number = "%04i%02i" % (int(match.group(1)), int(match.group(2)))
+                issue_map[issue_number] = i
+        keys = issue_map.keys()
+        keys.sort()
+        keys.reverse()
+        selected_issue = issue_map[keys[issue-1]]
+        url = selected_issue.get('href', False)
+        # Get the title for the magazine - build it from the title of the cover (issue and year)
+        self.title = "brand eins "+ re.search(r"(?P<date>\d\d\/\d\d\d\d)", selected_issue.find('img').get('title', False)).group('date')
         url = 'http://brandeins.de/'+url
 
         # url = "http://www.brandeins.de/archiv/magazin/tierisch.html"
-        titles_and_articles = self.brand_eins_parse_latest_issue(url)
+        titles_and_articles = self.brand_eins_parse_issue(url)
         if titles_and_articles:
             for title, articles in titles_and_articles:
                 feeds.append((title, articles))
         return feeds
 
-    def brand_eins_parse_latest_issue(self, url):
+    def brand_eins_parse_issue(self, url):
         soup = self.index_to_soup(url)
         self.cover_url = self.get_cover(soup)
         article_lists = [soup.find('div', attrs={'class':'subColumnLeft articleList'}), soup.find('div', attrs={'class':'subColumnRight articleList'})]
@@ -145,4 +162,3 @@ class BrandEins(BasicNewsRecipe):
             current_articles.append({'title': title, 'url': url, 'description': description, 'date':''})
         titles_and_articles.append([chapter_title, current_articles])
         return titles_and_articles
-
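Reviewer note: the core of this update is the new issue-selection logic in parse_index(), which keys each archive link by a sortable "YYYYMM" string taken from its onmouseover="switch_magazine(year, month)" handler and then picks the Nth newest entry, where N is the recipe's issue setting. Below is a minimal, standalone sketch of just that selection step; the anchor data and hrefs are invented for illustration, and the real recipe reads them from http://www.brandeins.de/archiv.html via BeautifulSoup.

# Standalone sketch of the issue-selection logic added in parse_index().
# The anchors below are made up; the recipe extracts real ones from the archive page.
import re

fake_anchors = [
    {'onmouseover': 'switch_magazine(2010, 11)', 'href': 'archiv/magazin/a.html'},
    {'onmouseover': 'switch_magazine(2010, 12)', 'href': 'archiv/magazin/b.html'},
    {'onmouseover': 'switch_magazine(2011, 1)',  'href': 'archiv/magazin/c.html'},
]

issue = 2  # 1 = newest issue, 2 = last complete issue (the recipe's default)

issue_map = {}
for a in fake_anchors:
    match = re.match(r"^switch_magazine\(([0-9]+), ([0-9]+)\)$", a['onmouseover'])
    if match:
        # Zero-padded "YYYYMM" key, so plain string sorting orders issues chronologically.
        key = "%04i%02i" % (int(match.group(1)), int(match.group(2)))
        issue_map[key] = a

keys = sorted(issue_map, reverse=True)  # newest issue first
selected = issue_map[keys[issue - 1]]   # the Nth most recent magazine
print(selected['href'])                 # -> archiv/magazin/b.html (the 12/2010 issue)

To exercise the real code path, the usual calibre workflow of running ebook-convert on the recipe file with the --test switch should be enough to confirm that the right issue and title are picked up.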