From d0a1ce48258e6c16237c109eeb50f2efa28c2bce Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Thu, 30 Jul 2009 12:52:55 -0600
Subject: [PATCH] Improved Newsweek recipe (thanks to GRiker)

---
Review notes: the added lines were revised after the original submission, so
the diffstat and hunk counts below reflect the revision, while the post-image
blob id on the "index" line is still the one from the original submission.
Leading whitespace of the context lines was reconstructed and should be
checked against the target file before applying.

 .../web/feeds/recipes/recipe_newsweek.py      | 45 +++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/src/calibre/web/feeds/recipes/recipe_newsweek.py b/src/calibre/web/feeds/recipes/recipe_newsweek.py
index c7f043ef74..9a6ef77cee 100644
--- a/src/calibre/web/feeds/recipes/recipe_newsweek.py
+++ b/src/calibre/web/feeds/recipes/recipe_newsweek.py
@@ -4,6 +4,7 @@ __license__ = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal '
 import re
 from calibre import strftime
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from calibre.web.feeds.news import BasicNewsRecipe

 class Newsweek(BasicNewsRecipe):
@@ -128,3 +129,47 @@ class Newsweek(BasicNewsRecipe):
         return cover_url


+    def postprocess_book(self, oeb, opts, log):
+        '''
+        Fill in missing author and description metadata for the articles in
+        the table of contents, using the HTML stored in the book manifest.
+
+        :param oeb: The book; ``oeb.toc`` and ``oeb.manifest.hrefs`` are used.
+        :param opts: Unused.
+        :param log: Unused.
+        '''
+
+        def text_of(tag):
+            # Missing elements yield an empty string rather than an error
+            return self.tag_to_string(tag) if tag is not None else ''
+
+        def extract_byline(soup):
+            # "<author> | <issue date>", or whichever of the two is present
+            byline = text_of(soup.find(True, attrs={'class':'authorInfo'}))
+            issue_date = text_of(soup.find(True, attrs={'class':'issueDate'}))
+            issue_date = issue_date.replace(',', '')
+            return ' | '.join([part for part in (byline, issue_date) if part])
+
+        def extract_description(soup):
+            meta = soup.find(True, attrs={'name':'description'})
+            content = meta.get('content') if meta is not None else None
+            # Pages that only carry the generic site blurb (or no usable
+            # description at all) fall back to the article's opening paragraph
+            if content and not content.startswith('Newsweek magazine online plus'):
+                return content
+            story = soup.find(True, attrs={'class':'story'})
+            # Look inside the story container when there is one, so that a
+            # paragraph elsewhere on the page is not picked up by mistake
+            container = story if story is not None else soup
+            return text_of(container.find('p'))
+
+        for section in oeb.toc:
+            for article in section:
+                if article.author is not None and article.description is not None:
+                    continue
+                # Parse each article once and share the soup between extractors
+                soup = BeautifulSoup(str(oeb.manifest.hrefs[article.href]))
+                if article.author is None:
+                    article.author = extract_byline(soup)
+                if article.description is None:
+                    article.description = extract_description(soup)