From f457341a363a784ba7d1772cf81c51adb7b5d82e Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Fri, 4 Sep 2009 09:41:37 -0600
Subject: [PATCH] Fix Newsweek recipe

---
 .../web/feeds/recipes/recipe_economist.py     |  4 +++-
 .../web/feeds/recipes/recipe_newsweek.py      | 23 ++++++++++++++-----
 2 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/src/calibre/web/feeds/recipes/recipe_economist.py b/src/calibre/web/feeds/recipes/recipe_economist.py
index de557b9cf7..6965cfe0c4 100644
--- a/src/calibre/web/feeds/recipes/recipe_economist.py
+++ b/src/calibre/web/feeds/recipes/recipe_economist.py
@@ -62,7 +62,9 @@ class Economist(BasicNewsRecipe):
                 a = tag.find('a', href=True)
                 if a is not None:
                     url=a['href'].replace('displaystory', 'PrinterFriendly').strip()
-                    if url.startswith('/') or url.startswith('PrinterF'):
+                    if url.startswith('Printer'):
+                        url = '/'+url
+                    if url.startswith('/'):
                         url = 'http://www.economist.com' + url
                     try:
                         subtitle = tag.previousSibling.contents[0].contents[0]
diff --git a/src/calibre/web/feeds/recipes/recipe_newsweek.py b/src/calibre/web/feeds/recipes/recipe_newsweek.py
index 9a6ef77cee..b0e86e8adb 100644
--- a/src/calibre/web/feeds/recipes/recipe_newsweek.py
+++ b/src/calibre/web/feeds/recipes/recipe_newsweek.py
@@ -89,6 +89,17 @@ class Newsweek(BasicNewsRecipe):
             return cmp(tx, ty)
         return sorted(ans, cmp=fcmp)
 
+    def ensure_html(self, soup):
+        root = soup.find(name=True)
+        if root.name == 'html': return soup
+        nsoup = BeautifulSoup('<html><head></head><body/></html>')
+        nroot = nsoup.find(name='body')
+        for x in soup.contents:
+            if getattr(x, 'name', False):
+                x.extract()
+                nroot.insert(len(nroot), x)
+        return nsoup
+
     def postprocess_html(self, soup, first_fetch):
         if not first_fetch:
             h1 = soup.find(id='headline')
@@ -99,7 +110,7 @@ class Newsweek(BasicNewsRecipe):
                 div.extract()
         divs = list(soup.findAll('div', 'pagination'))
         if not divs:
-            return soup
+            return self.ensure_html(soup)
         for div in divs[1:]: div.extract()
         all_a = divs[0].findAll('a', href=True)
         divs[0]['style']="display:none"
@@ -109,7 +120,7 @@ class Newsweek(BasicNewsRecipe):
         for a in soup.findAll('a', href=test):
             if a not in all_a:
                 del a['href']
-        return soup
+        return self.ensure_html(soup)
 
     def get_current_issue(self):
         soup = self.index_to_soup('http://www.newsweek.com')
@@ -132,7 +143,7 @@ class Newsweek(BasicNewsRecipe):
     def postprocess_book(self, oeb, opts, log) :
 
         def extractByline(href) :
-            soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
+            soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
             byline = soup.find(True,attrs={'class':'authorInfo'})
             byline = self.tag_to_string(byline) if byline is not None else ''
             issueDate = soup.find(True,attrs={'class':'issueDate'})
@@ -142,7 +153,7 @@ class Newsweek(BasicNewsRecipe):
                 return byline + ' | ' + issueDate
             else :
                 return byline + issueDate
-
+
         def extractDescription(href) :
             soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
             description = soup.find(True,attrs={'name':'description'})
@@ -156,8 +167,8 @@ class Newsweek(BasicNewsRecipe):
                 description = soup.find(True, attrs={'class':'story'})
                 firstPara = soup.find('p')
                 description = self.tag_to_string(firstPara)
-            return description
-
+            return description
+
         for section in oeb.toc :
             for article in section :
                 if article.author is None :