From be7114ba3acc8c650b48ff8881f40ae704b20c5f Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 12 Sep 2011 21:21:21 -0600 Subject: [PATCH] ... --- recipes/businessworldin.recipe | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/recipes/businessworldin.recipe b/recipes/businessworldin.recipe index e44682d7e1..a4c774ccdb 100644 --- a/recipes/businessworldin.recipe +++ b/recipes/businessworldin.recipe @@ -4,6 +4,7 @@ __copyright__ = '2009-2010, Darko Miletic ' www.businessworld.in ''' +import re from calibre import strftime from calibre.web.feeds.news import BasicNewsRecipe @@ -15,7 +16,7 @@ class BusinessWorldMagazine(BasicNewsRecipe): category = 'news, politics, finances, India, Asia' delay = 1 no_stylesheets = True - INDEX = 'http://www.businessworld.in/bw/Magazine_Current_Issue' + INDEX = 'http://www.businessworld.in/businessworld/magazine_latest_issue.php' ROOT = 'http://www.businessworld.in' use_embedded_content = False encoding = 'utf-8' @@ -38,13 +39,17 @@ class BusinessWorldMagazine(BasicNewsRecipe): if litem == url: return True return False - - + + def parse_index(self): articles = [] linklist = [] - soup = self.index_to_soup(self.INDEX) - + br = self.browser + br.open(self.ROOT) + raw = br.open(br.click_link(text_regex=re.compile('Current.*Issue', + re.I))).read() + soup = self.index_to_soup(raw) + tough = soup.find('div', attrs={'id':'tough'}) if tough: for item in tough.findAll('h1'): @@ -63,7 +68,7 @@ class BusinessWorldMagazine(BasicNewsRecipe): ,'description':description }) linklist.append(url) - + for item in soup.findAll('div', attrs={'class':'nametitle'}): description = '' title_prefix = '' @@ -82,7 +87,7 @@ class BusinessWorldMagazine(BasicNewsRecipe): linklist.append(url) return [(soup.head.title.string, articles)] - + keep_only_tags = [dict(name='div', attrs={'id':'printwrapper'})] remove_tags = [dict(name=['object','link','meta','base','iframe','link','table'])]