__license__ = 'GPL v3'
__copyright__ = '2009-2010, Darko Miletic '
'''
www.businessworld.in
'''

import re
from calibre.web.feeds.news import BasicNewsRecipe


class BusinessWorldMagazine(BasicNewsRecipe):
    '''
    Recipe that builds its feeds from the "Current Issue" page of
    www.businessworld.in.
    '''

    title = 'Business World Magazine'
    __author__ = 'Kovid Goyal'
    description = 'News from India'
    publisher = 'ABP Pvt Ltd Publication'
    category = 'news, politics, finances, India, Asia'
    delay = 1
    no_stylesheets = True
    INDEX = 'http://www.businessworld.in/businessworld/magazine_latest_issue.php'
    ROOT = 'http://www.businessworld.in'
    encoding = 'utf-8'
    language = 'en_IN'
    auto_cleanup = True

    def _absolute_url(self, url):
        '''
        Return `url` with site-relative and protocol-relative forms
        resolved against ROOT; any other value is returned unchanged.
        '''
        if url.startswith('//'):
            # Protocol-relative reference: reuse ROOT's scheme ("http:")
            # instead of gluing the whole ROOT in front of another host.
            return self.ROOT.split('//', 1)[0] + url
        if url.startswith('/'):
            return self.ROOT + url
        return url

    def parse_index(self):
        '''
        Scrape the current issue page.

        Opens ROOT, follows the link whose text matches "Current.*Issue"
        (case-insensitive), records the cover image URL when an element
        with class ``mag_cover`` contains an ``<img src=...>``, and then
        walks the ``<h3>``/``<h2>`` tags in document order:

        * an ``<h3>`` without a link starts a new section;
        * an ``<h2>`` containing a link is an article of the current
          section.

        Returns a list of ``(section_title, articles)`` tuples where each
        article is a dict with the keys ``title``, ``url``, ``date`` and
        ``description``. Sections without articles are omitted.
        '''
        br = self.browser
        br.open(self.ROOT)
        current_issue = br.click_link(
            text_regex=re.compile('Current.*Issue', re.I))
        raw = br.open(current_issue).read()
        soup = self.index_to_soup(raw)

        mc = soup.find(attrs={'class': 'mag_cover'})
        if mc is not None:
            img = mc.find('img', src=True)
            if img is not None:
                # Resolve the cover the same way as article links so that
                # a site-relative src still yields a fetchable URL.
                self.cover_url = self._absolute_url(img['src'])

        feeds = []
        current_section = None
        articles = []
        for tag in soup.findAll(['h3', 'h2']):
            inner_a = tag.find('a')
            # An <h3> that wraps a link is not treated as a section heading.
            if tag.name == 'h3' and inner_a is not None:
                continue
            # An <h2> is only an article if it links somewhere and a
            # section has already been opened.
            if tag.name == 'h2' and (inner_a is None or current_section is None):
                continue

            if tag.name == 'h3':
                # Flush the section collected so far before starting anew.
                if current_section is not None and articles:
                    feeds.append((current_section, articles))
                current_section = self.tag_to_string(tag)
                self.log('Found section:', current_section)
                articles = []
            elif tag.name == 'h2':
                url = inner_a.get('href', None)
                if url is None:
                    continue
                url = self._absolute_url(url)
                title = self.tag_to_string(inner_a)
                # A preceding sibling <h1>, when present, is prepended to
                # the link text to form the article title.
                h1 = tag.findPreviousSibling('h1')
                if h1 is not None:
                    title = self.tag_to_string(h1) + title
                self.log('\tFound article:', title)
                articles.append({'title': title, 'url': url, 'date': '',
                                 'description': ''})

        # Flush the last section, which no following <h3> closes.
        if current_section and articles:
            feeds.append((current_section, articles))

        return feeds