#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal'

from calibre.web.feeds.news import BasicNewsRecipe


def absurl(x):
    # Resolve site-relative links against the Outlook India domain.
    if x.startswith('/'):
        x = 'http://www.outlookindia.com' + x
    return x


class OutlookIndia(BasicNewsRecipe):

    title = 'Outlook India'
    __author__ = 'Kovid Goyal'
    description = 'Weekly news and current affairs in India'
    no_stylesheets = True
    encoding = 'utf-8'
    language = 'en_IN'
    ignore_duplicate_articles = {'title', 'url'}

    keep_only_tags = [
        dict(name='h1'),
        dict(attrs={'class': ['sub_head', 'magzine_stry_image', 'mainContent']}),
        # Class attributes on this site hold several space-separated values,
        # so match on set intersection rather than exact equality.
        dict(attrs={'class': lambda x: x and set(
            x.split()).intersection({'writter', 'covr_wr'})}),
    ]
    remove_tags = [
        dict(name='meta'),
    ]

    def get_browser(self):
        br = BasicNewsRecipe.get_browser(self)
        # This site sends article titles in a cookie, and these occasionally
        # contain non-ASCII characters, causing httplib to fail. Instead, just
        # disable cookies, as they are not needed for the download. The proper
        # solution would be to implement a unicode-aware cookie jar.
        br.set_cookiejar(None)
        return br

    def preprocess_raw_html(self, raw_html, url):
        # Re-parse the page with html5lib, which tolerates the site's
        # malformed markup, and hand clean HTML to the rest of the pipeline.
        import html5lib
        from lxml import html
        root = html5lib.parse(
            raw_html, treebuilder='lxml', namespaceHTMLElements=False)
        return html.tostring(root)

    def parse_index(self):
        soup = self.index_to_soup('http://www.outlookindia.com/magazine')
        # The cover image is identifiable by its file name.
        for img in soup.findAll('img', src=lambda x: x and 'Latest-Cover.jpg' in x):
            self.cover_url = absurl(img['src'])
            self.log('Found cover:', self.cover_url)
        articles = []
        # Every link into /magazine/story/ on the issue page is an article;
        # the summary, when present, sits in a sibling 'descriptn' block.
        for a in soup.findAll('a', href=lambda x: x and x.startswith('/magazine/story/')):
            url = absurl(a['href'])
            title = self.tag_to_string(a)
            desc = ''
            div = a.parent.findNextSibling(attrs={'class': 'descriptn'})
            if div is not None:
                desc = self.tag_to_string(div)
            self.log('Found article:', title, 'at', url)
            articles.append({'title': title, 'url': url, 'description': desc})
        return [('Current Issue', articles)]
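
# A minimal way to exercise this recipe locally, assuming it is saved as
# outlook_india.recipe (the filename is chosen here for illustration), is
# calibre's ebook-convert tool. --test restricts the download to a few
# articles and -vv prints verbose logs, which makes it easy to verify that
# parse_index() finds the cover and the story links before a full build:
#
#   ebook-convert outlook_india.recipe output.epub --test -vv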