diff --git a/src/calibre/web/feeds/recipes/recipe_outlook_india.py b/src/calibre/web/feeds/recipes/recipe_outlook_india.py index 5c440b424d..e931c89b4a 100644 --- a/src/calibre/web/feeds/recipes/recipe_outlook_india.py +++ b/src/calibre/web/feeds/recipes/recipe_outlook_india.py @@ -3,6 +3,8 @@ __license__ = 'GPL v3' __copyright__ = '2009, Kovid Goyal ' import re +from calibre import strftime +from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag from calibre.web.feeds.news import BasicNewsRecipe class OutlookIndia(BasicNewsRecipe): @@ -13,10 +15,19 @@ class OutlookIndia(BasicNewsRecipe): no_stylesheets = True encoding = 'utf-8' language = _('English') + recursions = 1 + extra_css = ''' + body{font-family:Arial,Helvetica,sans-serif; font-size:xx-small;} + .fspheading{color:#AF0E25 ; font-family:"Times New Roman",Times,serif; font-weight:bold ; font-size:large; } + .fspauthor{color:#AF0E25; font-family:Arial,Helvetica,sans-serif; font-size:xx-small;} + .fspintro{color:#666666; font-family:Arial,Helvetica,sans-serif; font-size:xx-small;} + .fspchannelhome{font-family:Arial,Helvetica,sans-serif; font-size:x-small;} + .fspphotocredit{color:##999999; font-family:Arial,Helvetica,sans-serif; font-size:xx-small;} + ''' keep_only_tags = [ - dict(name='div', attrs={'id':["ctl00_cphpagemiddle_reparticle_ctl00_divfullstorytext","ctl00_cphpagemiddle_reparticle_ctl00_divartpic","ctl00_cphpagemiddle_reparticle_ctl00_divfspheading", "ctl00_cphpagemiddle_reparticle_ctl00_divartpiccaption", "ctl00_cphpagemiddle_reparticle_ctl00_divartpiccredit","ctl00_cphpagemiddle_reparticle_ctl00_divfspintro", "ctl00_cphpagemiddle_reparticle_ctl00_divartbyline", ]}), + dict(name='div', attrs={'id':["ctl00_cphpagemiddle_reparticle_ctl00_divfullstorytext","ctl00_cphpagemiddle_reparticle_ctl00_divartpic","ctl00_cphpagemiddle_reparticle_ctl00_divfspheading", "ctl00_cphpagemiddle_reparticle_ctl00_divartpiccaption", "ctl00_cphpagemiddle_reparticle_ctl00_divartpiccredit","ctl00_cphpagemiddle_reparticle_ctl00_divfspintro", "ctl00_cphpagemiddle_reparticle_ctl00_divartbyline", "ctl00_cphpagemiddle_divglitteratiregulars","ctl00_cphpagemiddle_divcartoon","feedbackslatestfirst","ctl00_cphpagemiddle_divregulars","ctl00_cphpagemiddle_divquotes"]}), ] - remove_tags = [dict(name=['script','object'])] + remove_tags = [dict(name=['script','object','hr']),] def get_browser(self): br = BasicNewsRecipe.get_browser(self) @@ -27,12 +38,11 @@ class OutlookIndia(BasicNewsRecipe): br.set_cookiejar(None) return br - def parse_index(self): soup = self.index_to_soup('http://www.outlookindia.com/issues.aspx') -# find cover pic + # find cover pic div = soup.find('div', attrs={'class':re.compile('cententcellpadding')}) if div is None: return None @@ -47,8 +57,8 @@ class OutlookIndia(BasicNewsRecipe): self.cover_url = cover['src'] - # end find cover pic - + # end find cover pic + #find current issue div = soup.find('table', attrs={'id':re.compile('ctl00_cphpagemiddle_dlissues')}) if div is None: return None @@ -58,25 +68,48 @@ class OutlookIndia(BasicNewsRecipe): href = 'http://www.outlookindia.com/' + a['href'] soup = self.index_to_soup(href) + #find current issue + #find the articles in the current issue articles = [] - for a in soup.findAll('a', attrs={'class':'contentpgsubheadinglink'}): + for a in soup.findAll('a', attrs={'class':['contentpgsubheadinglink',"contentpgtext6",]}): if a and a.has_key('href'): + url = 'http://www.outlookindia.com/' + a['href'] else: url ='' + title = self.tag_to_string(a) + desc = '' date = '' - description = '' articles.append({ 'title':title, 'date':date, 'url':url, - 'description':description + 'description':desc, }) + for a in soup.findAll('a', attrs={'id':["ctl00_cphpageleft_hlglitterati","ctl00_cphpageleft_hlposcape",]}): + + if a and a.has_key('href'): + + url = 'http://www.outlookindia.com/' + a['href'] + else: + url ='' + + title = self.tag_to_string(a) + + desc = '' + date = '' + articles.append({ + 'title':title, + 'date':date, + 'url':url, + 'description':desc, + }) + return [('Current Issue', articles)] @@ -85,11 +118,15 @@ class OutlookIndia(BasicNewsRecipe): del item['style'] return self.adeify_images(soup) + + def postrocess_html(self, soup, first): - for tag in soup.findAll(name=['table', 'tr', 'td','tbody']): - tag.name = 'div' + for item in soup.findAll(align = "left"): + del item['align'] + for tag in soup.findAll(name=['table', 'tr','td','tbody','ul','li','font','span']): + tag.name = 'div' return soup