Updated recipe for Outlook India

2025-07-09 03:04:10 -04:00 · 2009-08-23 12:03:24 -06:00 · 2009-08-23 12:03:24 -06:00 · 5b361e348b
commit 5b361e348b
parent e9d1b7b72a
2 changed files with 89 additions and 81 deletions
--- a/src/calibre/web/feeds/news.py
+++ b/src/calibre/web/feeds/news.py
@ -679,7 +679,7 @@ class BasicNewsRecipe(Recipe):
            fetcher.browser_lock = fetcher.DUMMY_LOCK
        res, path, failures = fetcher.start_fetch(url), fetcher.downloaded_paths, fetcher.failed_links
        if not res or not os.path.exists(res):
-            raise Exception(_('Could not fetch article. Run with --debug to see the reason'))
+            raise Exception(_('Could not fetch article. Run with -vv to see the reason'))
        return res, path, failures
    def fetch_article(self, url, dir, f, a, num_of_feeds):
--- a/src/calibre/web/feeds/recipes/recipe_outlook_india.py
+++ b/src/calibre/web/feeds/recipes/recipe_outlook_india.py
@ -1,87 +1,95 @@
 #!/usr/bin/env  python
 __license__   = 'GPL v3'
-__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
+__copyright__ = '2009, Kovid Goyal <kovid at kovidgoyal.net>'
 '''
 outlookindia.com
 '''
 from calibre.web.feeds.news import BasicNewsRecipe
 import re
 from calibre.web.feeds.news import BasicNewsRecipe
 class OutlookIndia(BasicNewsRecipe):
    # Recipe metadata.
    title = 'Outlook India'
    __author__  = 'Kovid Goyal'
    description = 'Weekly news magazine focused on India.'
    language = _('English')
    # NOTE(review): presumably tells the fetcher to follow links one level
    # deep, restricted to URLs matching match_regexp (the "&pn=<number>"
    # links of full.asp) -- confirm against BasicNewsRecipe.
    recursions = 1
    match_regexp = r'full.asp.*&pn=\d+'
    # Tag specifications: the spacer image, any <tr> that contains the
    # image/tl.gif image, and any <table> that contains a <font> of class
    # "fontemailfeed".
    remove_tags = [
                   dict(name='img', src="images/space.gif"),
                   dict(name=lambda tag: tag.name == 'tr' and tag.find('img', src="image/tl.gif") is not None ),
                   dict(name=lambda tag: tag.name == 'table' and tag.find('font', attrs={'class':'fontemailfeed'}) is not None),
                   ]
    # (compiled pattern, replacement function) pairs:
    #   1. everything from "<body" up to the "Add Banner ends from here"
    #      comment is collapsed into a bare <body> tag;
    #   2. everything from ">More Stories:" to the end of the text is replaced
    #      by the closing body/html tags;
    #   3. everything from the "Google panel start" comment to the end of the
    #      text is replaced by the closing body/html tags.
    preprocess_regexps = [
                          (re.compile(r'<body.*?<!--Add Banner ends from here-->', re.DOTALL|re.IGNORECASE),
                           lambda match: '<body>'),
                          (re.compile(r'>More Stories:.*', re.DOTALL), 
                           lambda match: '></body></html>'),
                          (re.compile(r'<!-- Google panel start -->.*', re.DOTALL),
                           lambda match: '</body></html>'), 
                          ]
    def parse_index(self):
        '''
        Build the list of feeds from the archive contents page.

        The tables of the page are walked in document order. A table holding
        a cell with the ``images/content_band1.jpg`` background supplies a
        section title; the next table visited supplies that section's
        articles. The first title containing 'Cover Story' is discarded.

        Returns a list of ``(section title, articles)`` tuples, where each
        article is a dict with the keys title, description, content, url
        and date.
        '''
        soup = self.index_to_soup('http://www.outlookindia.com/archivecontents.asp')
        feeds = []
        section = None
        skip_cover_story = True

        for table in soup.findAll('table'):
            if section is not None:
                # A section title is pending: this table lists its articles.
                entries = []
                for link in table.findAll('a', href=True):
                    # Links that wrap an image are not article links.
                    if link.find('img') is not None:
                        continue
                    link_title = self.tag_to_string(link, use_alt=False)
                    intro = link.findNextSibling('font', attrs={'class':'fontintro'})
                    summary = None
                    if intro is not None:
                        summary = self.tag_to_string(intro)
                    entries.append({
                        'title': link_title,
                        'description': summary or '',
                        'content': '',
                        'url': 'http://www.outlookindia.com/' + link['href'],
                        'date': '',
                    })
                feeds.append((section, entries))
                section = None
                continue

            # No title pending: check whether this table carries one.
            heading = table.find('td', background="images/content_band1.jpg")
            if heading is None:
                continue
            section = self.tag_to_string(heading, False)
            section = section.replace(u'\xa0', u'').strip()
            if skip_cover_story and 'Cover Story' in section:
                # Only the first 'Cover Story' heading is ignored.
                skip_cover_story = False
                section = None

        return feeds
-    def postprocess_html(self, soup, first_fetch):
+    title          = 'Outlook India'
-        bad = []
+    __author__     = 'Kovid Goyal and Sujata Raman'
-        for table in soup.findAll('table'):
+    description    = 'Weekly news and current affairs in India'
-            if table.find(text=re.compile(r'\(\d+ of \d+\)')):
+    no_stylesheets = True
-                bad.append(table)
+    encoding       = 'utf-8'
-        for b in bad:
+    language = _('English')
-            b.extract()
+    keep_only_tags = [
-        soup = soup.findAll('html')[0]
+                      dict(name='div', attrs={'id':["ctl00_cphpagemiddle_reparticle_ctl00_divfullstorytext","ctl00_cphpagemiddle_reparticle_ctl00_divartpic","ctl00_cphpagemiddle_reparticle_ctl00_divfspheading", "ctl00_cphpagemiddle_reparticle_ctl00_divartpiccaption",  "ctl00_cphpagemiddle_reparticle_ctl00_divartpiccredit","ctl00_cphpagemiddle_reparticle_ctl00_divfspintro", "ctl00_cphpagemiddle_reparticle_ctl00_divartbyline", ]}),
-        for t in soup.findAll(['table', 'tr', 'td']):
+                           ]
-            t.name = 'div' 
+    remove_tags = [dict(name=['script','object'])]
-        return soup
+
-    
+    def get_browser(self):
        br = BasicNewsRecipe.get_browser(self)
        # This site sends article titles in the cookie which occasionally
        # contain non ascii characters causing httplib to fail. Instead just
        # disable cookies as they're not needed for download. Proper solution
        # would be to implement a unicode aware cookie jar
        br.set_cookiejar(None)
        return br
    def parse_index(self):
        '''
        Build the feed list for the current issue.

        Starting from the issues page, the first link inside the
        ``cententcellpadding`` div is followed to look up the cover image
        (stored in ``self.cover_url`` when found). The first link inside the
        ``ctl00_cphpagemiddle_dlissues`` table of the resulting page is then
        followed, and every ``contentpgsubheadinglink`` anchor with an
        ``href`` on that page becomes one article.

        Returns ``[('Current Issue', articles)]`` where each article is a
        dict with the keys title, date, url and description, or ``None``
        when an expected container or issue link is missing.
        '''
        base = 'http://www.outlookindia.com/'
        soup = self.index_to_soup(base + 'issues.aspx')

        # find cover pic
        div = soup.find('div', attrs={'class':re.compile('cententcellpadding')})
        if div is None:
            return None
        a = div.find('a', href=True)
        if a is not None:
            # Only follow the link when one exists. Previously a missing link
            # left ``href`` unbound and the next fetch raised
            # UnboundLocalError.
            soup = self.index_to_soup(base + a['href'])
            cover = soup.find('img', attrs={'id':"ctl00_cphpagemiddle_dlissues_ctl00_imgcoverpic"}, src=True)
            if cover is not None:
                self.cover_url = cover['src']
        # end find cover pic

        table = soup.find('table', attrs={'id':re.compile('ctl00_cphpagemiddle_dlissues')})
        if table is None:
            return None
        a = table.find('a', href=True)
        if a is None:
            # Without an issue link there is nothing to index; do not fall
            # back to re-fetching a stale ``href`` from the cover lookup.
            return None
        soup = self.index_to_soup(base + a['href'])

        articles = []
        # href=True skips anchors without a target, which would otherwise be
        # added with an empty URL that can never be downloaded.
        for a in soup.findAll('a', attrs={'class':'contentpgsubheadinglink'}, href=True):
            articles.append({
                'title': self.tag_to_string(a),
                'date': '',
                'url': base + a['href'],
                'description': '',
            })
        return [('Current Issue', articles)]
    def preprocess_html(self, soup):
        '''
        Remove the ``style`` attribute from every tag in ``soup`` that has
        one, then return the result of ``self.adeify_images(soup)``.
        '''
        styled_tags = soup.findAll(style=True)
        for tag in styled_tags:
            del tag['style']
        processed = self.adeify_images(soup)
        return processed
    def postprocess_html(self, soup, first):
        '''
        Rename every table, tr, td and tbody tag in ``soup`` to div and
        return the modified soup.

        This method was originally named ``postrocess_html``, so it did not
        override the ``postprocess_html`` hook and the conversion was not
        applied through that hook. ``first`` is accepted for the hook
        signature and is not used.
        '''
        for tag in soup.findAll(name=['table', 'tr', 'td','tbody']):
            tag.name = 'div'
        return soup

    # Backward-compatible alias for the original, misspelled method name.
    postrocess_html = postprocess_html