...

2025-07-08 02:34:06 -04:00 · 2014-06-07 20:36:00 +05:30 · 2014-06-07 20:36:00 +05:30 · 70ec3dba42
commit 70ec3dba42
parent bb77b5a27e
1 changed files with 13 additions and 44 deletions
--- a/recipes/outlook_india.recipe
+++ b/recipes/outlook_india.recipe
@@ -39,7 +39,9 @@ class OutlookIndia(BasicNewsRecipe):
                              "ctl00_cphpagemiddle_divregulars",
                              "ctl00_cphpagemiddle_divquotes"]}),
                    ]
-    remove_tags = [dict(name=['script','object','hr']),]
+    remove_tags = [
+        dict(name=['script','object','hr']),
+    ]

    def get_browser(self):
        br = BasicNewsRecipe.get_browser(self)
@@ -51,7 +53,6 @@ class OutlookIndia(BasicNewsRecipe):
        return br

    def parse_index(self):
-
        soup = self.index_to_soup('http://www.outlookindia.com/issues.aspx')
        # find cover pic
        cover = soup.find('img', attrs={'id':"ctl00_cphpagemiddle_dlissues_ctl00_imgcoverpic"}, src=True)
@@ -62,51 +63,19 @@ class OutlookIndia(BasicNewsRecipe):
        # end find cover pic
        # find current issue
        div = soup.find('div', attrs={'class':re.compile('cententcellpadding')})
-
-        if div is None:
-            return None
-        href = div.find('a')['href']
+        soup = self.index_to_soup(div.find('a')['href'])
        # end find current issue
-        soup = self.index_to_soup(href)

        articles = []
-
-        for a in soup.findAll('a', attrs={'class':['contentpgsubheadinglink',"contentpgtext",]}):
-            if a and 'href' in a:
-
+        for a in soup.findAll('a', href=True, attrs={'class':['contentpgsubheadinglink',"contentpgtext",]}) + \
+                soup.findAll('a', href=True, attrs={'id':["ctl00_cphpageleft_hlglitterati","ctl00_cphpageleft_hlposcape",]}):
+            if a['href'].startswith('http:'):
+                continue  # Clarification type article, has different markup
            url = 'http://www.outlookindia.com/' + a['href']
-            else:
-                url =''
-
            title = self.tag_to_string(a)
-
-            desc = ''
-            date = ''
-            articles.append({
-                                 'title':title,
-                                 'date':date,
-                                 'url':url,
-                                 'description':desc,
-                                })
-        for a in soup.findAll('a', attrs={'id':["ctl00_cphpageleft_hlglitterati","ctl00_cphpageleft_hlposcape",]}):
-
-            if a and 'href' in a:
-
-                url = 'http://www.outlookindia.com/' + a['href']
-            else:
-                url =''
-
-            title = self.tag_to_string(a)
-
-            desc = ''
-            date = ''
-            articles.append({
-                                 'title':title,
-                                 'date':date,
-                                 'url':url,
-                                 'description':desc,
-                                })
-
+            self.log('Found article:', title, 'at', url)
+            articles.append({'title':title, 'date':'', 'url':url, 'description':''})
+        if articles:
            return [('Current Issue', articles)]

    def preprocess_html(self, soup):