Update The Hindu

2025-09-29 15:31:08 -04:00 · 2013-03-25 15:25:29 +05:30 · 2013-03-25 15:25:29 +05:30 · c2bc85a3ad
commit c2bc85a3ad
parent 4e655fa8cc
1 changed file with 28 additions and 31 deletions
--- a/recipes/hindu.recipe
+++ b/recipes/hindu.recipe
@@ -2,7 +2,6 @@ from __future__ import with_statement
 __license__ = 'GPL 3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'

-import time
 from calibre.web.feeds.news import BasicNewsRecipe

 class TheHindu(BasicNewsRecipe):
@@ -14,44 +13,42 @@ class TheHindu(BasicNewsRecipe):
    max_articles_per_feed = 100
    no_stylesheets = True

-    keep_only_tags = [dict(id='content')]
-    remove_tags = [dict(attrs={'class':['article-links', 'breadcr']}),
-            dict(id=['email-section', 'right-column', 'printfooter', 'topover',
-                     'slidebox', 'th_footer'])]
+    auto_cleanup = True
+

    extra_css = '.photo-caption { font-size: smaller }'

-    def preprocess_raw_html(self, raw, url):
-        return raw.replace('<body><p>', '<p>').replace('</p></body>', '</p>')
-
-    def postprocess_html(self, soup, first_fetch):
-        for t in soup.findAll(['table', 'tr', 'td','center']):
-            t.name = 'div'
-        return soup
-
    def parse_index(self):
-        today = time.strftime('%Y-%m-%d')
-        soup = self.index_to_soup(
-                'http://www.thehindu.com/todays-paper/tp-index/?date=' + today)
-        div = soup.find(id='left-column')
-        feeds = []
+        """Return [(section, articles)] parsed from the today's-paper index."""
+        soup = self.index_to_soup('http://www.thehindu.com/todays-paper/')
+        div = soup.find('div', attrs={'id':'left-column'})
+        subnav = soup.find(id='subnav-tpbar')
+        if subnav is not None:
+            subnav.extract()
        current_section = None
        current_articles = []
-        for x in div.findAll(['h3', 'div']):
-            if current_section and x.get('class', '') == 'tpaper':
-                a = x.find('a', href=True)
-                if a is not None:
-                    title = self.tag_to_string(a)
-                    self.log('\tFound article:', title)
-                    current_articles.append({'url':a['href']+'?css=print',
-                        'title':title, 'date': '',
-                        'description':''})
-            if x.name == 'h3':
-                if current_section and current_articles:
+        feeds = []
+        for x in div.findAll(['a', 'span']):
+            if x.name == 'span' and x.get('class', '') == 's-link':
+                # Section heading found
+                if current_articles and current_section:
                    feeds.append((current_section, current_articles))
                current_section = self.tag_to_string(x)
-                self.log('Found section:', current_section)
                current_articles = []
+                self.log('\tFound section:', current_section)
+            elif x.name == 'a':
+                title = self.tag_to_string(x)
+                url = x.get('href', False)
+                if not url or not title:
+                    continue
+                self.log('\t\tFound article:', title)
+                self.log('\t\t\t', url)
+                current_articles.append({'title': title, 'url':url,
+                    'description':'', 'date':''})
+
+        # Flush the final section; no later heading triggers its append
+        if current_articles and current_section:
+            feeds.append((current_section, current_articles))
+
        return feeds

-