Update The Hindu

2025-09-29 15:31:08 -04:00 · 2013-03-25 15:25:29 +05:30 · 2013-03-25 15:25:29 +05:30 · c2bc85a3ad
commit c2bc85a3ad
parent 4e655fa8cc
1 changed file with 28 additions and 31 deletions
--- a/recipes/hindu.recipe
+++ b/recipes/hindu.recipe
@ -2,7 +2,6 @@ from __future__ import with_statement
 __license__ = 'GPL 3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 import time
 from calibre.web.feeds.news import BasicNewsRecipe
 class TheHindu(BasicNewsRecipe):
@ -14,44 +13,42 @@ class TheHindu(BasicNewsRecipe):
    max_articles_per_feed = 100
    no_stylesheets = True
-    keep_only_tags = [dict(id='content')]
+    auto_cleanup = True
-    remove_tags = [dict(attrs={'class':['article-links', 'breadcr']}),
+
            dict(id=['email-section', 'right-column', 'printfooter', 'topover',
                     'slidebox', 'th_footer'])]
    extra_css = '.photo-caption { font-size: smaller }'
    def preprocess_raw_html(self, raw, url):
        """Unwrap paragraphs that are directly enclosed by ``<body>`` tags.

        Every ``<body><p>`` in *raw* becomes ``<p>`` and every
        ``</p></body>`` becomes ``</p>``; all other markup is returned
        untouched. *url* is accepted but not consulted.
        """
        # Drop the opening wrapper first, then the matching closing wrapper.
        without_opening = raw.replace('<body><p>', '<p>')
        return without_opening.replace('</p></body>', '</p>')
    def postprocess_html(self, soup, first_fetch):
        """Rename table-layout tags in *soup* to ``div`` and return *soup*.

        Each tag that ``soup.findAll`` yields for ``table``, ``tr``, ``td``
        and ``center`` has its ``name`` set to ``'div'``. The tree is
        modified in place. *first_fetch* is accepted but not consulted.
        """
        layout_tag_names = ['table', 'tr', 'td', 'center']
        layout_tags = soup.findAll(layout_tag_names)
        for tag in layout_tags:
            tag.name = 'div'
        return soup
    def parse_index(self):
-        today = time.strftime('%Y-%m-%d')
+        soup = self.index_to_soup('http://www.thehindu.com/todays-paper/')
-        soup = self.index_to_soup(
+        div = soup.find('div', attrs={'id':'left-column'})
-                'http://www.thehindu.com/todays-paper/tp-index/?date=' + today)
+        soup.find(id='subnav-tpbar').extract()
-        div = soup.find(id='left-column')
+
-        feeds = []
+
        current_section = None
        current_articles = []
-        for x in div.findAll(['h3', 'div']):
+        feeds = []
-            if current_section and x.get('class', '') == 'tpaper':
+        for x in div.findAll(['a', 'span']):
-                a = x.find('a', href=True)
+            if x.name == 'span' and x['class'] == 's-link':
-                if a is not None:
+                # Section heading found
-                    title = self.tag_to_string(a)
+                if current_articles and current_section:
                    self.log('\tFound article:', title)
                    current_articles.append({'url':a['href']+'?css=print',
                        'title':title, 'date': '',
                        'description':''})
            if x.name == 'h3':
                if current_section and current_articles:
                    feeds.append((current_section, current_articles))
                current_section = self.tag_to_string(x)
                self.log('Found section:', current_section)
                current_articles = []
                self.log('\tFound section:', current_section)
            elif x.name == 'a':
                        title = self.tag_to_string(x)
                        url = x.get('href', False)
                        if not url or not title:
                            continue
                        self.log('\t\tFound article:', title)
                        self.log('\t\t\t', url)
                        current_articles.append({'title': title, 'url':url,
                            'description':'', 'date':''})
        if current_articles and current_section:
             feeds.append((current_section, current_articles))
        return feeds