From 1c278e59c008ee4f7340b0b75497b11bf1926f72 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 27 Jun 2010 11:29:29 -0600 Subject: [PATCH] Fix #5969 (Hindu newspaper download problem) --- resources/recipes/hindu.recipe | 66 ++++++++++++++++------------------ 1 file changed, 31 insertions(+), 35 deletions(-) diff --git a/resources/recipes/hindu.recipe b/resources/recipes/hindu.recipe index 6c0d42660b..cc5305eb77 100644 --- a/resources/recipes/hindu.recipe +++ b/resources/recipes/hindu.recipe @@ -2,7 +2,7 @@ from __future__ import with_statement __license__ = 'GPL 3' __copyright__ = '2009, Kovid Goyal ' -import re +import time from calibre.web.feeds.news import BasicNewsRecipe class TheHindu(BasicNewsRecipe): @@ -10,45 +10,41 @@ class TheHindu(BasicNewsRecipe): language = 'en_IN' oldest_article = 7 - __author__ = 'Kovid Goyal and Sujata Raman' + __author__ = 'Kovid Goyal' max_articles_per_feed = 100 no_stylesheets = True - remove_tags_before = {'name':'font', 'class':'storyhead'} - preprocess_regexps = [ - (re.compile(r'.*', re.DOTALL), - lambda match: ''), - ] - extra_css = ''' - .storyhead{font-family:Arial,Helvetica,sans-serif; font-size:large; color:#000099;} - body{font-family:Verdana,Arial,Helvetica,sans-serif; font-size:x-small; text-align:left;} - ''' - feeds = [ - (u'Main - Front Page', u'http://www.hindu.com/rss/01hdline.xml'), - (u'Main - National', u'http://www.hindu.com/rss/02hdline.xml'), - (u'Main - International', u'http://www.hindu.com/rss/03hdline.xml'), - (u'Main - Opinion', u'http://www.hindu.com/rss/05hdline.xml'), - (u'Main - Business', u'http://www.hindu.com/rss/06hdline.xml'), - (u'Main - Sport', u'http://www.hindu.com/rss/07hdline.xml'), - (u'Main - Weather / Religion / Crossword / Cartoon', - u'http://www.hindu.com/rss/10hdline.xml'), - (u'Main - Engagements', u'http://www.hindu.com/rss/26hdline.xml'), - (u'Supplement - Literary Review', - u'http://www.hindu.com/rss/lrhdline.xml'), - (u'Supplement - Sunday Magazine', - u'http://www.hindu.com/rss/maghdline.xml'), - (u'Supplement - Open Page', u'http://www.hindu.com/rss/ophdline.xml'), - (u'Supplement - Business Review', - u'http://www.hindu.com/rss/bizhdline.xml'), - (u'Supplement - Book Review', - u'http://www.hindu.com/rss/brhdline.xml'), - (u'Supplement - Science & Technology', - u'http://www.hindu.com/rss/setahdline.xml') - ] + keep_only_tags = [dict(id='content')] + remove_tags = [dict(attrs={'class':['article-links', 'breadcr']}), + dict(id=['email-section', 'right-column', 'printfooter'])] + + extra_css = '.photo-caption { font-size: smaller }' def postprocess_html(self, soup, first_fetch): for t in soup.findAll(['table', 'tr', 'td','center']): t.name = 'div' - - return soup + + def parse_index(self): + today = time.strftime('%Y-%m-%d') + soup = self.index_to_soup( + 'http://www.thehindu.com/todays-paper/tp-index/?date=' + today) + div = soup.find(id='left-column') + feeds = [] + current_section = None + current_articles = [] + for x in div.findAll(['h3', 'div']): + if current_section and x.get('class', '') == 'tpaper': + a = x.find('a', href=True) + if a is not None: + current_articles.append({'url':a['href']+'?css=print', + 'title':self.tag_to_string(a), 'date': '', + 'description':''}) + if x.name == 'h3': + if current_section and current_articles: + feeds.append((current_section, current_articles)) + current_section = self.tag_to_string(x) + current_articles = [] + return feeds + +