From 3d455a02eadf71f9d57ed2c451c35d6033bb9ef9 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Mon, 16 Feb 2009 13:48:58 -0800
Subject: [PATCH] New recipe for The Hindu by Kovid Goyal

---
 src/calibre/web/feeds/news.py                 |  2 +-
 src/calibre/web/feeds/recipes/__init__.py     |  1 +
 src/calibre/web/feeds/recipes/recipe_hindu.py | 55 +++++++++++++++++++
 3 files changed, 57 insertions(+), 1 deletion(-)
 create mode 100644 src/calibre/web/feeds/recipes/recipe_hindu.py

diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py
index edcf315d62..4773d551c3 100644
--- a/src/calibre/web/feeds/news.py
+++ b/src/calibre/web/feeds/news.py
@@ -190,7 +190,7 @@ class BasicNewsRecipe(object, LoggingInterface):
     #: For the format for specifying a tag see :attr:`BasicNewsRecipe.remove_tags`.
     #: For example::
     #:
-    #:     remove_tags_before = [dict(id='content')]
+    #:     remove_tags_before = dict(id='content')
     #:
     #: will remove all
     #: tags before the first element with `id="content"`.
diff --git a/src/calibre/web/feeds/recipes/__init__.py b/src/calibre/web/feeds/recipes/__init__.py
index dd2b81c3a8..3723483fb7 100644
--- a/src/calibre/web/feeds/recipes/__init__.py
+++ b/src/calibre/web/feeds/recipes/__init__.py
@@ -29,6 +29,7 @@ recipe_modules = ['recipe_' + r for r in (
           'jb_online', 'estadao', 'o_globo', 'vijesti', 'elmundo', 'the_oz',
           'honoluluadvertiser', 'starbulletin', 'exiled', 'indy_star', 'dna',
           'pobjeda', 'chicago_breaking_news', 'glasgow_herald', 'linuxdevices',
+          'hindu'
           )]
 
 import re, imp, inspect, time, os
diff --git a/src/calibre/web/feeds/recipes/recipe_hindu.py b/src/calibre/web/feeds/recipes/recipe_hindu.py
new file mode 100644
index 0000000000..073eb7afd1
--- /dev/null
+++ b/src/calibre/web/feeds/recipes/recipe_hindu.py
@@ -0,0 +1,55 @@
+from __future__ import with_statement
+__license__ = 'GPL 3'
+__copyright__ = '2009, Kovid Goyal '
+
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class TheHindu(BasicNewsRecipe):
+    '''Download recipe for The Hindu, built from the RSS feeds in `feeds`.'''
+
+    title = u'The Hindu'
+    language = _('English')
+    oldest_article = 7
+    __author__ = _('Kovid Goyal')
+    max_articles_per_feed = 100
+
+    # Drop everything before the first font element with class storyhead.
+    remove_tags_before = {'name':'font', 'class':'storyhead'}
+    # Replace the end-of-story marker and all that follows with closing tags.
+    # NOTE(review): a bare r'.*' replaced with '' would blank every article;
+    # the marker and closing tags here are reconstructed -- confirm them
+    # against a live story page.
+    preprocess_regexps = [
+        (re.compile(r'<!-- story ends -->.*', re.DOTALL),
+         lambda match: '</body></html>'),
+    ]
+
+    feeds = [
+        (u'Main - Front Page', u'http://www.hindu.com/rss/01hdline.xml'),
+        (u'Main - National', u'http://www.hindu.com/rss/02hdline.xml'),
+        (u'Main - International', u'http://www.hindu.com/rss/03hdline.xml'),
+        (u'Main - Opinion', u'http://www.hindu.com/rss/05hdline.xml'),
+        (u'Main - Business', u'http://www.hindu.com/rss/06hdline.xml'),
+        (u'Main - Sport', u'http://www.hindu.com/rss/07hdline.xml'),
+        (u'Main - Weather / Religion / Crossword / Cartoon',
+         u'http://www.hindu.com/rss/10hdline.xml'),
+        (u'Main - Engagements', u'http://www.hindu.com/rss/26hdline.xml'),
+        (u'Supplement - Literary Review',
+         u'http://www.hindu.com/rss/lrhdline.xml'),
+        (u'Supplement - Sunday Magazine',
+         u'http://www.hindu.com/rss/maghdline.xml'),
+        (u'Supplement - Open Page', u'http://www.hindu.com/rss/ophdline.xml'),
+        (u'Supplement - Business Review',
+         u'http://www.hindu.com/rss/bizhdline.xml'),
+        (u'Supplement - Book Review',
+         u'http://www.hindu.com/rss/brhdline.xml'),
+        (u'Supplement - Science & Technology',
+         u'http://www.hindu.com/rss/setahdline.xml')
+    ]
+
+    def postprocess_html(self, soup, first_fetch):
+        '''Rename every table, tr and td tag in `soup` to div; return `soup`.'''
+        for t in soup.findAll(['table', 'tr', 'td']):
+            t.name = 'div'
+        return soup