import re

from calibre.web.feeds.news import BasicNewsRecipe

# Template for the article URL this recipe hands back; ``%s`` receives the
# numeric article id.
# NOTE(review): ``prtpage=1`` presumably selects the print layout -- confirm.
_ARTICLE_URL = 'http://timesofindia.indiatimes.com/articleshow/%s.cms?prtpage=1'


class TimesOfIndia(BasicNewsRecipe):
    """News recipe for the Times of India RSS feeds.

    Declares the feed list and the tag filters applied to article pages,
    and rewrites feed links into ``articleshow/<id>.cms?prtpage=1`` URLs.
    """

    title = u'Times of India'
    language = 'en_IN'
    __author__ = 'Kovid Goyal'
    oldest_article = 1  # days
    max_articles_per_feed = 25

    no_stylesheets = True
    keep_only_tags = [{'class': ['maintable12', 'prttabl']}]
    remove_tags = [
        # Matches any tag whose inline style mentions 'float'.
        dict(style=lambda x: x and 'float' in x),
        {'class': ['prvnxtbg', 'footbdrin', 'bcclftr']},
    ]

    feeds = [
        ('Top Stories',
         'http://timesofindia.indiatimes.com/rssfeedstopstories.cms'),
        ('India',
         'http://timesofindia.indiatimes.com/rssfeeds/-2128936835.cms'),
        ('World',
         'http://timesofindia.indiatimes.com/rssfeeds/296589292.cms'),
        ('Mumbai',
         'http://timesofindia.indiatimes.com/rssfeeds/-2128838597.cms'),
        ('Entertainment',
         'http://timesofindia.indiatimes.com/rssfeeds/1081479906.cms'),
        ('Cricket',
         'http://timesofindia.indiatimes.com/rssfeeds/4719161.cms'),
        ('Sunday TOI',
         'http://timesofindia.indiatimes.com/rssfeeds/1945062111.cms'),
        ('Life and Style',
         'http://timesofindia.indiatimes.com/rssfeeds/2886704.cms'),
        ('Business',
         'http://timesofindia.indiatimes.com/rssfeeds/1898055.cms'),
        ('Mad Mad World',
         'http://timesofindia.indiatimes.com/rssfeeds/2178430.cms'),
        ('Most Read',
         'http://timesofindia.indiatimes.com/rssfeedmostread.cms'),
    ]

    def get_article_url(self, article):
        """Return the URL to download for *article*.

        The link reported by ``BasicNewsRecipe.get_article_url`` is handled
        in one of two ways:

        * If it contains ``/0Ltimesofindia`` the text after ``/0L`` is
          decoded (``0B`` -> ``.``, ``0N`` -> ``.com``, ``0C`` -> ``/``,
          ``0E`` -> ``-``), the last path segment is dropped and the
          article id is taken from the ``<id>.cms`` component, keeping
          only its digits.
        * Otherwise the id is taken from a ``/<digits>.cms`` component.

        When an id is found the ``articleshow/<id>.cms?prtpage=1`` URL is
        returned; in every other case the (possibly decoded) link is
        returned as is. An empty or missing link is returned untouched.
        """
        url = BasicNewsRecipe.get_article_url(self, article)
        if not url:
            # Nothing to rewrite; also avoids a TypeError from running the
            # substring test below against None.
            return url

        if '/0Ltimesofindia' in url:
            encoded = url.partition('/0L')[-1]
            # Replacement order matters: keep it exactly as listed.
            decoded = (encoded.replace('0B', '.')
                       .replace('0N', '.com')
                       .replace('0C', '/')
                       .replace('0E', '-'))
            url = 'http://' + decoded.rpartition('/')[0]
            match = re.search(r'/([0-9a-zA-Z]+?)\.cms', url)
            if match is not None:
                # The captured component may contain letters; only the
                # digits form the article id.
                article_id = re.sub(r'[^0-9]', '', match.group(1))
                if article_id:
                    return _ARTICLE_URL % article_id
                # No digits at all: fall through and return the decoded
                # link rather than building '.../articleshow/.cms'.
        else:
            match = re.search(r'/(\d+)\.cms', url)
            if match is not None:
                return _ARTICLE_URL % match.group(1)

        return url

    def preprocess_html(self, soup):
        """Return *soup* unchanged; no extra HTML preprocessing is done."""
        return soup