import re
import urllib

from calibre.web.feeds.news import BasicNewsRecipe

try:
    from urllib.parse import unquote  # Python 3
except ImportError:
    unquote = urllib.unquote  # Python 2


# Pulls the percent-encoded target URL out of a "bookmark.cfm ... link=" anchor
# found in an article's summary markup.
_BOOKMARK_LINK_RE = re.compile(r'href=".+?bookmark.cfm.+?link=(.+?)"')

# Two-character escape codes used in the path segment that precedes
# "story01.htm", mapped to the text they stand for.
_ESCAPE_CODES = {
    '0A': '0',
    '0B': '.',
    '0C': '/',
    '0D': '?',
    '0E': '-',
    '0F': '=',
    '0G': '&',
    '0L': 'http://',
    '0N': '.com',
}
_ESCAPE_CODE_RE = re.compile(
    '|'.join(re.escape(code) for code in sorted(_ESCAPE_CODES)))


def _decode_escaped_url(segment):
    """Expand every escape code in *segment* in one left-to-right pass.

    A single pass matters: with chained ``str.replace`` calls the '0' produced
    by decoding '0A' can combine with the following character into what looks
    like another code (e.g. '0AB' -> '0B' -> '.'), making the result depend on
    the order in which the codes happen to be applied.
    """
    return _ESCAPE_CODE_RE.sub(
        lambda match: _ESCAPE_CODES[match.group(0)], segment)


class TimesOfIndia(BasicNewsRecipe):
    """calibre news recipe for the Times of India RSS feeds."""

    title = u'Times of India'
    language = 'en_IN'
    __author__ = 'Kovid Goyal'
    oldest_article = 1  # days
    max_articles_per_feed = 25

    no_stylesheets = True
    remove_attributes = ['style']
    keep_only_tags = [
        {'class': re.compile(r'maintable12|prttabl')},
        {'id': [
            'mod-article-header',
            'mod-a-body-after-first-para',
            'mod-a-body-first-para',
        ]},
    ]

    remove_tags = [
        {'class': re.compile('tabsintbgshow|prvnxtbg')},
        {'id': [
            'fbrecommend', 'relmaindiv', 'shretxt', 'fbrecos', 'twtdiv',
            'gpls', 'auim',
        ]},
        {'class': ['twitter-share-button', 'cmtmn']},
    ]

    feeds = [
        ('Top Stories',
         'http://timesofindia.indiatimes.com/rssfeedstopstories.cms'),
        ('India',
         'http://timesofindia.indiatimes.com/rssfeeds/-2128936835.cms'),
        ('World',
         'http://timesofindia.indiatimes.com/rssfeeds/296589292.cms'),
        ('Mumbai',
         'http://timesofindia.indiatimes.com/rssfeeds/-2128838597.cms'),
        ('Entertainment',
         'http://timesofindia.indiatimes.com/rssfeeds/1081479906.cms'),
        ('Cricket',
         'http://timesofindia.indiatimes.com/rssfeeds/4719161.cms'),
        ('Sunday TOI',
         'http://timesofindia.indiatimes.com/rssfeeds/1945062111.cms'),
        ('Life and Style',
         'http://timesofindia.indiatimes.com/rssfeeds/2886704.cms'),
        ('Business',
         'http://timesofindia.indiatimes.com/rssfeeds/1898055.cms'),
        ('Mad Mad World',
         'http://timesofindia.indiatimes.com/rssfeeds/2178430.cms'),
        ('Most Read',
         'http://timesofindia.indiatimes.com/rssfeedmostread.cms'),
    ]

    def get_article_url(self, article):
        """Return the URL to download for *article*.

        Resolution order:

        1. If the article summary contains a ``bookmark.cfm ... link=``
           anchor, return its URL-unquoted ``link`` value.
        2. Otherwise take the article's ``link``; when its last path segment
           is ``story01.htm`` the preceding segment is treated as an escaped
           URL and decoded.
        3. Otherwise return ``link`` unchanged (``None`` when absent).
        """
        # Best effort: a missing or non-string summary simply means we fall
        # back to the plain link below.
        try:
            match = _BOOKMARK_LINK_RE.search(article.summary)
            if match is not None:
                return unquote(match.group(1))
        except (AttributeError, KeyError, TypeError, ValueError):
            pass

        link = article.get('link', None)
        if link:
            segments = link.split('/')
            # Need at least one segment before "story01.htm" to decode.
            if len(segments) > 1 and segments[-1] == 'story01.htm':
                link = _decode_escaped_url(segments[-2])
        return link

    def print_version(self, url):
        """Return *url* with the ``?prtpage=1`` query string appended."""
        return url + '?prtpage=1'

    def preprocess_html(self, soup, *args):
        """Remove every ``<label>`` inside the first ``byline`` element.

        Returns the same *soup* object, modified in place.
        """
        byline = soup.find(attrs={'class': 'byline'})
        if byline is not None:
            for label in byline.findAll('label'):
                label.extract()
        return soup