Completely rewrote Times of India recipe

* Now uses RSS feeds, which is more reliable
* Corrected the problem of unwanted content showing up at the end of articles
* Added more sections
* Added more metadata
2025-07-09 03:04:10 -04:00 · 2014-07-19 00:02:06 +05:30 · 2014-07-19 00:02:06 +05:30 · 607b501d21
commit 607b501d21
parent 1ef1bcd9a9
1 changed files with 67 additions and 53 deletions
--- a/recipes/toi.recipe
+++ b/recipes/toi.recipe
@@ -1,58 +1,72 @@
-# vim:fileencoding=utf-8
+__license__   = 'GPL v3'
 __copyright__ = '2008-2014, Karthik <hashkendistro@gmail.com>'
 '''
 timesofindia.indiatimes.com
 '''
 from calibre.web.feeds.news import BasicNewsRecipe
 from lxml import html
-allowed_sections = {'Top Headlines', 'Opinion', 'Science', 'Education', 'US', 'Pakistan', 'India Business', 'Tech News', 'Cricket', 'Bollywood'}
+class TheEconomicTimes(BasicNewsRecipe):
-
+    title                  = 'The Times of India'
-class TimesOfIndia(BasicNewsRecipe):
+    __author__             = 'Karthik K'
-    title          = u'Times of India Headlines'
+    description            = 'News from the Indian daily Times of India'
-    language       = 'en'
+    publisher              = 'timesofindia.indiatimes.com'
-    description    = 'Headline news from the Indian daily Times of India'
+    category               = 'news, finances, politics, sports, business, entertainment, India'
-    __author__     = 'Kovid Goyal'
+    oldest_article         = 1
-
+    max_articles_per_feed  = 100
-    no_stylesheets = True
+    no_stylesheets         = True
-    no_javascript = True
+    use_embedded_content   = False
-    keep_only_tags = [dict(name='h1'), dict(id=['storydiv', 'contentarea'])]
+    simultaneous_downloads = 1
-    remove_tags = [
+    encoding               = 'utf-8'
-        dict(name='div', attrs={'class':['video_list', 'rightpart', 'clearfix mTop15', 'footer_slider', 'read_more', 'flR', 'hide_new']}),
+    language               = 'en_IN'
-        dict(name='div', attrs={'id':[
+    publication_type       = 'newspaper'
-            'most_pop', 'relartstory', 'slidebox', 'tmpFbokk', 'twittersource',
+    masthead_url           = 'http://timesofindia.indiatimes.com/photo.cms?msid=2419189'
-            'reportAbuseDiv', 'result', 'yahoobuzzsyn', 'fb-root']}),
+    extra_css              = """
-        dict(style='float:right;margin-left:5px;'),
+                                 body{font-family: Arial,Helvetica,sans-serif}
-    ]
+                                 .foto_mg{font-size: 60%; 
-
+                                          font-weight: 700;}
-    def parse_index(self):
+                                 h1{font-size: 150%;}
-        index = 'http://timesofindia.indiatimes.com/home/headlines'
+                                 artdate{font-size: 60%}
-        raw = self.index_to_soup(index, raw=True)
+                                 artag{font-size: 60%}
-        root = html.fromstring(raw)
+                                 div.storycontent{padding-top: 10px}
-
+                             """
-        feeds = []
+    conversion_options     = {'comment'          : description, 
-        current_section = None
+                              'tags'             : category,
-        current_articles = []
+                              'publisher'        : publisher,
-
+                              'language'         : language
-        toc = root.xpath('//div[@align="center"]/descendant::table[@class="cnt"]')[0]
+                             }
-
+    remove_tags_before     = dict(name='h1')
-        for x in toc.xpath('descendant::*[name()="h3" or (name()="ul" and @class="content")]'):
+    remove_tags_after      = dict(name='div', attrs={'class':'storycontent'})
-            if x.tag == 'h3':
+    remove_attributes      = ['xmlns']
-                if current_articles and current_section in allowed_sections:
+    feeds                  = [('Recent Stories', 'http://timesofindia.indiatimes.com/rssfeeds/1221656.cms'),
-                    feeds.append((current_section, current_articles))
+                              ('India', 'http://timesofindia.indiatimes.com/rssfeeds/-2128936835.cms'),
-                current_section = html.tostring(x, method='text', encoding=unicode).strip()
+                              ('World', 'http://timesofindia.indiatimes.com/rssfeeds/296589292.cms'),
-                current_articles = []
+                              ('Business', 'http://timesofindia.indiatimes.com/rssfeeds/1898055.cms'),
-                self.log(current_section)
+                              ('Cricket', 'http://timesofindia.indiatimes.com/rssfeeds/4719161.cms'),
-            else:
+                              ('Sports', 'http://timesofindia.indiatimes.com/rssfeeds/4719148.cms'),
-                for a in x.xpath('descendant::li/descendant::a[@href]'):
+                              ('Tech', 'http://timesofindia.indiatimes.com/rssfeeds/5880659.cms'),
-                    title = html.tostring(a, method='text', encoding=unicode).strip()
+                              ('Education', 'http://timesofindia.indiatimes.com/rssfeeds/913168846.cms'),
-                    url = a.get('href')
+                              ('Science', 'http://timesofindia.indiatimes.com/rssfeeds/-2128672765.cms'),
-                    if url.startswith('/'):
+                              ('Opinion', 'http://timesofindia.indiatimes.com/rssfeeds/784865811.cms'),
-                        url = 'http://timesofindia.indiatimes.com' + url
+                              ('Entertainment', 'http://timesofindia.indiatimes.com/rssfeeds/1081479906.cms')]
                    self.log('  ', title)
                    current_articles.append({'title':title, 'url':url})
                self.log('')
        if current_articles and current_section in allowed_sections:
            feeds.append((current_section, current_articles))
        return feeds
    #Uses the mobile print version. For web print version use 'http://timesofindia.indiatimes.com/articleshow/<article_id>?prtpage=1'
    def print_version(self, url):
        """Map an article URL to its mobile print-version URL.

        Everything that follows the last '/articleshow/' in ``url`` is
        appended to the mobile site's article prefix.

        If ``url`` does not contain '/articleshow/', it is returned
        unchanged: rpartition() would otherwise hand back the whole URL as
        the "article id" and the result would be a malformed address.
        """
        _, sep, article_id = url.rpartition('/articleshow/')
        if not sep:
            # Separator absent: nothing to rewrite.
            return url
        return 'http://m.timesofindia.com/PDAET/articleshow/' + article_id
    def get_article_url(self, article):
        """Return the URL to download for a feed entry, or None to skip it.

        The entry's 'guid' is used as the URL, falling back to 'link' when
        the guid is missing or empty. Entries with neither are skipped
        (None is returned) instead of raising AttributeError on a None guid.
        URLs containing '/quickieslist/' or '/quickiearticleshow/' are also
        skipped.
        """
        rurl = article.get('guid', None) or article.get('link', None)
        if not rurl:
            return None
        if '/quickieslist/' in rurl or '/quickiearticleshow/' in rurl:
            return None
        return rurl
    def preprocess_html(self, soup):
        """Remove the ``style`` attribute from every tag that carries one.

        The tags are looked up with ``soup.findAll(style=True)``; the same
        ``soup`` object is returned after the attributes are deleted.
        """
        styled_tags = soup.findAll(style=True)
        for tag in styled_tags:
            del tag['style']
        return soup
    def postprocess_html(self, soup, first_fetch):
        """Pass ``soup`` through ``self.adeify_images`` and return the result.

        ``first_fetch`` is accepted but not used here.
        """
        processed = self.adeify_images(soup)
        return processed