Fix #905794 (Many Times of India news articles don't appear)

2025-07-09 03:04:10 -04:00 · 2011-12-18 09:32:53 +05:30 · 2011-12-18 09:32:53 +05:30 · 6929163527
commit 6929163527
parent b0e9e8f349
1 changed files with 26 additions and 22 deletions
--- a/recipes/toi.recipe
+++ b/recipes/toi.recipe
@ -1,4 +1,4 @@
-import re
+import re, urllib
 from calibre.web.feeds.news import BasicNewsRecipe
 class TimesOfIndia(BasicNewsRecipe):
@ -17,7 +17,9 @@ class TimesOfIndia(BasicNewsRecipe):
            ]
    remove_tags = [
            {'class':re.compile('tabsintbgshow|prvnxtbg')},
-            {'id':['fbrecommend', 'relmaindiv']}
+            {'id':['fbrecommend', 'relmaindiv', 'shretxt', 'fbrecos', 'twtdiv',
                'gpls', 'auim']},
            {'class':['twitter-share-button', 'cmtmn']},
            ]
    feeds          = [
@ -46,25 +48,27 @@ class TimesOfIndia(BasicNewsRecipe):
 ]
    def get_article_url(self, article):
-        # Times of India sometimes serves an ad page instead of the article,
+        try:
-        # this code, detects and circumvents that
+            s = article.summary
-        url = BasicNewsRecipe.get_article_url(self, article)
+            return urllib.unquote(
-        if '/0Ltimesofindia' in url:
+                re.search(r'href=".+?bookmark.cfm.+?link=(.+?)"', s).group(1))
-            url = url.partition('/0L')[-1]
+        except:
-            url = url.replace('0B', '.').replace('0N', '.com').replace('0C',
+            pass
-                    '/').replace('0E', '-')
+        link = article.get('link', None)
-            url = 'http://' + url.rpartition('/')[0]
+        if link and link.split('/')[-1]=="story01.htm":
-            match = re.search(r'/([0-9a-zA-Z]+?)\.cms', url)
+            link=link.split('/')[-2]
-            if match is not None:
+            encoding = {'0B': '.', '0C': '/', '0A': '0', '0F': '=', '0G': '&',
-                num = match.group(1)
+                    '0D': '?', '0E': '-', '0N': '.com', '0L': 'http://'}
-                num = re.sub(r'[^0-9]', '', num)
+            for k, v in encoding.iteritems():
-                return ('http://timesofindia.indiatimes.com/articleshow/%s.cms?prtpage=1' %
+                link = link.replace(k, v)
-                    num)
+            return link
        else:
            cms = re.search(r'/(\d+)\.cms', url)
            if cms is not None:
                return ('http://timesofindia.indiatimes.com/articleshow/%s.cms?prtpage=1' %
                    cms.group(1))
-        return url
+    def print_version(self, url):
        return url + '?prtpage=1'
    def preprocess_html(self, soup, *args):
        byl = soup.find(attrs={'class':'byline'})
        if byl is not None:
            for l in byl.findAll('label'):
                l.extract()
        return soup