Fix #905794 (Many Times of India news articles don't appear)

2025-07-09 03:04:10 -04:00 · 2011-12-18 09:32:53 +05:30 · 2011-12-18 09:32:53 +05:30 · 6929163527
commit 6929163527
parent b0e9e8f349
1 changed files with 26 additions and 22 deletions
--- a/recipes/toi.recipe
+++ b/recipes/toi.recipe
@@ -1,4 +1,4 @@
-import re
+import re, urllib
 from calibre.web.feeds.news import BasicNewsRecipe

 class TimesOfIndia(BasicNewsRecipe):
@@ -17,7 +17,9 @@ class TimesOfIndia(BasicNewsRecipe):
            ]
    remove_tags = [
            {'class':re.compile('tabsintbgshow|prvnxtbg')},
-            {'id':['fbrecommend', 'relmaindiv']}
+            {'id':['fbrecommend', 'relmaindiv', 'shretxt', 'fbrecos', 'twtdiv',
+                'gpls', 'auim']},
+            {'class':['twitter-share-button', 'cmtmn']},
            ]

    feeds          = [
@@ -46,25 +48,27 @@ class TimesOfIndia(BasicNewsRecipe):
 ]

    def get_article_url(self, article):
-        # Times of India sometimes serves an ad page instead of the article,
-        # this code, detects and circumvents that
-        url = BasicNewsRecipe.get_article_url(self, article)
-        if '/0Ltimesofindia' in url:
-            url = url.partition('/0L')[-1]
-            url = url.replace('0B', '.').replace('0N', '.com').replace('0C',
-                    '/').replace('0E', '-')
-            url = 'http://' + url.rpartition('/')[0]
-            match = re.search(r'/([0-9a-zA-Z]+?)\.cms', url)
-            if match is not None:
-                num = match.group(1)
-                num = re.sub(r'[^0-9]', '', num)
-                return ('http://timesofindia.indiatimes.com/articleshow/%s.cms?prtpage=1' %
-                    num)
-        else:
-            cms = re.search(r'/(\d+)\.cms', url)
-            if cms is not None:
-                return ('http://timesofindia.indiatimes.com/articleshow/%s.cms?prtpage=1' %
-                    cms.group(1))
+        try:
+            s = article.summary
+            return urllib.unquote(
+                re.search(r'href=".+?bookmark.cfm.+?link=(.+?)"', s).group(1))
+        except:
+            pass
+        link = article.get('link', None)
+        if link and link.split('/')[-1]=="story01.htm":
+            link=link.split('/')[-2]
+            encoding = {'0B': '.', '0C': '/', '0A': '0', '0F': '=', '0G': '&',
+                    '0D': '?', '0E': '-', '0N': '.com', '0L': 'http://'}
+            for k, v in encoding.iteritems():
+                link = link.replace(k, v)
+            return link

-        return url
+    def print_version(self, url):
+        return url + '?prtpage=1'

+    def preprocess_html(self, soup, *args):
+        byl = soup.find(attrs={'class':'byline'})
+        if byl is not None:
+            for l in byl.findAll('label'):
+                l.extract()
+        return soup