Fix #905794 (Many Times of India news articles don't appear)

This commit is contained in:
Kovid Goyal 2011-12-18 09:32:53 +05:30
parent b0e9e8f349
commit 6929163527

View File

@ -1,4 +1,4 @@
import re import re, urllib
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class TimesOfIndia(BasicNewsRecipe): class TimesOfIndia(BasicNewsRecipe):
@ -17,7 +17,9 @@ class TimesOfIndia(BasicNewsRecipe):
] ]
remove_tags = [ remove_tags = [
{'class':re.compile('tabsintbgshow|prvnxtbg')}, {'class':re.compile('tabsintbgshow|prvnxtbg')},
{'id':['fbrecommend', 'relmaindiv']} {'id':['fbrecommend', 'relmaindiv', 'shretxt', 'fbrecos', 'twtdiv',
'gpls', 'auim']},
{'class':['twitter-share-button', 'cmtmn']},
] ]
feeds = [ feeds = [
@ -46,25 +48,27 @@ class TimesOfIndia(BasicNewsRecipe):
] ]
def get_article_url(self, article): def get_article_url(self, article):
# Times of India sometimes serves an ad page instead of the article, try:
# this code, detects and circumvents that s = article.summary
url = BasicNewsRecipe.get_article_url(self, article) return urllib.unquote(
if '/0Ltimesofindia' in url: re.search(r'href=".+?bookmark.cfm.+?link=(.+?)"', s).group(1))
url = url.partition('/0L')[-1] except:
url = url.replace('0B', '.').replace('0N', '.com').replace('0C', pass
'/').replace('0E', '-') link = article.get('link', None)
url = 'http://' + url.rpartition('/')[0] if link and link.split('/')[-1]=="story01.htm":
match = re.search(r'/([0-9a-zA-Z]+?)\.cms', url) link=link.split('/')[-2]
if match is not None: encoding = {'0B': '.', '0C': '/', '0A': '0', '0F': '=', '0G': '&',
num = match.group(1) '0D': '?', '0E': '-', '0N': '.com', '0L': 'http://'}
num = re.sub(r'[^0-9]', '', num) for k, v in encoding.iteritems():
return ('http://timesofindia.indiatimes.com/articleshow/%s.cms?prtpage=1' % link = link.replace(k, v)
num) return link
else:
cms = re.search(r'/(\d+)\.cms', url)
if cms is not None:
return ('http://timesofindia.indiatimes.com/articleshow/%s.cms?prtpage=1' %
cms.group(1))
return url def print_version(self, url):
return url + '?prtpage=1'
def preprocess_html(self, soup, *args):
byl = soup.find(attrs={'class':'byline'})
if byl is not None:
for l in byl.findAll('label'):
l.extract()
return soup