mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Fix #905794 (Many Times of India news articles don't appear)
This commit is contained in:
parent
b0e9e8f349
commit
6929163527
@ -1,4 +1,4 @@
|
||||
import re
|
||||
import re, urllib
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
class TimesOfIndia(BasicNewsRecipe):
|
||||
@ -17,7 +17,9 @@ class TimesOfIndia(BasicNewsRecipe):
|
||||
]
|
||||
remove_tags = [
|
||||
{'class':re.compile('tabsintbgshow|prvnxtbg')},
|
||||
{'id':['fbrecommend', 'relmaindiv']}
|
||||
{'id':['fbrecommend', 'relmaindiv', 'shretxt', 'fbrecos', 'twtdiv',
|
||||
'gpls', 'auim']},
|
||||
{'class':['twitter-share-button', 'cmtmn']},
|
||||
]
|
||||
|
||||
feeds = [
|
||||
@ -46,25 +48,27 @@ class TimesOfIndia(BasicNewsRecipe):
|
||||
]
|
||||
|
||||
def get_article_url(self, article):
|
||||
# Times of India sometimes serves an ad page instead of the article,
|
||||
# this code, detects and circumvents that
|
||||
url = BasicNewsRecipe.get_article_url(self, article)
|
||||
if '/0Ltimesofindia' in url:
|
||||
url = url.partition('/0L')[-1]
|
||||
url = url.replace('0B', '.').replace('0N', '.com').replace('0C',
|
||||
'/').replace('0E', '-')
|
||||
url = 'http://' + url.rpartition('/')[0]
|
||||
match = re.search(r'/([0-9a-zA-Z]+?)\.cms', url)
|
||||
if match is not None:
|
||||
num = match.group(1)
|
||||
num = re.sub(r'[^0-9]', '', num)
|
||||
return ('http://timesofindia.indiatimes.com/articleshow/%s.cms?prtpage=1' %
|
||||
num)
|
||||
else:
|
||||
cms = re.search(r'/(\d+)\.cms', url)
|
||||
if cms is not None:
|
||||
return ('http://timesofindia.indiatimes.com/articleshow/%s.cms?prtpage=1' %
|
||||
cms.group(1))
|
||||
try:
|
||||
s = article.summary
|
||||
return urllib.unquote(
|
||||
re.search(r'href=".+?bookmark.cfm.+?link=(.+?)"', s).group(1))
|
||||
except:
|
||||
pass
|
||||
link = article.get('link', None)
|
||||
if link and link.split('/')[-1]=="story01.htm":
|
||||
link=link.split('/')[-2]
|
||||
encoding = {'0B': '.', '0C': '/', '0A': '0', '0F': '=', '0G': '&',
|
||||
'0D': '?', '0E': '-', '0N': '.com', '0L': 'http://'}
|
||||
for k, v in encoding.iteritems():
|
||||
link = link.replace(k, v)
|
||||
return link
|
||||
|
||||
return url
|
||||
def print_version(self, url):
    """Return the printer-friendly variant of an article URL.

    The ``prtpage=1`` query parameter is added to ``url``.

    :param url: the article URL as a string.
    :return: ``url`` with ``prtpage=1`` added to its query string.

    A URL with no query string and no fragment simply gets
    ``?prtpage=1`` appended. If the URL already carries a query string the
    parameter is joined with ``&`` instead of a second ``?``, and if it
    carries a ``#fragment`` the parameter is inserted before the fragment
    so that it stays part of the query.
    """
    # Split off any fragment first: the query must precede '#'.
    base, hash_mark, fragment = url.partition('#')
    if '?' not in base:
        separator = '?'
    elif base.endswith(('?', '&')):
        # Query delimiter already present with nothing after it.
        separator = ''
    else:
        separator = '&'
    return base + separator + 'prtpage=1' + hash_mark + fragment
|
||||
|
||||
def preprocess_html(self, soup, *args):
    """Remove every ``label`` element found inside the byline element.

    :param soup: parsed document; must provide ``find(attrs=...)``.
    :param args: extra positional arguments, accepted and ignored.
    :return: the same ``soup`` object, modified in place.

    Only the first element whose ``class`` is ``byline`` is examined; when
    there is none the document is returned untouched.
    """
    byline = soup.find(attrs={'class':'byline'})
    if byline is None:
        return soup
    for label in byline.findAll('label'):
        label.extract()
    return soup
|
||||
|
Loading…
x
Reference in New Issue
Block a user