mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Fix #905794 (Many times of india news articles dont appear)
This commit is contained in:
parent
b0e9e8f349
commit
6929163527
@ -1,4 +1,4 @@
|
|||||||
import re
|
import re, urllib
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
class TimesOfIndia(BasicNewsRecipe):
|
class TimesOfIndia(BasicNewsRecipe):
|
||||||
@ -17,7 +17,9 @@ class TimesOfIndia(BasicNewsRecipe):
|
|||||||
]
|
]
|
||||||
remove_tags = [
|
remove_tags = [
|
||||||
{'class':re.compile('tabsintbgshow|prvnxtbg')},
|
{'class':re.compile('tabsintbgshow|prvnxtbg')},
|
||||||
{'id':['fbrecommend', 'relmaindiv']}
|
{'id':['fbrecommend', 'relmaindiv', 'shretxt', 'fbrecos', 'twtdiv',
|
||||||
|
'gpls', 'auim']},
|
||||||
|
{'class':['twitter-share-button', 'cmtmn']},
|
||||||
]
|
]
|
||||||
|
|
||||||
feeds = [
|
feeds = [
|
||||||
@ -46,25 +48,27 @@ class TimesOfIndia(BasicNewsRecipe):
|
|||||||
]
|
]
|
||||||
|
|
||||||
def get_article_url(self, article):
|
def get_article_url(self, article):
|
||||||
# Times of India sometimes serves an ad page instead of the article,
|
try:
|
||||||
# this code, detects and circumvents that
|
s = article.summary
|
||||||
url = BasicNewsRecipe.get_article_url(self, article)
|
return urllib.unquote(
|
||||||
if '/0Ltimesofindia' in url:
|
re.search(r'href=".+?bookmark.cfm.+?link=(.+?)"', s).group(1))
|
||||||
url = url.partition('/0L')[-1]
|
except:
|
||||||
url = url.replace('0B', '.').replace('0N', '.com').replace('0C',
|
pass
|
||||||
'/').replace('0E', '-')
|
link = article.get('link', None)
|
||||||
url = 'http://' + url.rpartition('/')[0]
|
if link and link.split('/')[-1]=="story01.htm":
|
||||||
match = re.search(r'/([0-9a-zA-Z]+?)\.cms', url)
|
link=link.split('/')[-2]
|
||||||
if match is not None:
|
encoding = {'0B': '.', '0C': '/', '0A': '0', '0F': '=', '0G': '&',
|
||||||
num = match.group(1)
|
'0D': '?', '0E': '-', '0N': '.com', '0L': 'http://'}
|
||||||
num = re.sub(r'[^0-9]', '', num)
|
for k, v in encoding.iteritems():
|
||||||
return ('http://timesofindia.indiatimes.com/articleshow/%s.cms?prtpage=1' %
|
link = link.replace(k, v)
|
||||||
num)
|
return link
|
||||||
else:
|
|
||||||
cms = re.search(r'/(\d+)\.cms', url)
|
|
||||||
if cms is not None:
|
|
||||||
return ('http://timesofindia.indiatimes.com/articleshow/%s.cms?prtpage=1' %
|
|
||||||
cms.group(1))
|
|
||||||
|
|
||||||
return url
|
def print_version(self, url):
|
||||||
|
return url + '?prtpage=1'
|
||||||
|
|
||||||
|
def preprocess_html(self, soup, *args):
|
||||||
|
byl = soup.find(attrs={'class':'byline'})
|
||||||
|
if byl is not None:
|
||||||
|
for l in byl.findAll('label'):
|
||||||
|
l.extract()
|
||||||
|
return soup
|
||||||
|
Loading…
x
Reference in New Issue
Block a user