Fix Times of India

This commit is contained in:
Kovid Goyal 2011-09-24 10:52:22 -06:00
parent 6bf872b5be
commit 0f138cd43c

View File

@ -9,11 +9,12 @@ class TimesOfIndia(BasicNewsRecipe):
max_articles_per_feed = 25 max_articles_per_feed = 25
no_stylesheets = True no_stylesheets = True
keep_only_tags = [{'class':['maintable12', 'prttabl']}] remove_attributes = ['style']
keep_only_tags = [{'class':re.compile(r'maintable12|prttabl')}]
remove_tags = [ remove_tags = [
dict(style=lambda x: x and 'float' in x), {'class':re.compile('tabsintbgshow|prvnxtbg')},
{'class':['prvnxtbg', 'footbdrin', 'bcclftr']}, {'id':['fbrecommend', 'relmaindiv']}
] ]
feeds = [ feeds = [
('Top Stories', ('Top Stories',
@ -41,6 +42,8 @@ class TimesOfIndia(BasicNewsRecipe):
] ]
def get_article_url(self, article): def get_article_url(self, article):
# Times of India sometimes serves an ad page instead of the article;
# this code detects and circumvents that.
url = BasicNewsRecipe.get_article_url(self, article) url = BasicNewsRecipe.get_article_url(self, article)
if '/0Ltimesofindia' in url: if '/0Ltimesofindia' in url:
url = url.partition('/0L')[-1] url = url.partition('/0L')[-1]
@ -61,6 +64,3 @@ class TimesOfIndia(BasicNewsRecipe):
return url return url
def preprocess_html(self, soup):
return soup