diff --git a/recipes/toi.recipe b/recipes/toi.recipe index 8a772b6f9d..a44979dd4a 100644 --- a/recipes/toi.recipe +++ b/recipes/toi.recipe @@ -9,11 +9,12 @@ class TimesOfIndia(BasicNewsRecipe): max_articles_per_feed = 25 no_stylesheets = True - keep_only_tags = [{'class':['maintable12', 'prttabl']}] + remove_attributes = ['style'] + keep_only_tags = [{'class':re.compile(r'maintable12|prttabl')}] remove_tags = [ - dict(style=lambda x: x and 'float' in x), - {'class':['prvnxtbg', 'footbdrin', 'bcclftr']}, - ] + {'class':re.compile('tabsintbgshow|prvnxtbg')}, + {'id':['fbrecommend', 'relmaindiv']} + ] feeds = [ ('Top Stories', @@ -41,6 +42,8 @@ class TimesOfIndia(BasicNewsRecipe): ] def get_article_url(self, article): + # Times of India sometimes serves an ad page instead of the article; + # this code detects and circumvents that. url = BasicNewsRecipe.get_article_url(self, article) if '/0Ltimesofindia' in url: url = url.partition('/0L')[-1] @@ -61,6 +64,3 @@ class TimesOfIndia(BasicNewsRecipe): return url - - def preprocess_html(self, soup): - return soup