From bc057a48b5ac0e27f8ac4fc4b1ed96c50cf5454f Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 20 Apr 2022 21:50:17 +0530 Subject: [PATCH] Update The Economic Times India --- recipes/theeconomictimes_india.recipe | 64 +++++++-------------------- 1 file changed, 15 insertions(+), 49 deletions(-) diff --git a/recipes/theeconomictimes_india.recipe b/recipes/theeconomictimes_india.recipe index 4d7e30cb5e..041af8a594 100644 --- a/recipes/theeconomictimes_india.recipe +++ b/recipes/theeconomictimes_india.recipe @@ -5,7 +5,7 @@ economictimes.indiatimes.com ''' -from calibre.web.feeds.news import BasicNewsRecipe +from calibre.web.feeds.news import BasicNewsRecipe, classes class TheEconomicTimes(BasicNewsRecipe): @@ -15,44 +15,26 @@ class TheEconomicTimes(BasicNewsRecipe): publisher = 'economictimes.indiatimes.com' category = 'news, finances, politics, India' oldest_article = 1 - max_articles_per_feed = 100 + max_articles_per_feed = 50 no_stylesheets = True use_embedded_content = False simultaneous_downloads = 1 encoding = 'utf-8' language = 'en_IN' + remove_attributes = ['style', 'height', 'width'] publication_type = 'newspaper' masthead_url = 'http://economictimes.indiatimes.com/photo/2676871.cms' - extra_css = """ - body{font-family: Arial,Helvetica,sans-serif} - .foto_mg{font-size: 60%; - font-weight: 700;} - h1{font-size: 150%;} - artdate{font-size: 60%} - artag{font-size: 60%} - div.storycontent{padding-top: 10px} - """ - conversion_options = {'comment': description, - 'tags': category, - 'publisher': publisher, - 'language': language - } - remove_tags_before = dict(name='article') - remove_tags_after = [dict(name='article')] - keep_only_tags = [dict(name='h1', attrs={'class': 'title'}), - dict(name='div', attrs={'class': 'bylineFull'}), - dict(name='div', attrs={'class': 'articleImg'}), - dict(name='div', attrs={'class': 'artText'}) - ] - remove_tags = [dict(name='div', attrs={'class': 'cmtLinks'}), - dict(name='div', attrs={'class': 'raltedTopics'}), - dict(name='div', attrs={'class': 'editorsPick'}), - dict(name='div', attrs={'class': 'articleImg etSpecial'}), - dict(name='div', attrs={'class': 'articleImg artAd'}), - dict(name='div', attrs={'class': 'appPromotion'}) - ] + ignore_duplicate_articles = {'title', 'url'} + extra_css = '.summary {font-weight:normal; font-size:normal; }' + + keep_only_tags = [ + dict(name='h1'), + classes('artByline artSyn artImg artText publisher publish_on slideshowPackage'), + ] + remove_tags = [ + classes('story_title storyCollection shareBar'), + ] - remove_attributes = ['xmlns'] feeds = [(u'Top Stories', u'http://economictimes.indiatimes.com/rssfeedstopstories.cms'), (u'News', u'http://economictimes.indiatimes.com/News/rssfeeds/1715249553.cms'), (u'Market', u'http://economictimes.indiatimes.com/Markets/markets/rssfeeds/1977021501.cms'), @@ -67,23 +49,7 @@ class TheEconomicTimes(BasicNewsRecipe): (u'NRI', u'http://economictimes.indiatimes.com/rssfeeds/7771250.cms') ] - # Uses the mobile print version. For web print version use - # 'http://economictimes.indiatimes.com/articleshow/?prtpage=1' - def print_version(self, url): - rest, sep, article_id = url.rpartition('/articleshow/') - # return 'http://m.economictimes.com/PDAET/articleshow/' + article_id - return 'http://economictimes.indiatimes.com/articleshow/' + article_id + '?prtpage=1' - - def get_article_url(self, article): - rurl = article.get('link', None) - if (rurl.find('/quickieslist/') > 0) or (rurl.find('/quickiearticleshow/') > 0): - return None - return rurl - def preprocess_html(self, soup): - for item in soup.findAll(style=True): - del item['style'] + for img in soup.findAll('img', attrs={'data-original': True}): + img['src'] = img['data-original'] return soup - - def postprocess_html(self, soup, first_fetch): - return self.adeify_images(soup)