From 94b62da2b1d7af520fe8f133b81968bf941e4ec0 Mon Sep 17 00:00:00 2001
From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com>
Date: Sat, 10 Feb 2024 10:01:32 +0530
Subject: [PATCH] Update moneycontrol.recipe

---
 recipes/moneycontrol.recipe | 150 ++++++++++++++++++++++--------------
 1 file changed, 96 insertions(+), 54 deletions(-)

diff --git a/recipes/moneycontrol.recipe b/recipes/moneycontrol.recipe
index 3c8f0483ea..320398a975 100644
--- a/recipes/moneycontrol.recipe
+++ b/recipes/moneycontrol.recipe
@@ -1,64 +1,106 @@
-from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import Tag
-
-
-def new_tag(soup, name, attrs=()):
-    impl = getattr(soup, 'new_tag', None)
-    if impl is not None:
-        return impl(name, attrs=dict(attrs))
-    return Tag(soup, name, attrs=attrs or None)
+from calibre.web.feeds.news import BasicNewsRecipe, classes
+from urllib.parse import quote
 
 
 class MoneyControlRecipe(BasicNewsRecipe):
-    __license__ = 'GPL v3'
-    __author__ = 'kwetal'
-    language = 'en_IN'
-    locale = 'en_IN'
-    encoding = 'iso-8859-1'
-    version = 1
     title = u'Money Control'
-    publisher = u'moneycontrol.com'
-    category = u'News, Financial, India'
-    description = u'Financial news from India'
-
-    oldest_article = 7
-    max_articles_per_feed = 100
-    use_embedded_content = False
-
+    __author__ = 'unkn0wn'
+    description = 'Read the latest business news on the Indian economy, global market, upcoming IPOs and more.'
+    language = 'en_IN'
+    masthead_url = 'https://images.moneycontrol.com/images/ftpopup/moneyloginlogo.png'
+    encoding = 'utf-8'
     no_stylesheets = True
     remove_javascript = True
+    remove_attributes = ['width', 'height', 'float', 'style']
+
+    ignore_duplicate_articles = {'title', 'url'}
+    remove_empty_feeds = True
+    resolve_internal_links = True
+
+    extra_css = '''
+        img {display:block; margin:0 auto;}
+        .article_image_wrapper { font-size:small; text-align:center; }
+        .articlename_join_follow, .author_wrapper, .FT_block_article { font-size:small; color:#404040; }
+        .article_desc { font-style:italic; color:#202020; }
+    '''
+
+    articles_are_obfuscated = True
+
+    def get_obfuscated_article(self, url):
+        """Open `url`, follow the href of its first <a> and return that page's HTML; video/multimedia links are aborted."""
+        br = self.get_browser()
+        soup = self.index_to_soup(url)
+        link = soup.a['href']
+        skip_sections =[ # add sections you want to skip
+            '/video/', '/videos/', '/multimedia/',
+        ]
+        if any(x in link for x in skip_sections):
+            self.abort_article('skipping video links ' + link)  # abort_article() takes one message argument
+        self.log('Found ', link)
+        html = br.open(link).read()
+        return ({ 'data': html, 'url': link })
+
+    keep_only_tags = [
+        dict(name='div', attrs={'id':lambda x: x and x.startswith('article-')})
+    ]
+
+    remove_tags = [
+        dict(name=['svg', 'style', 'button', 'script']),
+        dict(attrs={'id':['social_icon_impression', 'taboola-mid-article-thumbnails']}),
+        classes(
+            'social_icons_wrapper mid-arti-ad lastPara related_stories_left_block social_icons_mobile_wrapper '
+            'advSlotsWithoutGrayBox tags_wrapper maintextdiv page_right_wrapper stockwidget tech_newsletter'
+        )
+    ]
+
+    def preprocess_html(self, soup):
+        """Make the article description a <p>, turn <h2> inside image wrappers into <span>, and copy data-src into src."""
+        desc = soup.find(**classes('article_desc'))
+        if desc:
+            desc.name = 'p'
+        for wrap in soup.findAll(**classes('article_image_wrapper')):
+            for h2 in wrap.findAll('h2'):
+                h2.name = 'span'
+        for img in soup.findAll('img', attrs={'data-src':True}):
+            img['src'] = img['data-src']
+        return soup
 
     feeds = []
+
+    when = 27 # hours
+    index = 'https://www.moneycontrol.com/'
+
+    business_sections = [
+        'markets', 'stocks', 'ipo', 'budget', 'banks', 'moneycontrol-research', 'economy', 'earnings', 'real-estate',
+        'personal-finance', 'commodities', 'trade', 'companies'
+    ]
+
+    for sec in business_sections:
+        allinurl_a = index + 'news/business'
+        a = 'https://news.google.com/rss/search?q=when:{}h+allinurl:{}{}&hl=en-IN&gl=IN&ceid=IN:en'
+        feeds.append((sec.capitalize(), a.format(when, quote(allinurl_a, safe=''), '%2F' + sec + '%2F')))
+    feeds.append(('Business' , a.format(str(when), quote(allinurl_a, safe=''), '')))
+
+    news_sections = [
+        'india', 'world', 'opinion', 'politics', 'technology', 'trends', 'lifestyle'
+    ]
+
+    for sec in news_sections:
+        allinurl_b = index + 'news'
+        b = 'https://news.google.com/rss/search?q=when:{}h+allinurl:{}{}&hl=en-IN&gl=IN&ceid=IN:en'
+        feeds.append((sec.capitalize(), b.format(str(when), quote(allinurl_b, safe=''), '%2F' + sec + '%2F')))
+    feeds.append(('News', b.format(str(when), quote(allinurl_b, safe=''), '')))
     feeds.append(
-        (u'Latest News', u'http://www.moneycontrol.com/rss/latestnews.xml'))
-    feeds.append(
-        (u'All Stories', u'http://www.moneycontrol.com/rss/allstories.xml'))
+        ('Others', 'https://news.google.com/rss/search?q=when:{}h+allinurl:{}&hl=en-IN&gl=IN&ceid=IN:en'.format(str(when), quote(index, safe='')))
+    )
 
-    def print_version(self, url):
-        return url.replace('/stocksnews.php?', '/news_print.php?') + '&sr_no=0'
-
-    # The articles contain really horrible html. More than one and section, not properly closed tags, lots and lots of
-    # tags and some weird markup that crashes the conversion to ebook. Needs some drastic sanitizing
-    '''def preprocess_html(self, soup):
-        freshSoup = BeautifulSoup('')
-
-        headline = soup.find('td', attrs = {'class': 'heading'})
-        if headline:
-            h1 = new_tag(freshSoup, 'h1')
-            # Convert to string before adding it to the document!
-            h1.append(self.tag_to_string(headline))
-            freshSoup.body.append(h1)
-
-        for p in soup.findAll('p', attrs={'class': true}):
-            if ''.join(p['class']) == 'MsoNormal':
-                # We have some weird pagebreak marker here; it will not find all of them however
-                continue
-
-            para = new_tag(freshSoup, 'p')
-            # Convert to string; this will loose all formatting but also all illegal markup
-            para.append(self.tag_to_string(p))
-
-            freshSoup.body.append(para)
-
-        return freshSoup
-    '''
+    def populate_article_metadata(self, article, soup, first):
+        """Set the article URL from data-io-article-url, the summary from .article_desc, and drop the ' - Moneycontrol' title suffix."""
+        div = soup.find('div', attrs={'data-io-article-url':True})
+        if div:
+            article.url = div['data-io-article-url']
+        desc = soup.find(**classes('article_desc'))
+        if desc:
+            article.summary = self.tag_to_string(desc)
+            article.text_summary = article.summary
+        article.title = article.title.replace(' - Moneycontrol', '')