From 4aa11d3b3e7d5ebc69ec7204718799755a8e2c20 Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Sun, 14 Apr 2024 19:28:01 +0530 Subject: [PATCH] update ORFonline --- recipes/observer_reach_foundation.recipe | 78 ++++++++++++++++-------- 1 file changed, 51 insertions(+), 27 deletions(-) diff --git a/recipes/observer_reach_foundation.recipe b/recipes/observer_reach_foundation.recipe index a2283d154d..f844708d62 100644 --- a/recipes/observer_reach_foundation.recipe +++ b/recipes/observer_reach_foundation.recipe @@ -1,8 +1,9 @@ +from urllib.parse import quote from calibre.web.feeds.news import BasicNewsRecipe, classes class ORF(BasicNewsRecipe): - title = u'Observer Research Foundation' + title = 'Observer Research Foundation' description = ( 'Set up in 1990, ORF seeks to lead and aid policy thinking towards building a strong and prosperous India' ' in a fair and equitable world. It helps discover and inform India’s choices, and carries Indian voices ' @@ -10,52 +11,75 @@ class ORF(BasicNewsRecipe): ) language = 'en_IN' __author__ = 'unkn0wn' - oldest_article = 7.5 # days - max_articles_per_feed = 25 encoding = 'utf-8' + no_stylesheets = True + remove_javascript = True masthead_url = 'https://www.orfonline.org/wp-content/uploads/2015/09/Logo_ORF_JPEG.jpg' remove_attributes = ['style', 'height', 'width'] - ignore_duplicate_articles = {'url'} + ignore_duplicate_articles = {'url', 'title'} + reverse_article_order = True + remove_empty_feeds = True + simultaneous_downloads = 1 + art_url = '' extra_css = ''' - .report-slider {font-size:small; color:#404040;} + img {display:block; margin:0 auto;} + .report-slider, .author_panel {font-size:small; color:#404040;} .report {font-size:small; font-weight:bold;} .excert-italic, .recent-block-people {font-style:italic; color:#202020;} blockquote, em {color:#202020;} + .espert_speak_panel {font-size:small;} ''' + articles_are_obfuscated = True + + def get_obfuscated_article(self, url): + br = self.get_browser() + soup = self.index_to_soup(url) + link = soup.a['href'] + skip_sections =[ # add sections you want to skip + '/video/', '/videos/', '/multimedia/', 'marathi', 'hindi', 'bangla' + ] + if any(x in link for x in skip_sections): + self.abort_article('skipping video links ', link) + self.log('Found ', link) + self.art_url = link + html = br.open(link).read() + return ({ 'data': html, 'url': link }) + + def get_browser(self): return BasicNewsRecipe.get_browser(self, user_agent='common_words/based') resolve_internal_links = True remove_empty_feeds = True - keep_only_tags = [classes('recent-updates-block recent-block-people')] + keep_only_tags = [ + dict(name='h1'), + classes('author_panel espert_speak_panel expert_panel_content') + ] remove_tags = [ classes( 'social socialshare comment-area-section telegramhtml post-tag ' - 'research-prev research-next' + 'research-prev research-next col-md-4 button_group sharethis-p tags' ) ] - feeds = [ - ('Commentaries', 'https://www.orfonline.org/content-type/commentary/feed/'), - ('Expert Speak', 'https://www.orfonline.org/expert-speak/feed/'), - ('Books and Monographs', 'https://www.orfonline.org/content-type/books/feed/'), - ('Event Reports', 'https://www.orfonline.org/content-type/event-reports/feed/'), - ('Events', 'https://www.orfonline.org/content-type/events/feed/'), - ('Forums', 'https://www.orfonline.org/content-type/forums/feed/'), - ('GP-ORF Series', 'https://www.orfonline.org/content-type/gp-orf-series/feed/'), - ('Issue Briefs & Special Reports', 'https://www.orfonline.org/content-type/issue-brief/feed/'), - ('Monitors', 'https://www.orfonline.org/content-type/monitors/feed/'), - ('Occasional Papers', 'https://www.orfonline.org/content-type/occasional-paper/feed/'), - ('Primer', 'https://www.orfonline.org/content-type/primer/feed/'), - ('Series', 'https://www.orfonline.org/content-type/series/feed/'), - ('Surveys & Polls', 'https://www.orfonline.org/content-type/surveys-polls/feed/'), - ('Young Voices', 'https://www.orfonline.org/content-type/young-voices/feed/'), - ] + feeds = [] - def print_version(self, url): - if 'marathi' in url or 'hindi' in url or 'bangla' in url: - return '' - return url + when = '170' # hours > 7 days + index = 'https://www.orfonline.org' + + sections = [ + 'expert-speak', 'books', 'event-reports', 'events', 'forums', 'gp-orf-series', 'issue-brief', 'monitors', + 'occasional-paper', 'primer', 'series', 'surveys-polls', 'young-voices', 'research' + ] + a = 'https://news.google.com/rss/search?q=when:{}h+allinurl:{}&hl=en-IN&gl=IN&ceid=IN:en' + for sec in sections: + sec_index = index + '/' + sec + '/' + feeds.append((sec.capitalize(), a.format(when, quote(sec_index, safe='')))) + feeds.append(('Others', a.format(when, quote(index, safe='')))) + + def populate_article_metadata(self, article, soup, first): + article.url = self.art_url + article.title = article.title.replace(' - Observer Research Foundation', '')