From 1b8872aae930df40c32a316eb66ee20529f3c6e2 Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Sun, 15 Oct 2023 23:17:18 +0530 Subject: [PATCH 1/2] Project Syndicate recipe --- recipes/icons/project_syndicate.png | Bin 0 -> 440 bytes recipes/project_syndicate.recipe | 71 ++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+) create mode 100644 recipes/icons/project_syndicate.png create mode 100644 recipes/project_syndicate.recipe diff --git a/recipes/icons/project_syndicate.png b/recipes/icons/project_syndicate.png new file mode 100644 index 0000000000000000000000000000000000000000..7cd36a0cb20af71fe3e79d53474690b52489b5ab GIT binary patch literal 440 zcmV;p0Z0CcP)g`L?a}oP!PdV|AN8NB0s@G8!HTo~GVAKxPjU1-HBJ z3y}-Bj(Kcix#(B$9xFJH89c`hrb>YiMD|6#6xoQ}5IHS!QsioR`yz5%Qt{7q zMmE;e-|?zgr_jfA2|kJ2xPbZYnE8Bb=@o8@+{6n!!x4NOm!N;hK8*)>i2GRA)O({_ zh@2=Cl~z6%{Wz}SF|La29%hg@flJuMjRKxA+|*?pD{PkctF^cARphy{g`UU-kt-q} zi9926QRM4_ZA7k)N-)=IG-Jcd=f iC6K26@fYU*F24aK&1L61#USkf0000 Date: Mon, 16 Oct 2023 08:00:18 +0530 Subject: [PATCH 2/2] ... --- recipes/icons/newsminute.png | Bin 0 -> 633 bytes recipes/newsminute.recipe | 70 +++++++++++++++++++++++++++++++ recipes/project_syndicate.recipe | 6 +-- 3 files changed, 73 insertions(+), 3 deletions(-) create mode 100644 recipes/icons/newsminute.png create mode 100644 recipes/newsminute.recipe diff --git a/recipes/icons/newsminute.png b/recipes/icons/newsminute.png new file mode 100644 index 0000000000000000000000000000000000000000..5544ac433e9cdeb067ab8acd2db2499ce5806230 GIT binary patch literal 633 zcmV-<0*3vGP)MBFIYG zwXn#dS`}3PK#LHfEZ&qkBTP`fH;uk|^SmuI)5&1Y>Mp+T$2s3QS5Q&6gQPS?pMW%q z76cv&YeDdW5_kLI+e*Fo^B4oe(YdSyrv=DL@RE4WhfC2CLp|Ux1IEB87>p}8u$J*3 zkm7#Cqqhfq4ZzkJs@4pk1VlL)=s#wvD!w0YJol}1ZmH`q&&QkcAp?+sKwq>pCz!IZ6*ASdpE2T(t3qsTIOD?|a;@LK4 zB8g|uQq1L0a6y&@BN9#ldL=+cBACrBpb6%3Im+|%7^ca();3D^E4TRL*S)|qpYI1X zVhDCr0=$BQld+i3=P72Vk>N1KY?jjOPv-LZD<^y&EuXSu@EF)%_`S>og|C@%%Mx<-R+X6ZXCMS?|DM!`!2caFP!TB-jHJYvnX T^5ykk00000NkvXXu0mjfPzxXb literal 0 HcmV?d00001 diff --git a/recipes/newsminute.recipe b/recipes/newsminute.recipe new file mode 100644 index 0000000000..ac675581a9 --- /dev/null +++ b/recipes/newsminute.recipe @@ -0,0 +1,70 @@ +from calibre.web.feeds.news import BasicNewsRecipe, classes +from calibre.ptempfile import PersistentTemporaryFile + + +class newsminute(BasicNewsRecipe): + title = 'The News Minute' + __author__ = 'unkn0wn' + description = ( + 'The News Minute is a digital news platform reporting and writing on issues in India, with a ' + 'specific focus on the 5 southern states. Our content includes news, ground reportage, news ' + 'analysis, opinion and blogs. Our core strengths include our deep access in the southern states, ' + 'incisive editorial acumen and insightful news analysis and opinions.' + ) + language = 'en_IN' + + no_stylesheets = True + remove_javascript = True + masthead_url = 'https://pkcindia.com/wp-content/uploads/2021/09/TMN-Logo-1.png' + ignore_duplicate_articles = {'title', 'url'} + resolve_internal_links = True + remove_empty_feeds = True + remove_attributes = ['style', 'height', 'width'] + articles_are_obfuscated = True + + def get_obfuscated_article(self, url): + br = self.get_browser() + try: + br.open(url) + except Exception as e: + url = e.hdrs.get('location') + soup = self.index_to_soup(url) + link = soup.find('a', href=True) + skip_sections =[ # add sections you want to skip + '/video/', '/videos/', '/media/', 'podcast-' + ] + if any(x in link['href'] for x in skip_sections): + self.log('Aborting Article ', link['href']) + self.abort_article('skipping video links') + + self.log('Downloading ', link['href']) + html = br.open(link['href']).read() + pt = PersistentTemporaryFile('.html') + pt.write(html) + pt.close() + return pt.name + + keep_only_tags = [ + classes( + 'arr--section-name arr--story--headline-h1 arr--sub-headline arr--hero-image author-card-wrapper arr--story-page-card-wrapper' + ), + ] + + feeds = [] + + sections = [ + 'tamil-nadu', 'telangana', 'andhra-pradesh', 'karnataka', 'kerala' + ] + + for sec in sections: + a = 'https://news.google.com/rss/search?q=when:27h+allinurl:https%3A%2F%2Fwww.thenewsminute.com{}&hl=en-IN&gl=IN&ceid=IN:en' + feeds.append((sec.capitalize(), a.format('%2F' + sec + '%2F'))) + feeds.append(('Others', a.format(''))) + + def populate_article_metadata(self, article, soup, first): + article.title = article.title.replace(' - The News Minute', '') + + def preprocess_html(self, soup): + for img in soup.findAll('img', attrs={'data-src':True}): + img['src'] = img['data-src'] + return soup diff --git a/recipes/project_syndicate.recipe b/recipes/project_syndicate.recipe index a87ea0d9a1..5091865961 100644 --- a/recipes/project_syndicate.recipe +++ b/recipes/project_syndicate.recipe @@ -61,9 +61,9 @@ class projectsynd(BasicNewsRecipe): def preprocess_html(self, soup): for img in soup.findAll('img', attrs={'old-src':True}): img['src'] = img['old-src'].replace('medium', 'xlarge') - if abs := soup.find(attrs={'itemprop':'abstract'}): - abs.name = 'p' - abs['class'] = 'sub' + if abst := soup.find(attrs={'itemprop':'abstract'}).find('div'): + abst.name = 'p' + abst['class'] = 'sub' for div in soup.findAll('div', attrs={'data-line-id':True}): div.name = 'p' for a in soup.findAll('a', href=True):