From 0841f08d2214625f8ce9f001ca396e652d483337 Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Sat, 14 Oct 2023 11:05:10 +0530 Subject: [PATCH] Newslaundry recipe --- recipes/icons/newslaundry.png | Bin 0 -> 286 bytes recipes/newslaundry.recipe | 36 ++++++++++++++++++++++++++++++++++ recipes/reuters.recipe | 29 +++++++++++++++------------ 3 files changed, 53 insertions(+), 12 deletions(-) create mode 100644 recipes/icons/newslaundry.png create mode 100644 recipes/newslaundry.recipe diff --git a/recipes/icons/newslaundry.png b/recipes/icons/newslaundry.png new file mode 100644 index 0000000000000000000000000000000000000000..342e059d0d300fea9fd81ebd87d0a2ac48038230 GIT binary patch literal 286 zcmV+(0pb3MP)n1G! z|NrU$0O}YZ>JJ&~A}Hz<9rm@h>HrV;#Kr#k`Rg4d^O&0JC@%AnlkaJ4?`3H8qNDPM ziS(eN_q@IO*VyU+3H|Nu?p$B_(9!aWjQZBs{Nv;M+}!!f%k4@|@qmNN;xy9$004GL zL_t&-l~vA55`!QVL{XZrDCCJz$w!d=uhNtgUV7AZMpuY?<~PBn9yABVz@}wMmVtKZ z=`;twu61BAY}FSId!|?38OZTnM)?PyFBllJ9i_4|bHY2ET_%E*psANPgof7JZ9qN{ kdb@U@8N59TrDDAA1(DMP_q!vt4FCWD07*qoM6N<$f}D?q`v3p{ literal 0 HcmV?d00001 diff --git a/recipes/newslaundry.recipe b/recipes/newslaundry.recipe new file mode 100644 index 0000000000..7b105a59ce --- /dev/null +++ b/recipes/newslaundry.recipe @@ -0,0 +1,36 @@ +from calibre.web.feeds.news import BasicNewsRecipe, classes + +class newslaundry(BasicNewsRecipe): + title = 'Newslaundry' + __author__ = 'unkn0wn' + description = ( + 'Newslaundry is a reader-supported, independent news media company. In an industry driven by corporate' + ' and government interests, we strongly believe in the need for an independent news model, and a free' + ' and accountable press.' + ) + language = 'en_IN' + masthead_url = 'https://images.assettype.com/newslaundry/2020-01/d91cad07-9650-47e9-8bdc-9a6247354d95/Header_logo_NL__2_New.png' + encoding = 'utf-8' + no_stylesheets = True + remove_javascript = True + oldest_article = 7 # days + resolve_internal_links = True + + ignore_duplicate_articles = {'url'} + + # keep_only_tags = [classes('headline subheadline authorWithTimeStamp story-card')] + + feeds = [ + ('Articles', 'https://www.newslaundry.com/stories.rss?time-period=last-7-days') + ] + + # def preprocess_html(self, soup): + # if h1 := soup.find(**classes('headline')): + # h1.name = 'h1' + # if h3 := soup.find(**classes('subheadline')): + # h3.name = 'h3' + # return soup + + def print_version(self, url): + if 'hindi.newslaundry' in url: self.abort_article('Skipping hindi article') # remove this line if you want hindi articles. + return url diff --git a/recipes/reuters.recipe b/recipes/reuters.recipe index f9d0e40f3a..92ddb4c0dd 100644 --- a/recipes/reuters.recipe +++ b/recipes/reuters.recipe @@ -58,27 +58,29 @@ class Reuters(BasicNewsRecipe): __author__ = 'Kovid Goyal' language = 'en' + keep_only_tags = [ prefixed_classes('article-body__container__ article-header__container__'), ] remove_tags = [ prefixed_classes( - 'context-widget__tabs___' + 'context-widget__tabs___ article-header__toolbar__ read-next-mobile__container__ toolbar__container__ button__link__' ' ArticleBody-read-time-and-social Slideshow-expand-button- TwoColumnsLayout-footer- RegistrationPrompt__container___' - ' SocialEmbed__inner___' + ' SocialEmbed__inner___ trust-badge author-bio__social__ with-spinner__spinner__ author-bio__author-image__' ), - dict(name=['button', 'link']), + dict(name=['button', 'link', 'svg']), ] - remove_attributes = ['style'] - extra_css = ''' - img { max-width: 100%; } - ''' + remove_attributes = ['style', 'height', 'width'] - def preprocess_html(self, soup, *a): - for noscript in soup.findAll('noscript'): - if noscript.findAll('img'): - noscript.name = 'div' - return soup + extra_css = ''' + img { max-width: 100%; } + [class^="article-header__tags__"], + [class^="author-bio__author-card__"], + [class^="article-header__author-date__"] { + font-size:small; + } + [data-testid="primary-gallery"], [data-testid="primary-image"] { font-size:small; text-align:center; } + ''' def parse_index(self): base, sections = country_defs[country] @@ -103,6 +105,9 @@ class Reuters(BasicNewsRecipe): self.log('\t', article['title'], article['url']) def preprocess_html(self, soup): + for noscript in soup.findAll('noscript'): + if noscript.findAll('img'): + noscript.name = 'div' for img in soup.findAll('img', attrs={'srcset':True}): img['src'] = img['srcset'].split()[0] return soup