From 310df5e73501e1c3f41d1990b3aafd9cdb7119b4 Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Sun, 21 May 2023 10:23:28 +0530 Subject: [PATCH] removing print editions of Business Standard and IE. . --- .../business_standard_print_edition.recipe | 92 ------------------ .../icons/business_standard_print_edition.png | Bin 1000 -> 0 bytes recipes/indian_express_print_edition.recipe | 87 ----------------- 3 files changed, 179 deletions(-) delete mode 100644 recipes/business_standard_print_edition.recipe delete mode 100644 recipes/icons/business_standard_print_edition.png delete mode 100644 recipes/indian_express_print_edition.recipe diff --git a/recipes/business_standard_print_edition.recipe b/recipes/business_standard_print_edition.recipe deleted file mode 100644 index 9158a49152..0000000000 --- a/recipes/business_standard_print_edition.recipe +++ /dev/null @@ -1,92 +0,0 @@ -''' -www.business-standard.com -''' - -from calibre.web.feeds.news import BasicNewsRecipe, classes - - -class BusinessStandard(BasicNewsRecipe): - title = 'Business Standard | Print Edition' - __author__ = 'unkn0wn' - description = "India's most respected business daily" - no_stylesheets = True - use_embedded_content = False - encoding = 'utf-8' - publisher = 'Business Standard Limited' - category = 'news, business, money, india, world' - language = 'en_IN' - extra_css = ''' - .article__desc{font-size:small;} - .article_image{font-size:small; font-style:italic;} - .article__dateline{font-size:small;} - .full-img{font-size:small; font-style:italic; text-align:center;} - .pubDate{font-size:small; text-align:center;} - ''' - - masthead_url = 'https://bsmedia.business-standard.com/include/_mod/site/html5/images/business-standard-logo.png' - - def get_cover_url(self): - soup = self.index_to_soup( - 'https://www.magzter.com/IN/Business-Standard-Private-Ltd/Business-Standard/Newspaper/' - ) - for citem in soup.findAll( - 'meta', content=lambda s: s and s.endswith('view/3.jpg') - ): - return citem['content'] - - remove_attributes = ['width', 'height', 'style'] - - keep_only_tags = [ - classes( - 'article__title article__content article_content article_image article__dateline headline' - ' alternativeHeadline full-img article-content__img pubDate' - ), - dict(name='section', attrs={'subscriptions-section': 'content'}), - dict(name='span', attrs={'class': 'p-content'}) - ] - remove_tags = [ - classes('also-read-panel related-keyword more-stories-pagination'), - dict(name='br') - ] - - def parse_index(self): - soup = self.index_to_soup('https://www.business-standard.com/todays-paper') - ans = self.bs_parse_index(soup) - return ans - - def bs_parse_index(self, soup): - feeds = [] - div = soup.find('div', attrs={'class': 'main-cont-left'}) - for section in div.findAll('div', attrs={'class': 'row-inner'}): - h2 = section.find('h2') - secname = self.tag_to_string(h2) - self.log(secname) - articles = [] - for a in section.findAll( - 'a', href=lambda x: x and x.startswith('/article/') - ): - url = a['href'].replace('article', 'article-amp') - url = 'https://wap.business-standard.com' + url - title = self.tag_to_string(a).strip().replace('Premium Content', '') - articles.append({'title': title, 'url': url}) - self.log('\t', title, '\n\t\t', url) - if articles: - feeds.append((secname, articles)) - return feeds - - def preprocess_html(self, soup): - subs = soup.find('section', attrs={'subscriptions-section': 'content'}) - if subs: - art = soup.find(**classes('article_image')) - if art: - art.extract() - div = soup.find(**classes('article_content')) - if div: - div.extract() - h2 = soup.find('h2') - if h2: - h2.name = 'h4' - for img in soup.findAll('amp-img', src=True): - img.name = 'img' - img['src'] = img['src'].replace('\\', '').split('?')[0] - return soup diff --git a/recipes/icons/business_standard_print_edition.png b/recipes/icons/business_standard_print_edition.png deleted file mode 100644 index 83a1b55c065330fd087d9300071ef505be4970ee..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1000 zcmV>P)+^deIOYb2k!q^?H^5X4YRNouaT?+X%8eM#!ZfBc4{=<;U^Wr?J={(9!3H0>$Vd(2?xPI+A(y0vo8XUo#;TTQJSK{u~YmkgWBm;F3 zG)QASfKo>%&WF9|$jnA%d<>T^Uq{obb?EMI$N7t$`0eryB(`kFeOtF+$(uW&){3e+ z^&lUkJfnsymk!{<%BRs%fr~vk^vzn1r3a58X%a>oaCFzh2=*OA=ge-*+j|(lWQNrgX=e<=h>Y&_v-6N6+g$){eR*{m_=7yz|YUELh;0D+~{9` z)rXFw)LEY%1jR-Opsmz__HsK)4?TyDVAh(|7!H%@d;cq(+_n{e ze0mTIK0AQ2ore(xnbm8MQ&~lL)8)dUA%>0_glD?YQduCH(}tl zUHEQj5G{ovm`K(|kd#8PSV1n+iko2uVKRsAN)^8zJb~s)3wl?qK;`&HIDP0H^geJe zTJP+^_#H8Bbv2VGQ#It`5Vffoi7?>yX%~f1 zO+vB)VO$?UDIUgXwGmyV3{)X%0#O;E^nV|2XH>&TwSZFCj5d%N%|kAY;dG`Rf`0&} WL0{*r$FV2?0000pI diff --git a/recipes/indian_express_print_edition.recipe b/recipes/indian_express_print_edition.recipe deleted file mode 100644 index cc5bbe74a4..0000000000 --- a/recipes/indian_express_print_edition.recipe +++ /dev/null @@ -1,87 +0,0 @@ -from calibre.web.feeds.news import BasicNewsRecipe, classes -from collections import defaultdict - - -class IndianExpressPrint(BasicNewsRecipe): - title = u'Indian Express | Print Edition' - language = 'en_IN' - __author__ = 'unkn0wn' - masthead_url = 'https://indianexpress.com/wp-content/themes/indianexpress/images/indian-express-logo-n.svg' - no_stylesheets = True - use_embedded_content = False - remove_attributes = ['style', 'height', 'width'] - ignore_duplicate_articles = {'url'} - - extra_css = ''' - #storycenterbyline {font-size:small;} - #img-cap {font-size:small;} - blockquote{color:#404040;} - em{font-style:italic; color:#202020;} - #sub-d{color:#202020; font-style:italic;} - .ie-authorbox{font-size:small;} - ''' - - resolve_internal_links = True - remove_empty_feeds = True - - keep_only_tags = [classes('heading-part full-details')] - remove_tags = [ - dict(name='div', attrs={'id': 'ie_story_comments'}), - dict(name='div', attrs={'class': lambda x: x and 'related-widget' in x}), - dict(name='img', attrs={'src':lambda x: x and x.endswith('-button-300-ie.jpeg')}), - dict(name='a', attrs={'href':lambda x: x and x.endswith('/?utm_source=newbanner')}), - classes( - 'share-social appstext ie-int-campign-ad ie-breadcrumb custom_read_button unitimg copyright' - ' storytags pdsc-related-modify news-guard premium-story append_social_share' - ' digital-subscriber-only h-text-widget ie-premium ie-first-publish adboxtop adsizes immigrationimg' - 'next-story-wrap ie-ie-share next-story-box brand-logo quote_section ie-customshare' - ' custom-share o-story-paper-quite ie-network-commenting audio-player-tts-sec' - ) - ] - - def parse_index(self): - soup = self.index_to_soup('https://indianexpress.com/todays-paper/') - feeds_dict = defaultdict(list) - div = soup.find('div', attrs={'class':'today-paper'}) - for a in div.findAll('a', attrs={'href':lambda x: x and x.startswith( - ('https://indianexpress.com/article/', 'https://indianexpress.com/elections/') - )}): - if not a.find('img'): - url = a['href'] - title = self.tag_to_string(a) - section = 'Front Page' - if str := a.findParent('strong'): - if span := str.find_previous_sibling('span'): - section = self.tag_to_string(span) - # if 'City' in section: - # url = '' - if not url or not title: - continue - self.log(section, '\n\t', title, '\n\t\t', url) - feeds_dict[section].append({"title": title, "url": url}) - return [(section, articles) for section, articles in feeds_dict.items()] - - def get_cover_url(self): - soup = self.index_to_soup( - 'https://www.magzter.com/IN/The-Indian-Express-Ltd./The-Indian-Express-Mumbai/Newspaper/' - ) - for citem in soup.findAll('meta', content=lambda s: s and s.endswith('view/3.jpg')): - return citem['content'] - - def preprocess_html(self, soup): - h2 = soup.find('h2') - if h2: - h2.name = 'p' - h2['id'] = 'sub-d' - for span in soup.findAll( - 'span', attrs={'class': ['ie-custom-caption', 'custom-caption']} - ): - span['id'] = 'img-cap' - for img in soup.findAll('img'): - noscript = img.findParent('noscript') - if noscript is not None: - lazy = noscript.findPreviousSibling('img') - if lazy is not None: - lazy.extract() - noscript.name = 'div' - return soup