From 791857d630d99c7b2ff4fb326d86d79e73d1bfa6 Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Sat, 13 Apr 2024 09:57:15 +0530 Subject: [PATCH 1/3] Update Business Today --- recipes/business_today.recipe | 5 +- .../hindu_business_line_print_edition.recipe | 94 ------------------ .../hindu_business_line_print_edition.png | Bin 1577 -> 0 bytes 3 files changed, 3 insertions(+), 96 deletions(-) delete mode 100644 recipes/hindu_business_line_print_edition.recipe delete mode 100644 recipes/icons/hindu_business_line_print_edition.png diff --git a/recipes/business_today.recipe b/recipes/business_today.recipe index 44c5705ba3..1ea2f2b349 100644 --- a/recipes/business_today.recipe +++ b/recipes/business_today.recipe @@ -74,7 +74,7 @@ class BT(BasicNewsRecipe): # Insert feeds in specified order, if available - feedSort = ['Editor\'s Note'] + feedSort = ['Editor\'s Note', 'Editors note'] for i in feedSort: if i in sections: feeds.append((i, sections[i])) @@ -82,7 +82,8 @@ class BT(BasicNewsRecipe): # Done with the sorted feeds for i in feedSort: - del sections[i] + if i in sections: + del sections[i] # Append what is left over... diff --git a/recipes/hindu_business_line_print_edition.recipe b/recipes/hindu_business_line_print_edition.recipe deleted file mode 100644 index b0811a087c..0000000000 --- a/recipes/hindu_business_line_print_edition.recipe +++ /dev/null @@ -1,94 +0,0 @@ -import json -import re -from collections import defaultdict -from datetime import date - -from calibre.web.feeds.news import BasicNewsRecipe, classes - - -def absurl(url): - if url.startswith('/'): - url = 'https://www.thehindubusinessline.com' + url - return url - - -local_edition = None -# Chennai is default edition, for other editions use 'bl_hyderabad', 'bl_bangalore', 'bl_mumbai' - - -class BusinessLine(BasicNewsRecipe): - title = 'The Hindu BusinessLine | Print Edition' - __author__ = 'unkn0wn' - description = ( - 'The Hindu BusinessLine is known for its credibility, accuracy, in-depth analysis of markets and sober coverage' - ' of business news. BusinessLine reduces the daily grind of business to relevant, readable, byte-sized stories.' - ' The newspaper is extensively followed by the decision makers and change leaders from the world of business.' - ) - language = 'en_IN' - no_stylesheets = True - masthead_url = 'https://www.thehindubusinessline.com/theme/images/bl-online/bllogo.png' - remove_attributes = ['style', 'height', 'width'] - extra_css = '.caption{font-size:small; text-align:center;}'\ - '.author{font-size:small; font-weight:bold;}'\ - '.subhead, .subhead_lead {font-weight:bold;}'\ - 'img {display:block; margin:0 auto;}' - - ignore_duplicate_articles = {'url'} - - keep_only_tags = [ - classes('articlepage') - ] - - remove_tags = [ - classes('hide-mobile comments-shares share-page editiondetails author-img') - ] - - def preprocess_html(self, soup): - for cap in soup.findAll('p', attrs={'class':'caption'}): - cap.name = 'figcaption' - for img in soup.findAll('img', attrs={'data-original':True}): - img['src'] = img['data-original'] - return soup - - def parse_index(self): - dt = date.today().strftime('%Y-%m-%d') - # For past editions, set date to, for example, '2023-01-28' - # dt = '2023-01-28' - if local_edition: - url = absurl('/todays-paper/' + dt + '/' + local_edition + '/') - else: - url = absurl('/todays-paper/' + dt + '/bl_chennai/') - raw = self.index_to_soup(url, raw=True) - soup = self.index_to_soup(raw) - ans = self.hindu_parse_index(soup) - if not ans: - raise ValueError( - 'The Hindu BusinessLine Newspaper is not published Today.' - ) - cover = soup.find(attrs={'class':'hindu-ad'}) - if cover: - self.cover_url = cover.img['src'] - return ans - - def hindu_parse_index(self, soup): - for script in soup.findAll('script'): - if not self.tag_to_string(script).strip().startswith('let grouped_articles = {}'): - continue - if script is not None: - art = re.search(r'grouped_articles = ({\".*)', self.tag_to_string(script)) - data = json.JSONDecoder().raw_decode(art.group(1))[0] - - feeds_dict = defaultdict(list) - - a = json.dumps(data) - for sec in json.loads(a): - for item in data[sec]: - section = sec.replace('BL_', '') - title = item['articleheadline'] - url = absurl(item['href']) - desc = 'Page no.' + item['pageno'] + ' | ' + item['teaser_text'] or '' - self.log('\t', title, '\n\t\t', url) - feeds_dict[section].append({"title": title, "url": url, "description": desc}) - return [(section, articles) for section, articles in feeds_dict.items()] - else: - return [] diff --git a/recipes/icons/hindu_business_line_print_edition.png b/recipes/icons/hindu_business_line_print_edition.png deleted file mode 100644 index 94791ebf0db8be490ed749d82ec5ed65738a8105..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1577 zcmV+^2G;qBP)Q+SK~#8Nl~!GB6-5-Dnccg$xBt*uK&(M5sTz$zL@JGFG||Km2|gk5 z1qct~1C6l~HBk~@OiT>H7YUFMW8woQ0x_6^RIyk}gRxW~MX<1})4`?GuR{?3f& z?5t~lZUde>nR|BT?03$bpFKlUsg!No?{x0k_2EZGGAV>$gpzqkMiY_ftyw3%o#c&jVbJ1`|Sv%u6 z5xUYBO{jp9v7A$1TSWuTLvTv|KPgy+I_s2;$ z7j)F7bUcobcOfp2AYPR^Msd7%O!rixxVYZr>1Y2%nQow2+^nyd8&11 z*B4#a&i$1=_S@+Cz5TC!*1Pxnp`o-%smcg>C2e(G7<=l&ZWQh~v9J63rIEtztWAid zl$2bwBcU;kP>oR{r_rkSe0lo}ccQ`UsQu`iiQG#C)<`5X%)h zqN6f~Iw%k;E3K*;q=C8yMy(0Q_QaAQVnyj*FxuI=gVYKoi=jf%KkV zM>elqw4wdMw$?^Ew}vIccw9Px)HL)si^HkDKjlv#cxg|6|8RDW zizI+($~lqi@nnV`r)fD-`muFzx%#06;H4BSIl=yu*J9tq#7OM-2ri)!C^yRuW(K*u z9#r#qbFJ-5lGKLMrJNIlcF&MvRFPUh+mHS}I%#@J<-L+Yh{iByR4y;L)DVKx5f}~!@D99U(QLIYzOr^mF zZWTU1a`oBf8jWHx=_+JJkLOGNK*qK*4srOX$>RIjE|VjCK`nIBG@BclZw z2GB@oOBdI;UNACc-``OAOiM#cQx&|BD+Pdlb|3X4z?I0LXez| z@(E3gqpGqK24#w8DmzpG(eMMVGdl0JB?fYQ0qD`k6>|~ARo$YRkeO*zT#KN9l)ZRG zPwIY9^a4OeTM#$e(?T`$Utxp`CpF9H&MF{N^}fDyaCP(o8K!WgS_tfyMIVm`?VC|t zj1g#jiHi(DbSWa^1h4|Pk`;;GzP{-4yLIap+X+Ae7t##`paTL^h69655s4v&a$W3F z86R6BDf^~feit@2PGvI3yL;BGZO`V59IayDJB_KxL^ETNFJkN%7tKYIs@VxHl9kc* b9l-wqjSfwG7B2eD00000NkvXXu0mjfP7C9k From cd5be64891c3539f129e2e209e3ad6f5d5fe7c76 Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Sat, 13 Apr 2024 11:11:13 +0530 Subject: [PATCH 2/3] Update new_yorker.recipe fix cover --- recipes/new_yorker.recipe | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/recipes/new_yorker.recipe b/recipes/new_yorker.recipe index 352e7f26c8..8629c79f2b 100644 --- a/recipes/new_yorker.recipe +++ b/recipes/new_yorker.recipe @@ -80,27 +80,21 @@ class NewYorker(BasicNewsRecipe): # return buf.getvalue() def parse_index(self): - # Get cover - cover_soup = self.index_to_soup('https://www.newyorker.com/archive') - cover_img = cover_soup.find( - attrs={'class': lambda x: x and 'MagazineSection__cover___' in x}) - if cover_img is not None: - cover_img = cover_img.find('img') - if cover_img is not None: - self.cover_url = cover_img.get('src') - try: - # the src original resolution w_280 was too low, replace w_280 with w_560 - cover_url_width_index = self.cover_url.find("w_") - old_width = self.cover_url[cover_url_width_index:cover_url_width_index+5] - self.cover_url = self.cover_url.replace(old_width, "w_560") - except Exception: - self.log('Failed enlarging cover img, using the original one') - self.log('Found cover:', self.cover_url) - - # Get content - soup = self.index_to_soup( 'https://www.newyorker.com/magazine?intcid=magazine') + cover_img = soup.find('picture', + attrs={'class': lambda x: x and 'asset-embed__responsive-asset' in x}) + if cover_img is not None: + self.cover_url = cover_img.img['src'] + self.log('Found cover:', self.cover_url) + try: + # the src original resolution w_280 was too low, replace w_280 with w_560 + cover_url_width_index = self.cover_url.find("w_") + old_width = self.cover_url[cover_url_width_index:cover_url_width_index+5] + self.cover_url = self.cover_url.replace(old_width, "w_640") + except Exception: + self.log('Failed enlarging cover img, using the original one') + feeds_dict = defaultdict(list) for section in soup.findAll('section', attrs={'class': lambda x: x and 'SummaryRiverSection-' in x}): From fae7481849fa93c3e70e38e63143dab008fd32c5 Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Sat, 13 Apr 2024 11:14:15 +0530 Subject: [PATCH 3/3] Update Harpers Magazine add cover --- recipes/harpers.recipe | 5 +++++ recipes/harpers_full.recipe | 3 ++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/recipes/harpers.recipe b/recipes/harpers.recipe index 059f4b5e08..6f0d8497d5 100644 --- a/recipes/harpers.recipe +++ b/recipes/harpers.recipe @@ -79,5 +79,10 @@ class Harpers(BasicNewsRecipe): .index-statement .index-tooltip { font-size: small; } """ + def get_cover_url(self): + issues_soup = self.index_to_soup("https://harpers.org/issues/") + curr_issue_a_ele = issues_soup.select_one("div.issue-card a") + if curr_issue_a_ele.find("img"): + return curr_issue_a_ele.img["src"] feeds = [(u"Harper's Magazine", u'https://harpers.org/feed/')] diff --git a/recipes/harpers_full.recipe b/recipes/harpers_full.recipe index 159419d623..bf8f8dae79 100644 --- a/recipes/harpers_full.recipe +++ b/recipes/harpers_full.recipe @@ -131,7 +131,8 @@ class Harpers_full(BasicNewsRecipe): if not _issue_url: issues_soup = self.index_to_soup("https://harpers.org/issues/") curr_issue_a_ele = issues_soup.select_one("div.issue-card a") - curr_issue_url = urljoin(self.base_url, curr_issue_a_ele["href"]) + if curr_issue_a_ele.find("img"): + self.cover_url = curr_issue_a_ele.img["src"] else: curr_issue_url = _issue_url