From 15dd52d0f1a75533839f4e9443679a3107dc4942 Mon Sep 17 00:00:00 2001
From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com>
Date: Wed, 28 Feb 2024 09:53:44 +0530
Subject: [PATCH 1/3] Update the_week_magazine_free.recipe
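
The feeds in this recipe are Google News RSS queries, so each entry links to a
news.google.com redirect page rather than to theweek.com itself.
get_obfuscated_article() opens that page, treats its first anchor as the real
article URL, skips video sections, stores the link in web_url, and downloads
the article HTML directly; populate_article_metadata() later rewrites
article.url to that resolved link. A rough standalone sketch of the resolution
step follows: urllib and a regex stand in for calibre's browser and soup, and
the assumption that the first anchor on the redirect page points at the
article is taken from the recipe itself.

# Illustration only: resolve a Google News RSS item link the way
# get_obfuscated_article() does with soup.a['href'].
import re
from urllib.request import Request, urlopen

SKIP_SECTIONS = ('/video/', '/videos/', '/multimedia/')

def resolve_google_news_link(url):
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    html = urlopen(req).read().decode('utf-8', 'replace')
    m = re.search(r'<a[^>]+href="([^"]+)"', html)   # first anchor on the page
    if not m:
        return None
    link = m.group(1)
    if any(x in link for x in SKIP_SECTIONS):       # same skip list as the recipe
        return None
    return link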
---
 recipes/the_week_magazine_free.recipe | 105 +++++++++++++++++++++-----
 1 file changed, 86 insertions(+), 19 deletions(-)

diff --git a/recipes/the_week_magazine_free.recipe b/recipes/the_week_magazine_free.recipe
index 923cc239c5..c424635e93 100644
--- a/recipes/the_week_magazine_free.recipe
+++ b/recipes/the_week_magazine_free.recipe
@@ -1,27 +1,94 @@
-__license__ = 'GPL v3'
-__copyright__ = '2010, JOlo'
 '''
 www.theweek.com
 '''
-
-from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.web.feeds.news import BasicNewsRecipe, classes
+from urllib.parse import quote
 
 
 class TheWeek(BasicNewsRecipe):
-    title = 'TheWeek.com'
-    __author__ = 'Jim Olo'
-    description = "The best of the US and international media. Daily coverage of commentary and analysis of the day's events, as well as arts, entertainment, people and gossip, and political cartoons." # noqa
-    publisher = 'The Week Publications, Inc.'
-    masthead_url = 'http://test.theweek.com/images/logo_theweek.gif'
-    cover_url = masthead_url
-    category = 'news, politics, USA'
-    oldest_article = 7
-    max_articles_per_feed = 100
-    no_stylesheets = True
+    title = 'The Week'
+    __author__ = 'unkn0wn'
+    description = (
+        'The Week is for readers who want to know what\'s going on in the world, without having to read '
+        'several daily newspapers or get wrapped up in the endless news cycle. For every important story, '
+        'our editors carefully select commentary from all sides of the debate and artfully stitch them together '
+        'into one concise read. By showing you every perspective, we enable you to form your own opinion.'
+    )
+    language = 'en_US'
     encoding = 'utf-8'
-    use_embedded_content = False
-    language = 'en'
-    auto_cleanup = True
-    feeds = [
-        (u'Latest articles', u'http://theweek.com/rss.xml'),
+    no_stylesheets = True
+    remove_javascript = True
+    remove_attributes = ['width', 'height', 'style']
+
+    ignore_duplicate_articles = {'title', 'url'}
+    remove_empty_feeds = True
+    resolve_internal_links = True
+    simultaneous_downloads = 1
+    web_url = ''
+
+    extra_css = '''
+        img {display:block; margin:0 auto;}
+        .caption__text--hero, .credit { font-size:small; text-align:center; }
+        .header__strapline, em, i { color:#202020; }
+        .article-type__breadcrumb { color:grey; }
+        .author-byline__author-text {font-size:small; }
+    '''
+
+    def get_cover_url(self):
+        import json
+        url = 'https://usmagazine.theweek.com/timelines.json'
+        data = json.loads(self.index_to_soup(url, raw=True))
+        for x in data['timelines'][:5]:
+            if '-cover-' in x['image']:
+                return 'https://usmagazine.theweek.com' + x['image'][1:]
+
+    articles_are_obfuscated = True
+
+    def get_obfuscated_article(self, url):
+        br = self.get_browser()
+        soup = self.index_to_soup(url)
+        link = soup.a['href']
+        skip_sections = [ # add sections you want to skip
+            '/video/', '/videos/', '/multimedia/',
+        ]
+        if any(x in link for x in skip_sections):
+            self.abort_article('skipping video links ' + link)
+        self.web_url = link
+        html = br.open(link).read()
+        return ({ 'data': html, 'url': link })
+
+    keep_only_tags = [
+        classes('article-type__breadcrumb header__title header__strapline image image--hero author-byline__author-text article__body')
     ]
+
+    remove_tags = [
+        dict(name='aside'),
+        classes(
+            'blueconic-article__wrapper ad-unit van_vid_carousel tag-links'
+        )
+    ]
+
+    def preprocess_html(self, soup):
+        for img in soup.findAll('img', attrs={'data-pin-media':True}):
+            img['src'] = img['data-pin-media'].replace('.jpg', '-768-80.jpg')
+        return soup
+
+    feeds = []
+    when = '168' # hours (7 days)
+    index = 'https://theweek.com/'
+    sections = [
+        'politics', 'news', 'cartoons', 'tech', 'science', 'health',
+        'culture-life', 'business', 'travel', 'arts-life', 'history'
+    ]
+    for sec in sections:
+        a = 'https://news.google.com/rss/search?q=when:{}h+allinurl:{}&hl=en-IN&gl=US&ceid=US:en'
+        feeds.append((sec.capitalize(), a.format(when, quote(index + sec, safe=''))))
+    feeds.append(('Others', a.format(when, quote(index, safe=''))))
+
+    def populate_article_metadata(self, article, soup, first):
+        article.title = article.title.replace(' - The Week', '')
+        desc = soup.find(**classes('header__strapline'))
+        if desc:
+            article.summary = self.tag_to_string(desc)
+            article.text_summary = article.summary
+        article.url = self.web_url

From 215513510fc15fa24c87a66fdfa31e8985febab6 Mon Sep 17 00:00:00 2001
From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com>
Date: Wed, 28 Feb 2024 10:08:58 +0530
Subject: [PATCH 2/3] ...
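
This change derives the Google News search window from calibre's standard
oldest_article setting (when = oldest_article*24 hours) instead of a
hard-coded hour count, and folds the section path into a single URL-encoded
allinurl: value. For illustration, one Moneycontrol feed URL comes out roughly
as below; the values are examples, not settings added by the patch.

# Example only: assembling one Google News RSS query the way the recipe does.
from urllib.parse import quote

oldest_article = 1             # days, as set on the recipe
when = oldest_article * 24     # Google News expects when:<hours>h
index = 'https://www.moneycontrol.com/'
sec = 'markets'                # one entry from business_sections
url = ('https://news.google.com/rss/search?q=when:{}h+allinurl:{}'
       '&hl=en-IN&gl=IN&ceid=IN:en').format(
    when, quote(index + 'news/business/' + sec, safe=''))
print(url)
# https://news.google.com/rss/search?q=when:24h+allinurl:https%3A%2F%2Fwww.moneycontrol.com%2Fnews%2Fbusiness%2Fmarkets&hl=en-IN&gl=IN&ceid=IN:en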
---
 recipes/moneycontrol.recipe           | 13 +++++++------
 recipes/the_week_magazine_free.recipe |  3 ++-
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/recipes/moneycontrol.recipe b/recipes/moneycontrol.recipe
index 2563b5cf50..4fcc5c5760 100644
--- a/recipes/moneycontrol.recipe
+++ b/recipes/moneycontrol.recipe
@@ -16,6 +16,7 @@ class MoneyControlRecipe(BasicNewsRecipe):
     ignore_duplicate_articles = {'title', 'url'}
     remove_empty_feeds = True
     resolve_internal_links = True
+    oldest_article = 1 # days
 
     extra_css = '''
         img {display:block; margin:0 auto;}
@@ -65,7 +66,7 @@ class MoneyControlRecipe(BasicNewsRecipe):
 
     feeds = []
 
-    when = 27 # hours
+    when = oldest_article*24
     index = 'https://www.moneycontrol.com/'
 
     business_sections = [
@@ -73,12 +74,12 @@ class MoneyControlRecipe(BasicNewsRecipe):
         'personal-finance', 'commodities', 'trade', 'companies'
     ]
 
-    a = 'https://news.google.com/rss/search?q=when:{}h+allinurl:{}{}&hl=en-IN&gl=IN&ceid=IN:en'
+    a = 'https://news.google.com/rss/search?q=when:{}h+allinurl:{}&hl=en-IN&gl=IN&ceid=IN:en'
 
     for sec in business_sections:
         allinurl_a = index + 'news/business'
-        feeds.append((sec.capitalize(), a.format(when, quote(allinurl_a, safe=''), '%2F' + sec + '%2F')))
-    feeds.append(('Business' , a.format(when, quote(allinurl_a, safe=''), '')))
+        feeds.append((sec.capitalize(), a.format(when, quote(allinurl_a + '/' + sec, safe=''))))
+    feeds.append(('Business', a.format(when, quote(allinurl_a, safe=''))))
 
     news_sections = [
         'india', 'world', 'opinion', 'politics', 'technology', 'trends', 'lifestyle'
@@ -86,8 +87,8 @@ class MoneyControlRecipe(BasicNewsRecipe):
 
     for sec in news_sections:
         allinurl_b = index + 'news'
-        feeds.append((sec.capitalize(), a.format(when, quote(allinurl_b, safe=''), '%2F' + sec + '%2F')))
-    feeds.append(('News', a.format(when, quote(allinurl_b, safe=''), '')))
+        feeds.append((sec.capitalize(), a.format(when, quote(allinurl_b + '/' + sec, safe=''))))
+    feeds.append(('News', a.format(when, quote(allinurl_b, safe=''))))
 
     feeds.append(
         ('Others', 'https://news.google.com/rss/search?q=when:{}h+allinurl:{}&hl=en-IN&gl=IN&ceid=IN:en'.format(when, quote(index, safe='')))

diff --git a/recipes/the_week_magazine_free.recipe b/recipes/the_week_magazine_free.recipe
index c424635e93..db4a44acaa 100644
--- a/recipes/the_week_magazine_free.recipe
+++ b/recipes/the_week_magazine_free.recipe
@@ -24,6 +24,7 @@ class TheWeek(BasicNewsRecipe):
     remove_empty_feeds = True
     resolve_internal_links = True
     simultaneous_downloads = 1
+    oldest_article = 7 # days
     web_url = ''
 
     extra_css = '''
@@ -74,7 +75,7 @@ class TheWeek(BasicNewsRecipe):
         return soup
 
     feeds = []
-    when = '168' # hours (7 days)
+    when = oldest_article*24
     index = 'https://theweek.com/'
     sections = [
         'politics', 'news', 'cartoons', 'tech', 'science', 'health',

From 745409c1967db16028950d45a32d3992d5700c7c Mon Sep 17 00:00:00 2001
From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com>
Date: Wed, 28 Feb 2024 10:34:03 +0530
Subject: [PATCH 3/3] The Week UK
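
The new UK recipe is a near copy of the US one: the language code and the
magazine cover endpoint change (ukmagazine.theweek.com instead of
usmagazine.theweek.com), while the feeds, tag filtering and image handling are
shared. As a rough illustration, the cover lookup boils down to the sketch
below; the timelines.json layout and the leading character stripped from the
stored image path are taken on trust from the recipe code.

# Sketch only: pick the newest '-cover-' image from the UK magazine timeline,
# mirroring get_cover_url(). Endpoint and field names follow the recipe.
import json
from urllib.request import urlopen

def uk_cover_url():
    base = 'https://ukmagazine.theweek.com'
    data = json.loads(urlopen(base + '/timelines.json').read())
    for x in data['timelines'][:5]:        # only the most recent entries
        if '-cover-' in x['image']:
            return base + x['image'][1:]   # drop the path's first character, as the recipe does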
---
 recipes/icons/the_week_magazine_free.png | Bin 157 -> 286 bytes
 recipes/icons/the_week_uk.png            | Bin 0 -> 286 bytes
 recipes/the_week_uk.recipe               | 95 +++++++++++++++++++++++
 3 files changed, 95 insertions(+)
 create mode 100644 recipes/icons/the_week_uk.png
 create mode 100644 recipes/the_week_uk.recipe

diff --git a/recipes/icons/the_week_magazine_free.png b/recipes/icons/the_week_magazine_free.png
index 4fc029a27ff442c2159df3fa8d32eee043a2b876..f8d3c9013f4c04491ea6bd067e00f8c5dfe8c75c 100644
GIT binary patch
[base85-encoded PNG icon data (157 -> 286 bytes) omitted]

diff --git a/recipes/icons/the_week_uk.png b/recipes/icons/the_week_uk.png
new file mode 100644
index 0000000000000000000000000000000000000000..f8d3c9013f4c04491ea6bd067e00f8c5dfe8c75c
GIT binary patch
[base85-encoded PNG icon data (286 bytes, identical to the updated the_week_magazine_free.png) omitted]

diff --git a/recipes/the_week_uk.recipe b/recipes/the_week_uk.recipe
new file mode 100644
index 0000000000..b3a0cb58e9
--- /dev/null
+++ b/recipes/the_week_uk.recipe
@@ -0,0 +1,95 @@
+'''
+www.theweek.com
+'''
+from calibre.web.feeds.news import BasicNewsRecipe, classes
+from urllib.parse import quote
+
+
+class TheWeek(BasicNewsRecipe):
+    title = 'The Week'
+    __author__ = 'unkn0wn'
+    description = (
+        'The Week is for readers who want to know what\'s going on in the world, without having to read '
+        'several daily newspapers or get wrapped up in the endless news cycle. For every important story, '
+        'our editors carefully select commentary from all sides of the debate and artfully stitch them together '
+        'into one concise read. By showing you every perspective, we enable you to form your own opinion.'
+    )
+    language = 'en_GB'
+    encoding = 'utf-8'
+    no_stylesheets = True
+    remove_javascript = True
+    remove_attributes = ['width', 'height', 'style']
+
+    ignore_duplicate_articles = {'title', 'url'}
+    remove_empty_feeds = True
+    resolve_internal_links = True
+    simultaneous_downloads = 1
+    oldest_article = 7 # days
+    web_url = ''
+
+    extra_css = '''
+        img {display:block; margin:0 auto;}
+        .caption__text--hero, .credit { font-size:small; text-align:center; }
+        .header__strapline, em, i { color:#202020; }
+        .article-type__breadcrumb { color:grey; }
+        .author-byline__author-text {font-size:small; }
+    '''
+
+    def get_cover_url(self):
+        import json
+        url = 'https://ukmagazine.theweek.com/timelines.json'
+        data = json.loads(self.index_to_soup(url, raw=True))
+        for x in data['timelines'][:5]:
+            if '-cover-' in x['image']:
+                return 'https://ukmagazine.theweek.com' + x['image'][1:]
+
+    articles_are_obfuscated = True
+
+    def get_obfuscated_article(self, url):
+        br = self.get_browser()
+        soup = self.index_to_soup(url)
+        link = soup.a['href']
+        skip_sections = [ # add sections you want to skip
+            '/video/', '/videos/', '/multimedia/',
+        ]
+        if any(x in link for x in skip_sections):
+            self.abort_article('skipping video links ' + link)
+        self.web_url = link
+        html = br.open(link).read()
+        return ({ 'data': html, 'url': link })
+
+    keep_only_tags = [
+        classes('article-type__breadcrumb header__title header__strapline image image--hero author-byline__author-text article__body')
+    ]
+
+    remove_tags = [
+        dict(name='aside'),
+        classes(
+            'blueconic-article__wrapper ad-unit van_vid_carousel tag-links'
+        )
+    ]
+
+    def preprocess_html(self, soup):
+        for img in soup.findAll('img', attrs={'data-pin-media':True}):
+            img['src'] = img['data-pin-media'].replace('.jpg', '-768-80.jpg')
+        return soup
+
+    feeds = []
+    when = oldest_article*24
+    index = 'https://theweek.com/'
+    sections = [
+        'politics', 'news', 'cartoons', 'tech', 'science', 'health',
+        'culture-life', 'business', 'travel', 'arts-life', 'history'
+    ]
+    for sec in sections:
+        a = 'https://news.google.com/rss/search?q=when:{}h+allinurl:{}&hl=en-IN&gl=US&ceid=US:en'
+        feeds.append((sec.capitalize(), a.format(when, quote(index + sec, safe=''))))
+    feeds.append(('Others', a.format(when, quote(index, safe=''))))
+
+    def populate_article_metadata(self, article, soup, first):
+        article.title = article.title.replace(' - The Week', '')
+        desc = soup.find(**classes('header__strapline'))
+        if desc:
+            article.summary = self.tag_to_string(desc)
+            article.text_summary = article.summary
+        article.url = self.web_url
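
Both The Week recipes rely on preprocess_html() to swap each article image for
the larger rendition exposed through the data-pin-media attribute. Outside
calibre the same transformation looks roughly like this; BeautifulSoup stands
in for the soup object calibre hands to the recipe, and the '-768-80.jpg'
suffix is the rendition the recipe itself requests. The recipes can then be
exercised locally with calibre's usual "ebook-convert <name>.recipe .epub
--test -vv" workflow.

# Standalone sketch of the image upgrade done in preprocess_html().
from bs4 import BeautifulSoup

def upgrade_images(html):
    soup = BeautifulSoup(html, 'html.parser')
    for img in soup.find_all('img', attrs={'data-pin-media': True}):
        # prefer the pinned media URL, bumped to the 768px-wide rendition
        img['src'] = img['data-pin-media'].replace('.jpg', '-768-80.jpg')
    return str(soup)

print(upgrade_images('<img src="t.jpg" data-pin-media="https://example.com/pic.jpg">'))
# the img src now points at https://example.com/pic-768-80.jpg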