From 3fe8bfd89a8fbe8c961e309fc661830d579e957b Mon Sep 17 00:00:00 2001
From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com>
Date: Sat, 27 Jul 2024 13:19:55 +0530
Subject: [PATCH 1/2] update sportstar

---
 recipes/open_magazine.recipe | 26 +++++++++++++++++++++-----
 recipes/reuters.recipe       |  3 ++-
 recipes/sportstar.recipe     | 22 ++++++++++++++++++----
 3 files changed, 41 insertions(+), 10 deletions(-)

diff --git a/recipes/open_magazine.recipe b/recipes/open_magazine.recipe
index be8c04930b..4d7aaa2952 100644
--- a/recipes/open_magazine.recipe
+++ b/recipes/open_magazine.recipe
@@ -20,12 +20,28 @@ class OpenMagazine(BasicNewsRecipe):
         'blockquote{color:#404040;}'
         '.about-author{font-size:small;}'
 
+    recipe_specific_options = {
+        'days': {
+            'short': 'Oldest article to download from this news source. In days ',
+            'long': 'For example, 0.5, gives you articles from the past 12 hours',
+            'default': str(oldest_article)
+        }
+    }
+
+    def __init__(self, *args, **kwargs):
+        BasicNewsRecipe.__init__(self, *args, **kwargs)
+        d = self.recipe_specific_options.get('days')
+        if d and isinstance(d, str):
+            self.oldest_article = float(d)
+
     def get_cover_url(self):
-        soup = self.index_to_soup('https://openthemagazine.com/')
-        tag = soup.find(attrs={'class': 'magazine-item mr-1'})
-        if tag:
-            self.cover_url = tag.find('img')['src']
-        return getattr(self, 'cover_url', None)
+        d = self.recipe_specific_options.get('days')
+        if not (d and isinstance(d, str)):
+            soup = self.index_to_soup('https://openthemagazine.com/')
+            tag = soup.find(attrs={'class': 'magazine-item mr-1'})
+            if tag:
+                self.cover_url = tag.find('img')['src']
+        return getattr(self, 'cover_url', None)
 
     keep_only_tags = [
         classes('post-data post-thumb post-meta post-excerp'),
diff --git a/recipes/reuters.recipe b/recipes/reuters.recipe
index 7fdbbbb997..a9abcc5416 100644
--- a/recipes/reuters.recipe
+++ b/recipes/reuters.recipe
@@ -20,7 +20,8 @@ class Reuters(BasicNewsRecipe):
         'reaching billions of people worldwide every day. Reuters provides business, financial, national and international '
         'news to professionals via desktop terminals, the world’s media organizations, industry events and directly to consumers.'
     )
-    masthead_url = 'https://www.reutersprofessional.com/wp-content/uploads/2024/03/primary-logo.svg'
+    masthead_url = 'https://www.reutersagency.com/wp-content/uploads/2024/06/reuters-logo.png'
+    cover_url = 'https://yt3.googleusercontent.com/ytc/AIdro_mk43b9eQwN15ZBDyMPDaElxvw4V-oUS9XDUvVnYB3gA9yA=s1024'
     language = 'en'
     encoding = 'utf-8'
     oldest_article = 1.2  # days
diff --git a/recipes/sportstar.recipe b/recipes/sportstar.recipe
index bb7c49091e..90ae0cd0ed 100644
--- a/recipes/sportstar.recipe
+++ b/recipes/sportstar.recipe
@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+# vim:fileencoding=utf-8
 from collections import OrderedDict
 
 from calibre.web.feeds.news import BasicNewsRecipe, classes
@@ -25,6 +27,13 @@ class Sportstar(BasicNewsRecipe):
         .author, .publish-time {font-size:small;}
     '''
 
+    recipe_specific_options = {
+        'issue': {
+            'short': 'Enter the Issue Number you want to download\n(Volume-Issue format)',
+            'long': 'For example, 47-16'
+        }
+    }
+
     keep_only_tags = [
         dict(name='h1', attrs={'class':'title'}),
         dict(name='h2', attrs={'class':'sub-title'}),
@@ -39,10 +48,15 @@ class Sportstar(BasicNewsRecipe):
     ]
 
     def parse_index(self):
-        soup = self.index_to_soup('https://sportstar.thehindu.com/magazine/')
-        url = soup.find('a', href=lambda x: x and x.startswith('https://sportstar.thehindu.com/magazine/issue/'))['href']
-        self.log('Downloading Issue: ', url)
-        soup = self.index_to_soup(url)
+        d = self.recipe_specific_options.get('issue')
+        if d and isinstance(d, str):
+            issue_url = 'https://sportstar.thehindu.com/magazine/issue/vol' + d
+        else:
+            soup = self.index_to_soup('https://sportstar.thehindu.com/magazine/')
+            issue_url = soup.find('a', href=lambda x: x and x.startswith('https://sportstar.thehindu.com/magazine/issue/'))['href']
+        self.log('Downloading Issue: ', issue_url)
+
+        soup = self.index_to_soup(issue_url)
 
         feeds = OrderedDict()
 

From 2025f05a1bcb2044d4cb411ef033d581fed3ee6e Mon Sep 17 00:00:00 2001
From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com>
Date: Sat, 27 Jul 2024 13:21:43 +0530
Subject: [PATCH 2/2] Update hbr.recipe

---
 recipes/hbr.recipe | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/recipes/hbr.recipe b/recipes/hbr.recipe
index 8f3081c48b..e3a441d814 100644
--- a/recipes/hbr.recipe
+++ b/recipes/hbr.recipe
@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+# vim:fileencoding=utf-8
 import json
 import re
 from collections import OrderedDict
@@ -7,8 +9,6 @@ from calibre import browser, random_user_agent
 from calibre.web.feeds.news import BasicNewsRecipe, classes
 from mechanize import Request
 
-_issue_url = ""  # custom issue url
-
 
 class HBR(BasicNewsRecipe):
     title = "Harvard Business Review"
@@ -129,15 +129,23 @@ class HBR(BasicNewsRecipe):
         content_ele.append(new_soup.body)
         return str(soup)
 
+    recipe_specific_options = {
+        'issue': {
+            'short': 'Enter the Issue Number you want to download ',
+            'long': 'For example, 2403'
+        }
+    }
+
     def parse_index(self):
-        if not _issue_url:
+        d = self.recipe_specific_options.get('issue')
+        if not (d and isinstance(d, str)):
             soup = self.index_to_soup(f"{self.base_url}/magazine")
             a = soup.find("a", href=lambda x: x and x.startswith("/archive-toc/"))
             cov_url = a.find("img", attrs={"src": True})["src"]
             self.cover_url = urljoin(self.base_url, cov_url)
             issue_url = urljoin(self.base_url, a["href"])
         else:
-            issue_url = _issue_url
+            issue_url = 'https://hbr.org/archive-toc/BR' + d
             mobj = re.search(r"archive-toc/(?P<issue>(BR)?\d+)\b", issue_url)
             if mobj:
                 self.cover_url = f'https://hbr.org/resources/images/covers/{mobj.group("issue")}_500.png'