This commit is contained in:
Kovid Goyal 2024-07-27 13:25:16 +05:30
commit f3420c6b15
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
4 changed files with 53 additions and 14 deletions

View File

@ -1,3 +1,5 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
import json import json
import re import re
from collections import OrderedDict from collections import OrderedDict
@ -7,8 +9,6 @@ from calibre import browser, random_user_agent
from calibre.web.feeds.news import BasicNewsRecipe, classes from calibre.web.feeds.news import BasicNewsRecipe, classes
from mechanize import Request from mechanize import Request
_issue_url = "" # custom issue url
class HBR(BasicNewsRecipe): class HBR(BasicNewsRecipe):
title = "Harvard Business Review" title = "Harvard Business Review"
@ -129,15 +129,23 @@ class HBR(BasicNewsRecipe):
content_ele.append(new_soup.body) content_ele.append(new_soup.body)
return str(soup) return str(soup)
recipe_specific_options = {
'issue': {
'short': 'Enter the Issue Number you want to download ',
'long': 'For example, 2403'
}
}
def parse_index(self): def parse_index(self):
if not _issue_url: d = self.recipe_specific_options.get('issue')
if not (d and isinstance(d, str)):
soup = self.index_to_soup(f"{self.base_url}/magazine") soup = self.index_to_soup(f"{self.base_url}/magazine")
a = soup.find("a", href=lambda x: x and x.startswith("/archive-toc/")) a = soup.find("a", href=lambda x: x and x.startswith("/archive-toc/"))
cov_url = a.find("img", attrs={"src": True})["src"] cov_url = a.find("img", attrs={"src": True})["src"]
self.cover_url = urljoin(self.base_url, cov_url) self.cover_url = urljoin(self.base_url, cov_url)
issue_url = urljoin(self.base_url, a["href"]) issue_url = urljoin(self.base_url, a["href"])
else: else:
issue_url = _issue_url issue_url = 'https://hbr.org/archive-toc/BR' + d
mobj = re.search(r"archive-toc/(?P<issue>(BR)?\d+)\b", issue_url) mobj = re.search(r"archive-toc/(?P<issue>(BR)?\d+)\b", issue_url)
if mobj: if mobj:
self.cover_url = f'https://hbr.org/resources/images/covers/{mobj.group("issue")}_500.png' self.cover_url = f'https://hbr.org/resources/images/covers/{mobj.group("issue")}_500.png'

View File

@ -20,12 +20,28 @@ class OpenMagazine(BasicNewsRecipe):
'blockquote{color:#404040;}' 'blockquote{color:#404040;}'
'.about-author{font-size:small;}' '.about-author{font-size:small;}'
recipe_specific_options = {
'days': {
'short': 'Oldest article to download from this news source. In days ',
'long': 'For example, 0.5, gives you articles from the past 12 hours',
'default': str(oldest_article)
}
}
def __init__(self, *args, **kwargs):
BasicNewsRecipe.__init__(self, *args, **kwargs)
d = self.recipe_specific_options.get('days')
if d and isinstance(d, str):
self.oldest_article = float(d)
def get_cover_url(self): def get_cover_url(self):
soup = self.index_to_soup('https://openthemagazine.com/') d = self.recipe_specific_options.get('days')
tag = soup.find(attrs={'class': 'magazine-item mr-1'}) if not (d and isinstance(d, str)):
if tag: soup = self.index_to_soup('https://openthemagazine.com/')
self.cover_url = tag.find('img')['src'] tag = soup.find(attrs={'class': 'magazine-item mr-1'})
return getattr(self, 'cover_url', None) if tag:
self.cover_url = tag.find('img')['src']
return getattr(self, 'cover_url', None)
keep_only_tags = [ keep_only_tags = [
classes('post-data post-thumb post-meta post-excerp'), classes('post-data post-thumb post-meta post-excerp'),

View File

@ -20,7 +20,8 @@ class Reuters(BasicNewsRecipe):
'reaching billions of people worldwide every day. Reuters provides business, financial, national and international ' 'reaching billions of people worldwide every day. Reuters provides business, financial, national and international '
'news to professionals via desktop terminals, the worlds media organizations, industry events and directly to consumers.' 'news to professionals via desktop terminals, the worlds media organizations, industry events and directly to consumers.'
) )
masthead_url = 'https://www.reutersprofessional.com/wp-content/uploads/2024/03/primary-logo.svg' masthead_url = 'https://www.reutersagency.com/wp-content/uploads/2024/06/reuters-logo.png'
cover_url = 'https://yt3.googleusercontent.com/ytc/AIdro_mk43b9eQwN15ZBDyMPDaElxvw4V-oUS9XDUvVnYB3gA9yA=s1024'
language = 'en' language = 'en'
encoding = 'utf-8' encoding = 'utf-8'
oldest_article = 1.2 # days oldest_article = 1.2 # days

View File

@ -1,3 +1,5 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
from collections import OrderedDict from collections import OrderedDict
from calibre.web.feeds.news import BasicNewsRecipe, classes from calibre.web.feeds.news import BasicNewsRecipe, classes
@ -25,6 +27,13 @@ class Sportstar(BasicNewsRecipe):
.author, .publish-time {font-size:small;} .author, .publish-time {font-size:small;}
''' '''
recipe_specific_options = {
'issue': {
'short': 'Enter the Issue Number you want to download\n(Volume-Issue format)',
'long': 'For example, 47-16'
}
}
keep_only_tags = [ keep_only_tags = [
dict(name='h1', attrs={'class':'title'}), dict(name='h1', attrs={'class':'title'}),
dict(name='h2', attrs={'class':'sub-title'}), dict(name='h2', attrs={'class':'sub-title'}),
@ -39,10 +48,15 @@ class Sportstar(BasicNewsRecipe):
] ]
def parse_index(self): def parse_index(self):
soup = self.index_to_soup('https://sportstar.thehindu.com/magazine/') d = self.recipe_specific_options.get('issue')
url = soup.find('a', href=lambda x: x and x.startswith('https://sportstar.thehindu.com/magazine/issue/'))['href'] if d and isinstance(d, str):
self.log('Downloading Issue: ', url) issue_url = 'https://sportstar.thehindu.com/magazine/issue/vol' + d
soup = self.index_to_soup(url) else:
soup = self.index_to_soup('https://sportstar.thehindu.com/magazine/')
issue_url = soup.find('a', href=lambda x: x and x.startswith('https://sportstar.thehindu.com/magazine/issue/'))['href']
self.log('Downloading Issue: ', issue_url)
soup = self.index_to_soup(issue_url)
feeds = OrderedDict() feeds = OrderedDict()