Kovid Goyal 2024-07-23 17:38:04 +05:30
commit ab9cb22eac
6 changed files with 91 additions and 15 deletions


@@ -65,9 +65,20 @@ class MitTechnologyReview(BasicNewsRecipe):
         ),
     ]
 
+    recipe_specific_options = {
+        'issue_url': {
+            'short': 'The issue URL ',
+            'long': 'For example, https://www.technologyreview.com/magazines/the-education-issue/',
+            'default': 'http://www.technologyreview.com/magazine/'
+        }
+    }
+
     def parse_index(self):
         # for past editions, change the issue link below
         issue = 'http://www.technologyreview.com/magazine/'
+        d = self.recipe_specific_options.get('issue_url')
+        if d and isinstance(d, str):
+            issue = d
         soup = self.index_to_soup(issue)
         if script := soup.find('script', id='preload'):
             raw = script.contents[0]
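Note: this hunk is the template for the whole commit. Every recipe below reads a user-supplied value out of recipe_specific_options and applies it only when it is a non-empty string. A minimal standalone sketch of that guard (here opts stands in for self.recipe_specific_options; dict-like .get() access is the only calibre behaviour assumed):

    # 'opts' stands in for self.recipe_specific_options as calibre populates it.
    opts = {'issue_url': 'https://www.technologyreview.com/magazines/the-education-issue/'}

    issue = 'http://www.technologyreview.com/magazine/'  # default: current issue
    d = opts.get('issue_url')
    if d and isinstance(d, str):  # ignore empty or non-string values
        issue = d
    print(issue)  # the URL parse_index() will fetch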


@@ -121,9 +121,19 @@ class NewYorkTimes(BasicNewsRecipe):
             tf.write(self.get_nyt_page(url))
             return tf.name
 
+    recipe_specific_options = {
+        'date': {
+            'short': 'The date of the edition to download (YYYY/MM/DD format)',
+            'long': 'For example, 2024/07/16'
+        }
+    }
+
     def read_todays_paper(self):
         INDEX = 'https://www.nytimes.com/section/todayspaper'
         # INDEX = 'file:///t/raw.html'
+        d = self.recipe_specific_options.get('date')
+        if d and isinstance(d, str):
+            INDEX = 'https://www.nytimes.com/issue/todayspaper/' + d + '/todays-new-york-times'
         return self.index_to_soup(self.get_nyt_page(INDEX, skip_wayback=True))
 
     def read_nyt_metadata(self):
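The date value is interpolated directly into the issue URL. If stricter input checking were ever wanted, a strptime round-trip would reject malformed dates before any network request; this is a hypothetical addition for illustration, not something the commit does:

    # Hypothetical validation of the 'date' option (YYYY/MM/DD).
    from datetime import datetime

    INDEX = 'https://www.nytimes.com/section/todayspaper'
    d = '2024/07/16'  # example value of recipe_specific_options.get('date')
    if d and isinstance(d, str):
        datetime.strptime(d, '%Y/%m/%d')  # raises ValueError on a malformed date
        INDEX = 'https://www.nytimes.com/issue/todayspaper/' + d + '/todays-new-york-times'
    print(INDEX)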


@@ -24,6 +24,20 @@ class AdvancedUserRecipe1277129332(BasicNewsRecipe):
     conversion_options = {'linearize_tables': True}
     masthead_url = 'http://www.people.com.cn/img/2010wb/images/logo.gif'
 
+    recipe_specific_options = {
+        'days': {
+            'short': 'Oldest article to download from this news source. In days ',
+            'long': 'For example, 0.5, gives you articles from the past 12 hours',
+            'default': str(oldest_article)
+        }
+    }
+
+    def __init__(self, *args, **kwargs):
+        BasicNewsRecipe.__init__(self, *args, **kwargs)
+        d = self.recipe_specific_options.get('days')
+        if d and isinstance(d, str):
+            self.oldest_article = float(d)
+
     feeds = [
         (u'时政', u'http://www.people.com.cn/rss/politics.xml'),
         (u'国际', u'http://www.people.com.cn/rss/world.xml'),
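This RSS-based recipe has no parse_index to hook into, so the override happens at construction time instead, after the base class has initialised. Reduced to its essentials:

    # The 'days' override in isolation. float() accepts fractional
    # days, so '0.5' means articles from the past 12 hours.
    oldest_article = 2    # class-level default (value assumed for this sketch)
    d = '0.5'             # example value of recipe_specific_options.get('days')
    if d and isinstance(d, str):
        oldest_article = float(d)
    print(oldest_article)  # 0.5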


@ -1,3 +1,5 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
from collections import OrderedDict from collections import OrderedDict
from calibre import browser from calibre import browser
@@ -31,19 +33,29 @@ class PhilosophyNow(BasicNewsRecipe):
         .articleImageCaption { font-size:small; text-align:center; }
         em, blockquote { color:#202020; }
     '''
 
+    recipe_specific_options = {
+        'issue': {
+            'short': 'Enter the Issue Number you want to download ',
+            'long': 'For example, 136'
+        }
+    }
+
     def parse_index(self):
         soup = self.index_to_soup('https://philosophynow.org/')
         div = soup.find('div', attrs={'id': 'aside_issue_cover'})
-        url = div.find('a', href=True)['href']
-        issue = div.find('div', attrs={'id':'aside_issue_text'})
-        if issue:
-            self.log('Downloading issue:', self.tag_to_string(issue).strip())
-            self.timefmt = ' [' + self.tag_to_string(issue.find(attrs={'id':'aside_issue_date'})) + ']'
-            self.title = 'Philosophy Now ' + self.tag_to_string(issue.find(attrs={'id':'aside_issue_number'}))
+        url = 'https://philosophynow.org' + div.find('a', href=True)['href']
+
+        d = self.recipe_specific_options.get('issue')
+        if d and isinstance(d, str):
+            url = 'https://philosophynow.org/issues/' + d
+
+        soup = self.index_to_soup(url)
+        div = soup.find('div', attrs={'id': 'issue_contents_cover_div'})
         cov_url = div.find('img', src=True)['src']
         self.cover_url = 'https://philosophynow.org' + cov_url
-        soup = self.index_to_soup('https://philosophynow.org' + url)
+        self.timefmt = ' [' + self.tag_to_string(soup.find('h1')) + ']'
 
         feeds = OrderedDict()
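Besides the new option, two behaviours change in this hunk: the scraped issue URL is made absolute immediately, and timefmt is now taken from the issue page's <h1> rather than the front-page sidebar, which only describes the current issue and would mislabel a back number. The URL selection reduces to:

    # Sketch: choosing the Philosophy Now issue URL.
    front_page_href = '/issues/163'  # example href scraped from the cover link
    url = 'https://philosophynow.org' + front_page_href

    d = '136'  # example value of recipe_specific_options.get('issue')
    if d and isinstance(d, str):
        url = 'https://philosophynow.org/issues/' + d
    print(url)  # https://philosophynow.org/issues/136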


@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+# vim:fileencoding=utf-8
 '''
 https://sciencex.com/
 '''
@@ -26,6 +28,20 @@ class scix(BasicNewsRecipe):
         .article__info, .article-byline, .article-main__more, .d-print-block {font-size:small; color:#404040;}
     '''
 
+    recipe_specific_options = {
+        'days': {
+            'short': 'Oldest article to download from this news source. In days ',
+            'long': 'For example, 0.5, gives you articles from the past 12 hours',
+            'default': str(oldest_article)
+        }
+    }
+
+    def __init__(self, *args, **kwargs):
+        BasicNewsRecipe.__init__(self, *args, **kwargs)
+        d = self.recipe_specific_options.get('days')
+        if d and isinstance(d, str):
+            self.oldest_article = float(d)
+
     resolve_internal_links = True
     remove_empty_feeds = True
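This is the same 'days' option and __init__ override as in the people.com.cn recipe above. A possible hardening, not in the commit, would be to keep the default when the value does not parse instead of raising:

    # Hypothetical defensive variant of the float conversion.
    oldest_article = 7       # this recipe's default (value assumed)
    d = 'not-a-number'       # example bad value for the 'days' option
    if d and isinstance(d, str):
        try:
            oldest_article = float(d)
        except ValueError:
            pass  # keep the default rather than abort the download
    print(oldest_article)  # 7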


@@ -59,16 +59,29 @@ class ScientificAmerican(BasicNewsRecipe):
         br.submit()
         return br
 
+    recipe_specific_options = {
+        'issue_url': {
+            'short': 'The issue URL ',
+            'long': (
+                'For example, https://www.scientificamerican.com/issue/sa/2024/07-01/'
+                '\nYou can also download special-editions, physics, health, mind magazines by pasting the URL here.'
+            )
+        }
+    }
+
     def parse_index(self):
         # Get the cover, date and issue URL
-        fp_soup = self.index_to_soup("https://www.scientificamerican.com")
-        curr_issue_link = fp_soup.find(**prefixed_classes('latest_issue_links-'))
-        if not curr_issue_link:
-            self.abort_recipe_processing("Unable to find issue link")
-        issue_url = 'https://www.scientificamerican.com' + curr_issue_link.a["href"]
-        # for past editions https://www.scientificamerican.com/archive/issues/
-        # issue_url = 'https://www.scientificamerican.com/issue/sa/2024/01-01/'
-        soup = self.index_to_soup(issue_url)
+        d = self.recipe_specific_options.get('issue_url')
+        if d and isinstance(d, str):
+            issue = d
+        else:
+            fp_soup = self.index_to_soup("https://www.scientificamerican.com")
+            curr_issue_link = fp_soup.find(**prefixed_classes('latest_issue_links-'))
+            if not curr_issue_link:
+                self.abort_recipe_processing("Unable to find issue link")
+            issue = 'https://www.scientificamerican.com' + curr_issue_link.a["href"]
+        soup = self.index_to_soup(issue)
         script = soup.find("script", id="__DATA__")
         if not script:
             self.abort_recipe_processing("Unable to find script")
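This is the most involved hunk: the old hard-coded front-page scrape becomes the else branch, so a user-supplied URL skips the front-page request entirely (and with it the commented-out past-editions workaround). The control flow, with the network and scraping stubbed out:

    # Sketch of the option-first, scrape-as-fallback flow.
    def resolve_issue_url(option_value, scrape_front_page):
        # option_value: recipe_specific_options.get('issue_url')
        # scrape_front_page: callable standing in for the front-page soup lookup
        if option_value and isinstance(option_value, str):
            return option_value
        return scrape_front_page()

    print(resolve_issue_url(
        'https://www.scientificamerican.com/issue/sa/2024/07-01/',
        lambda: 'https://www.scientificamerican.com/issue/sa/2024/08-01/',
    ))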