Updates recipe_specific_options

unkn0w7n 2024-07-22 11:08:00 +05:30
parent bdbfdf0f43
commit 16a1f2890b
8 changed files with 141 additions and 26 deletions

View File

@@ -47,17 +47,30 @@ class IndiaToday(BasicNewsRecipe):
def preprocess_raw_html(self, raw_html, url):
return raw_html.replace('—', '--')
recipe_specific_options = {
'date': {
'short': 'The date of the edition to download (DD-MM-YYYY format)',
'long': 'For example, 22-07-2024'
}
}
def get_cover_url(self):
soup = self.index_to_soup(
'https://www.readwhere.com/magazine/the-india-today-group/India-Today/1154'
)
for citem in soup.findAll(
'meta', content=lambda s: s and s.endswith('/magazine/300/new')
):
return citem['content'].replace('300', '600')
d = self.recipe_specific_options.get('date')
if not (d and isinstance(d, str)):
soup = self.index_to_soup(
'https://www.readwhere.com/magazine/the-india-today-group/India-Today/1154'
)
for citem in soup.findAll(
'meta', content=lambda s: s and s.endswith('/magazine/300/new')
):
return citem['content'].replace('300', '600')
def parse_index(self):
soup = self.index_to_soup('https://www.indiatoday.in/magazine')
issue = 'https://www.indiatoday.in/magazine'
d = self.recipe_specific_options.get('date')
if d and isinstance(d, str):
issue = issue + '/' + d
soup = self.index_to_soup(issue)
section = None
sections = {}
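For readers unfamiliar with the mechanism: judging by the checks added here, calibre hands any user-supplied value to the recipe through self.recipe_specific_options as a plain string, and the isinstance(d, str) test is what distinguishes "no value given" from a chosen date. The India Today change (and the Spectator and The Week changes further down) then appends that string to the magazine URL and skips the cover lookup, presumably because the scraped cover only matches the latest issue. A minimal standalone sketch of the URL logic, with build_issue_url as a hypothetical helper name rather than anything in the recipe itself:

def build_issue_url(options):
    # distilled from the parse_index change above
    issue = 'https://www.indiatoday.in/magazine'
    d = options.get('date')
    if d and isinstance(d, str):  # user-supplied DD-MM-YYYY string
        issue = issue + '/' + d
    return issue

print(build_issue_url({}))                      # .../magazine  (latest issue)
print(build_issue_url({'date': '22-07-2024'}))  # .../magazine/22-07-2024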

View File

@@ -81,7 +81,7 @@ class Liberation(BasicNewsRecipe):
'les mutations des sociétés et des cultures.'
)
language = 'fr'
oldest_article = 1
oldest_article = 1.15
remove_empty_feeds = True
articles_are_obfuscated = True
ignore_duplicate_articles = {'title', 'url'}
@@ -94,6 +94,20 @@ class Liberation(BasicNewsRecipe):
blockquote { color:#202020; }
'''
recipe_specific_options = {
'days': {
'short': 'Oldest article to download from this news source. In days ',
'long': 'For example, 0.5, gives you articles from the past 12 hours',
'default': str(oldest_article)
}
}
def __init__(self, *args, **kwargs):
BasicNewsRecipe.__init__(self, *args, **kwargs)
d = self.recipe_specific_options.get('days')
if d and isinstance(d, str):
self.oldest_article = float(d)
feeds = [
('A la une', 'https://www.liberation.fr/arc/outboundfeeds/rss-all/collection/accueil-une/?outputType=xml'),
('Politique', 'https://www.liberation.fr/arc/outboundfeeds/rss-all/category/politique/?outputType=xml'),
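The 'days' option used here, and repeated verbatim for LiveMint, RT and WSJ below, follows one pattern: expose the current oldest_article as the option's default, then coerce any user-supplied string back to a float in __init__ before downloading starts (str(oldest_article) keeps the advertised default in sync with the class attribute, which this commit also nudges from 1 to 1.15 here and from 1 to 1.2 for WSJ). A standalone sketch of that behaviour, with DemoRecipe standing in for BasicNewsRecipe and the constructor argument used only to mimic calibre injecting the user's values:

class DemoRecipe:
    oldest_article = 1.15  # class-level default, mirrored into the option metadata below
    recipe_specific_options = {
        'days': {
            'short': 'Oldest article to download from this news source. In days ',
            'long': 'For example, 0.5, gives you articles from the past 12 hours',
            'default': str(oldest_article),
        }
    }

    def __init__(self, user_options=None):
        # calibre supplies the user's choices through this same mapping;
        # the argument here is just a stand-in for that step
        if user_options is not None:
            self.recipe_specific_options = user_options
        d = self.recipe_specific_options.get('days')
        if d and isinstance(d, str):
            self.oldest_article = float(d)  # values arrive as text

print(DemoRecipe({'days': '0.5'}).oldest_article)  # 0.5
print(DemoRecipe().oldest_article)                 # keeps the 1.15 default

Note that float(d) raises ValueError on a malformed value; none of the recipes in this commit guard against that.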

View File

@@ -19,6 +19,20 @@ class LiveMint(BasicNewsRecipe):
remove_attributes = ['style', 'height', 'width']
masthead_url = 'https://images.livemint.com/static/livemint-logo-v1.svg'
recipe_specific_options = {
'days': {
'short': 'Oldest article to download from this news source. In days ',
'long': 'For example, 0.5, gives you articles from the past 12 hours',
'default': str(oldest_article)
}
}
def __init__(self, *args, **kwargs):
BasicNewsRecipe.__init__(self, *args, **kwargs)
d = self.recipe_specific_options.get('days')
if d and isinstance(d, str):
self.oldest_article = float(d)
remove_empty_feeds = True
resolve_internal_links = True

View File

@@ -34,6 +34,13 @@ class outlook(BasicNewsRecipe):
classes('ads-box info-img-absolute mobile-info-id story-dec-time-mobile sb-also-read ads-box1')
]
recipe_specific_options = {
'date': {
'short': 'The date of the edition to download (DD-Month-YYYY format)',
'long': 'For example, 10-june-2024'
}
}
def get_browser(self):
return BasicNewsRecipe.get_browser(self, user_agent='common_words/based', verify_ssl_certificates=False)
@@ -42,14 +49,27 @@
'\n***\nif this recipe fails, report it on: '
'https://www.mobileread.com/forums/forumdisplay.php?f=228\n***\n'
)
soup = self.index_to_soup('https://www.outlookindia.com/magazine')
a = soup.find('a', attrs={'aria-label':'magazine-cover-image'})
self.cover_url = a.img['src'].split('?')[0]
url = a['href']
self.description = self.tag_to_string(a)
self.timefmt = ' [' + self.tag_to_string(a.div).strip() + ']'
self.log('Downloading issue:', url, self.timefmt)
d = self.recipe_specific_options.get('date')
if d and isinstance(d, str):
url = 'https://www.outlookindia.com/magazine/' + d
else:
soup = self.index_to_soup('https://www.outlookindia.com/magazine')
a = soup.find('a', attrs={'aria-label':'magazine-cover-image'})
url = a['href']
self.log('Downloading issue:', url)
soup = self.index_to_soup(url)
cov = soup.find(attrs={'aria-label':'magazine-cover-image'})
self.cover_url = cov.img['src'].split('?')[0]
summ = soup.find(attrs={'data-test-id':'magazine-summary'})
if summ:
self.description = self.tag_to_string(summ)
tme = soup.find(attrs={'class':'arr__timeago'})
if tme:
self.timefmt = ' [' + self.tag_to_string(tme).strip() + ']'
ans = []

View File

@@ -26,6 +26,20 @@ class RT_eng(BasicNewsRecipe):
remove_attributes = ['height', 'width', 'style']
publication_type = 'newsportal'
recipe_specific_options = {
'days': {
'short': 'Oldest article to download from this news source. In days ',
'long': 'For example, 0.5, gives you articles from the past 12 hours',
'default': str(oldest_article)
}
}
def __init__(self, *args, **kwargs):
BasicNewsRecipe.__init__(self, *args, **kwargs)
d = self.recipe_specific_options.get('days')
if d and isinstance(d, str):
self.oldest_article = float(d)
extra_css = '''
img {display:block; margin:0 auto;}
em { color:#202020; }

View File

@@ -56,8 +56,19 @@ class spectator(BasicNewsRecipe):
]
return br
recipe_specific_options = {
'date': {
'short': 'The date of the edition to download (DD-MM-YYYY format)',
'long': 'For example, 20-07-2024'
}
}
def parse_index(self):
soup = self.index_to_soup('https://www.spectator.co.uk/magazine')
index = 'https://www.spectator.co.uk/magazine'
d = self.recipe_specific_options.get('date')
if d and isinstance(d, str):
index = index + '/' + d + '/'
soup = self.index_to_soup(index)
self.cover_url = soup.find(**classes(
'magazine-header__container')).img['src'].split('?')[0]
issue = self.tag_to_string(soup.find(**classes(

View File

@@ -29,17 +29,32 @@ class TheWeek(BasicNewsRecipe):
.article-info { font-size:small; }
'''
recipe_specific_options = {
'date': {
'short': 'The date of the edition to download (YYYY.MM.DD format)',
'long': 'For example, 2024.06.30'
}
}
def get_cover_url(self):
soup = self.index_to_soup(
'https://www.magzter.com/IN/Malayala_Manorama/THE_WEEK/Business/'
)
for citem in soup.findAll(
'meta', content=lambda s: s and s.endswith('view/3.jpg')
):
return citem['content']
d = self.recipe_specific_options.get('date')
if not (d and isinstance(d, str)):
soup = self.index_to_soup(
'https://www.magzter.com/IN/Malayala_Manorama/THE_WEEK/Business/'
)
for citem in soup.findAll(
'meta', content=lambda s: s and s.endswith('view/3.jpg')
):
return citem['content']
def parse_index(self):
soup = self.index_to_soup('https://www.theweek.in/theweek.html')
issue = 'https://www.theweek.in/theweek.html'
d = self.recipe_specific_options.get('date')
if d and isinstance(d, str):
issue = 'https://www.theweek.in/theweek.' + d + '.html'
soup = self.index_to_soup(issue)
ans = []
d = datetime.today()
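One quirk in The Week's version: the date is spliced into the file name rather than appended as a path segment, so the YYYY.MM.DD format (dots, not dashes) matters. A two-line illustration using the example from the option's help text:

date_option = '2024.06.30'  # YYYY.MM.DD
issue = 'https://www.theweek.in/theweek.' + date_option + '.html'
print(issue)  # https://www.theweek.in/theweek.2024.06.30.html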

View File

@@ -38,7 +38,21 @@ class WSJ(BasicNewsRecipe):
resolve_internal_links = True
ignore_duplicate_articles = {'url', 'title'}
remove_empty_feeds = True
oldest_article = 1 # days
oldest_article = 1.2 # days
recipe_specific_options = {
'days': {
'short': 'Oldest article to download from this news source. In days ',
'long': 'For example, 0.5, gives you articles from the past 12 hours',
'default': str(oldest_article)
}
}
def __init__(self, *args, **kwargs):
BasicNewsRecipe.__init__(self, *args, **kwargs)
d = self.recipe_specific_options.get('days')
if d and isinstance(d, str):
self.oldest_article = float(d)
extra_css = '''
#subhed, em { font-style:italic; color:#202020; }