calibre/recipes/smith.recipe

from calibre.web.feeds.news import BasicNewsRecipe, classes

CATEGORIES = {
    'smart-news': 'Smart News',
    'history': 'History',
    'science-nature': 'Science',
    'innovation': 'Innovation',
    'arts-culture': 'Arts & Culture',
    'travel': 'Travel',
    'smithsonian-institution': 'At the Smithsonian'
}


class Smithsonian(BasicNewsRecipe):
    title = 'Smithsonian Magazine'
    __author__ = 'Kovid Goyal'
    description = 'This magazine chronicles the arts, environment, sciences and popular culture of the times. It is edited for modern, well-rounded individuals with diverse, general interests. With your order, you become a National Associate Member of the Smithsonian. Membership benefits include your subscription to Smithsonian magazine, a personalized membership card, discounts from the Smithsonian catalog, and more.'  # noqa: E501
    language = 'en'
    category = 'news'
    encoding = 'UTF-8'

    keep_only_tags = [
        classes('article-header articleLeft')
    ]
    remove_tags = [
        classes(
            'hidden-phone hidden-tablet hidden-desktop slideshow-nav associated-container'
            ' widget-article-pixel tag-list recommended-videos comments'
        )
    ]
    no_javascript = True
    no_stylesheets = True

    def parse_section(self, url):
        # Collect article links from one category landing page,
        # de-duplicating entries by title.
        soup = self.index_to_soup(url)
        seen = set()
        for al in soup.findAll(attrs={'class': 'article-list'}):
            for article in al.findAll(attrs={'class': 'article-list-item'}):
                div = article.find(attrs={'class': 'article-list-text'})
                a = div.find('a')
                title = self.tag_to_string(a)
                if title in seen:
                    continue
                seen.add(title)
                url = 'https://www.smithsonianmag.com/' + a['href'].lstrip('/')
                if '/tag/' in url:
                    continue
                # Pull an optional summary paragraph, if one is present.
                desc = ''
                p = div.find(attrs={'class': 'article-list-text'})
                if p is not None:
                    desc = self.tag_to_string(p)
                self.log('\t' + title)
                yield {'title': title, 'url': url, 'description': desc}

    def parse_index(self):
        ans = []
        for slug, title in CATEGORIES.items():
            url = 'https://www.smithsonianmag.com/category/' + slug + '/'
            self.log('Parsing section:', title, 'at:', url)
            articles = list(self.parse_section(url))
            if articles:
                ans.append((title, articles))
            # In test mode (self.test is set when calibre runs with --test),
            # stop after the requested number of sections.
            if self.test and len(ans) >= self.test[0]:
                break
        return ans
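
# A quick way to try this recipe during development (a suggested workflow,
# not part of the recipe itself): calibre can build a recipe directly from
# the command line, and --test restricts the fetch to the first couple of
# sections and articles, which is what the self.test check in parse_index()
# responds to:
#
#   ebook-convert smith.recipe smith.epub --test -vv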