# Mirror of https://github.com/kovidgoyal/calibre.git
# Synced 2025-06-23 15:30:45 -04:00
# 67 lines, 2.6 KiB, Python
from calibre.web.feeds.news import BasicNewsRecipe, classes
|
|
|
|
# Maps each category slug (the path component used in
# https://www.smithsonianmag.com/category/<slug>/) to the section title
# shown for that category. Sections are processed in this order.
CATEGORIES = {
    'smart-news': 'Smart News',
    'history': 'History',
    'science-nature': 'Science',
    'innovation': 'Innovation',
    'arts-culture': 'Arts & Culture',
    'travel': 'Travel',
    'smithsonian-institution': 'At the Smithsonian',
}
|
|
|
|
|
|
class Smithsonian(BasicNewsRecipe):
    """Recipe that collects articles from smithsonianmag.com category pages.

    Every entry of the module-level ``CATEGORIES`` mapping becomes one
    section, filled with the articles listed on the matching
    ``/category/<slug>/`` page.
    """

    title = 'Smithsonian Magazine'
    __author__ = 'Kovid Goyal'

    description = 'This magazine chronicles the arts, environment, sciences and popular culture of the times. It is edited for modern, well-rounded individuals with diverse, general interests. With your order, you become a National Associate Member of the Smithsonian. Membership benefits include your subscription to Smithsonian magazine, a personalized membership card, discounts from the Smithsonian catalog, and more.'  # noqa: E501
    language = 'en'
    category = 'news'
    encoding = 'UTF-8'
    # CSS classes selecting the parts of an article page to keep.
    keep_only_tags = [
        classes('article-header articleLeft')
    ]
    # CSS classes selecting elements to strip out of the kept content.
    remove_tags = [
        classes(
            'hidden-phone hidden-tablet hidden-desktop slideshow-nav associated-container'
            ' widget-article-pixel tag-list recommended-videos comments'
        )
    ]
    no_javascript = True
    no_stylesheets = True

    def parse_section(self, url):
        """Yield one article dict per uniquely titled entry on a category page.

        :param url: Address of the category index page to scrape.
        :return: Generator of dicts with ``title``, ``url`` and
            ``description`` keys.

        List items lacking a text container, a link, or a link target are
        skipped so that one malformed entry does not abort the section.
        """
        soup = self.index_to_soup(url)
        seen = set()
        for al in soup.findAll(attrs={'class': 'article-list'}):
            for article in al.findAll(attrs={'class': 'article-list-item'}):
                div = article.find(attrs={'class': 'article-list-text'})
                if div is None:
                    continue
                a = div.find('a')
                if a is None or not a.get('href'):
                    continue
                title = self.tag_to_string(a)
                # De-duplicate by title: the same article can be listed
                # more than once on a page.
                if title in seen:
                    continue
                seen.add(title)
                # Kept in a separate local so the ``url`` parameter is not
                # overwritten while iterating.
                article_url = 'https://www.smithsonianmag.com/' + a['href'].lstrip('/')
                if '/tag/' in article_url:
                    continue
                desc = ''
                # NOTE(review): this searches inside ``div`` for the same
                # class ``div`` itself was matched on, so it probably never
                # matches and ``desc`` stays empty -- confirm the intended
                # selector against the live page markup.
                p = div.find(attrs={'class': 'article-list-text'})
                if p is not None:
                    desc = self.tag_to_string(p)
                self.log('\t' + title)
                yield {'title': title, 'url': article_url, 'description': desc}

    def parse_index(self):
        """Build the list of ``(section title, articles)`` tuples.

        Sections are visited in ``CATEGORIES`` order; a section that yields
        no articles is omitted. When ``self.test`` is truthy, collection
        stops once ``self.test[0]`` sections have been gathered.
        """
        ans = []
        for slug, title in CATEGORIES.items():
            url = 'https://www.smithsonianmag.com/category/' + slug + '/'
            self.log('Parsing section:', title, 'at:', url)
            articles = list(self.parse_section(url))
            if articles:
                ans.append((title, articles))
            if self.test and len(ans) >= self.test[0]:
                break
        return ans
|