# calibre/recipes/smith.recipe

from calibre.web.feeds.news import BasicNewsRecipe, classes

# Section slug -> section title. Each slug is appended to
# https://www.smithsonianmag.com/category/ by Smithsonian.parse_index, and
# the title is used as the section name. Order here is the iteration order
# of the sections.
CATEGORIES = dict([
    ('smart-news', 'Smart News'),
    ('history', 'History'),
    ('science-nature', 'Science'),
    ('innovation', 'Innovation'),
    ('arts-culture', 'Arts & Culture'),
    ('travel', 'Travel'),
    ('smithsonian-institution', 'At the Smithsonian'),
])


class Smithsonian(BasicNewsRecipe):
    '''
    Recipe that builds one section per entry in CATEGORIES by scraping the
    corresponding category index page on smithsonianmag.com.
    '''

    title = 'Smithsonian Magazine'
    __author__ = 'Kovid Goyal'

    description = 'This magazine chronicles the arts, environment, sciences and popular culture of the times. It is edited for modern, well-rounded individuals with diverse, general interests. With your order, you become a National Associate Member of the Smithsonian. Membership benefits include your subscription to Smithsonian magazine, a personalized membership card, discounts from the Smithsonian catalog, and more.'  # noqa: E501
    language = 'en'
    category = 'news'
    encoding = 'UTF-8'
    keep_only_tags = [
        classes('article-header articleLeft')
    ]
    remove_tags = [
        classes(
            'hidden-phone hidden-tablet hidden-desktop slideshow-nav associated-container'
            ' widget-article-pixel tag-list recommended-videos comments'
        )
    ]
    no_javascript = True
    no_stylesheets = True

    def parse_section(self, url):
        '''
        Yield the articles listed on a single category index page.

        :param url: URL of the category index page to scrape.
        :return: generator of dicts with the keys ``title``, ``url`` and
                 ``description``, one per uniquely titled article link.

        List items without a text container, an anchor or an ``href`` are
        skipped, as are links whose URL contains ``/tag/``.
        '''
        soup = self.index_to_soup(url)
        seen = set()
        for al in soup.findAll(attrs={'class': 'article-list'}):
            for article in al.findAll(attrs={'class': 'article-list-item'}):
                div = article.find(attrs={'class': 'article-list-text'})
                a = div.find('a') if div is not None else None
                href = a.get('href') if a is not None else None
                if not href:
                    # A list item without a usable link must not abort the
                    # whole section; just ignore it.
                    continue
                if href.startswith(('http://', 'https://')):
                    # Already absolute: prefixing the site root would
                    # produce a broken URL.
                    article_url = href
                else:
                    article_url = 'https://www.smithsonianmag.com/' + href.lstrip('/')
                # Filter tag links before recording the title, so a tag link
                # cannot hide a real article that has the same title.
                if '/tag/' in article_url:
                    continue
                title = self.tag_to_string(a)
                if title in seen:
                    continue
                seen.add(title)
                desc = ''
                # NOTE(review): this looks inside the 'article-list-text'
                # element for a descendant with the same class, which may
                # never match and would leave the description empty --
                # confirm the intended selector against the page markup.
                p = div.find(attrs={'class': 'article-list-text'})
                if p is not None:
                    desc = self.tag_to_string(p)
                self.log('\t' + title)
                yield {'title': title, 'url': article_url, 'description': desc}

    def parse_index(self):
        '''
        Build the list of ``(section title, articles)`` tuples.

        Sections are visited in CATEGORIES order; a section that yields no
        articles is omitted. When ``self.test`` is truthy, parsing stops
        once ``self.test[0]`` sections have been collected.
        '''
        ans = []
        for slug, title in CATEGORIES.items():
            url = 'https://www.smithsonianmag.com/category/' + slug + '/'
            self.log('Parsing section:', title, 'at:', url)
            articles = list(self.parse_section(url))
            if articles:
                ans.append((title, articles))
            if self.test and len(ans) >= self.test[0]:
                break
        return ans