#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal '
'''
nymag.com
'''
from calibre.web.feeds.news import BasicNewsRecipe


def classes(classes):
    """Build ``find``/``findAll`` keyword arguments matching CSS classes.

    :param classes: Space-separated class names.
    :return: A dict of the form ``{'attrs': {'class': matcher}}`` where
        ``matcher`` is truthy for a ``class`` attribute value that shares at
        least one name with ``classes``.
    """
    q = frozenset(classes.split(' '))
    return dict(attrs={
        'class': lambda x: x and frozenset(x.split()).intersection(q)})


class NewYorkMagazine(BasicNewsRecipe):
    """Recipe that downloads an issue of New York Magazine from nymag.com."""

    title = 'New York Magazine'
    __author__ = 'Kovid Goyal'
    description = 'Food, culture, arts and entertainment in New York'
    language = 'en_US'
    no_stylesheets = True
    remove_javascript = True
    encoding = 'utf-8'

    extra_css = '''
        .nym-image-figcaption, .bylines, .rubric, .clay-paragraph_prologue,
        .secondary-area-caption-credits { font-size: small; }
    '''

    # Keep only the <article> element whose class list contains "article".
    keep_only_tags = [
        dict(name='article', attrs={'class': lambda x: x and 'article' in x.split()})
    ]

    remove_tags = [
        dict(name=['svg', 'iframe']),
        classes(
            'related-stories start-discussion newsletter-flex-text package-toc '
            'comments-link tags related secondary-area author-photo error-pop-up'
        ),
        dict(id=['minibrowserbox', 'article-related', 'article-tools']),
    ]

    remove_attributes = ['style', 'height', 'width', 'srcset']

    recipe_specific_options = {
        'date': {
            'short': 'The date of the edition to download (YYYY-MM-DD format)',
            'long': 'For example, 2024-07-01',
        }
    }

    def nymag_get_index(self):
        """Fetch and parse the table-of-contents page of the wanted issue.

        Uses the default index URL unless ``recipe_specific_options['date']``
        holds a non-empty string, in which case the dated TOC page is used.

        :return: The parsed page as returned by ``self.index_to_soup``.
        """
        issue_url = 'https://nymag.com/maglinks/nym-home-05'
        d = self.recipe_specific_options.get('date')
        # The entry declared on the class is a dict describing the option, so
        # only a string value is treated as a requested edition date.
        if d and isinstance(d, str):
            issue_url = 'https://nymag.com/magazine/toc/' + d + '.html'
        return self.index_to_soup(issue_url)

    def parse_index(self):
        """Build the feed list from the issue's table of contents.

        Sets ``self.cover_url`` when a cover image is found.

        :return: A list of ``(section_title, articles)`` tuples where each
            article is a dict with ``title``, ``url`` and ``description``.
        """
        soup = self.nymag_get_index()

        cdiv = soup.find(**classes('magazine-toc-cover-image-wrap'))
        if cdiv is not None:
            for source in cdiv.findAll('source', srcset=True):
                candidates = source['srcset'].split()
                if not candidates:
                    # An empty srcset carries no URL; try the next <source>.
                    continue
                # Use the first whitespace-separated token of srcset.
                self.cover_url = candidates[0]
                self.log('Cover:', self.cover_url)
                break

        feeds = []

        if cover_art := soup.find(**classes('magazine-toc-cover-text')):
            a = cover_art.find('a', **classes('headline-link'))
            # Without a usable link there is no cover story to list.
            if a is not None and a.get('href'):
                c_url = a['href']
                c_title = self.tag_to_string(
                    a.find(**classes('magazine-toc-cover-headline'))
                ).strip()
                c_desc = self.tag_to_string(
                    a.find(**classes('magazine-toc-cover-teaser'))
                ).strip()
                self.log('Cover Story', '\n\t', c_title, c_url)
                feeds.append((
                    'Cover Story',
                    [{'title': c_title, 'url': c_url, 'description': c_desc}],
                ))
            else:
                self.log('Cover story link not found, skipping it')

        for div in soup.findAll(attrs={'data-editable': 'settingTitle'}):
            section = self.tag_to_string(div).strip().capitalize()
            articles = []
            self.log(section)

            ul = div.findNextSibling('ul')
            if ul is None:
                # A heading that is not followed by a list has no articles.
                continue

            for li in ul.findAll('li'):
                a = li.find(href=True, **classes('article-link'))
                if a is None:
                    continue
                url = a['href']

                h3 = li.find('h3')
                desc = ''
                if h3 is not None:
                    title = self.tag_to_string(h3)
                    teaser = h3.findNextSibling(**classes('teaser'))
                    if teaser is not None:
                        desc = self.tag_to_string(teaser)
                else:
                    # No headline element: fall back to the link's own text.
                    title = self.tag_to_string(a)

                self.log('\t', title, url)
                articles.append({'title': title, 'url': url, 'description': desc})

            if articles:
                feeds.append((section, articles))

        return feeds

    def preprocess_html(self, soup):
        """Clean up a downloaded article before conversion.

        Removes the second ``lede-image-wrapper`` div when more than one is
        present, copies ``data-src`` into ``src`` on images that have it, and
        renames every ``h2``/``h3`` to ``h4``.

        :param soup: The parsed article, modified in place.
        :return: The same ``soup`` object.
        """
        ledes = soup.findAll(
            'div', attrs={'class': lambda x: x and 'lede-image-wrapper' in x.split()}
        )
        if len(ledes) > 1:
            ledes[1].extract()

        for img in soup.findAll('img', attrs={'data-src': True}):
            img['src'] = img['data-src']

        for heading in soup.findAll(['h2', 'h3']):
            heading.name = 'h4'

        return soup