From ecb302fa0498d7918941f881f2330233a3582282 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 16 Jan 2019 19:29:06 +0530 Subject: [PATCH] Update New York Times Book Review --- recipes/nytimesbook.recipe | 56 ++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 32 deletions(-) diff --git a/recipes/nytimesbook.recipe b/recipes/nytimesbook.recipe index 2112226b31..afe5fe4e85 100644 --- a/recipes/nytimesbook.recipe +++ b/recipes/nytimesbook.recipe @@ -6,6 +6,12 @@ def classes(classes): return dict(attrs={'class': lambda x: x and frozenset(x.split()).intersection(q)}) +def absolutize(url): + if url.startswith('/'): + url = 'https://www.nytimes.com' + url + return url + + class NewYorkTimesBookReview(BasicNewsRecipe): title = u'New York Times Book Review' language = 'en' @@ -18,42 +24,27 @@ class NewYorkTimesBookReview(BasicNewsRecipe): encoding = 'utf-8' keep_only_tags = [ - dict(id='story'), - ] - remove_tags = [ - dict(attrs={'aria-label':'tools'.split()}), - dict(attrs={'aria-label': lambda x: x and 'New York Times Logo' in x}), - dict(href='#site-content #site-index'.split()), - dict(attrs={'aria-hidden':'true'}), - dict(attrs={'data-videoid':True}), - dict(name='button meta link'.split()), - dict(id=lambda x: x and x.startswith('story-ad-')), - dict(name='head'), - dict(role='toolbar'), - dict(name='a', href=lambda x: x and '#story-continues-' in x), - dict(name='a', href=lambda x: x and '#whats-next' in x), - dict(id=lambda x: x and 'sharetools-' in x), - dict(id='newsletter-promo supported-by-ad bottom-wrapper'.split()), - classes('story-print-citation supported-by accessibility-ad-header visually-hidden bottom-of-article ad nav-wrapper'), - dict(attrs={'class': lambda x: x and ( - 'SectionBar' in x or 'recirculation' in x or 'ResponsiveAd' in x or 'accessibility-visuallyHidden' in x or 'RelatedCoverage' in x)}), + dict(name='h1'), + dict(attrs={'data-testid':'photoviewer-wrapper'}), + dict(itemprop=['author creator', 'articleBody']), ] def parse_index(self): soup = self.index_to_soup( - 'http://www.nytimes.com/pages/books/review/index.html') + 'https://www.nytimes.com/pages/books/review/index.html') # Find TOC - toc = soup.find('div', attrs={'class': 'rank'}) + toc = soup.find('section', id='collection-book-review').find('section').find('ol') main_articles, articles = [], [] feeds = [('Features', main_articles), ('Latest', articles)] - for h2 in toc.findAll('h2', attrs={'class': 'headline'}): + for li in toc.findAll('li'): + h2 = li.find('h2') a = h2.find('a', href=True) if a is not None: title = self.tag_to_string(a) - url = a['href'] + url = absolutize(a['href']) desc = '' - p = h2.findNextSibling('p', attrs={'class': 'summary'}) + p = h2.findNextSibling('p') if p: desc = self.tag_to_string(p) main_articles.append( @@ -61,14 +52,15 @@ class NewYorkTimesBookReview(BasicNewsRecipe): self.log('Found:', title, 'at', url) if desc: self.log('\t', desc) - for li in soup.find(id='latest-panel').find('ol').findAll('li'): - a = li.find('a', attrs={'class': 'story-link'}, href=True) - if a is None: - continue - url = a['href'] - m = a.find(attrs={'class': 'story-meta'}) - title = self.tag_to_string(m.find('h2')) - desc = self.tag_to_string(m.find(attrs={'class': 'summary'})) + for li in soup.find(id='stream-panel').find('ol').findAll('li'): + h2 = li.find('h2') + a = h2.findParent('a') + url = absolutize(a['href']) + p = h2.findNextSibling('p') + title = self.tag_to_string(h2) + desc = '' + if p: + desc = self.tag_to_string(p) articles.append({'title': title, 'url': url, 'description': desc}) self.log('Found:', title, 'at', url) if desc: