From fdbf44e3bd3503ac20eaae0a68ae7ab5daba4c96 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 10 Jun 2022 10:23:48 +0530 Subject: [PATCH] Update The New York Times --- recipes/nytimes.recipe | 30 +++++++++++++++++++----------- recipes/nytimes_sub.recipe | 32 ++++++++++++++++++++------------ 2 files changed, 39 insertions(+), 23 deletions(-) diff --git a/recipes/nytimes.recipe b/recipes/nytimes.recipe index 3d70106339..9370c0339d 100644 --- a/recipes/nytimes.recipe +++ b/recipes/nytimes.recipe @@ -98,7 +98,7 @@ class NewYorkTimes(BasicNewsRecipe): dict(href='#site-content #site-index'.split()), dict(attrs={'aria-hidden':'true'}), dict(attrs={'data-videoid':True}), - dict(name='button meta link'.split()), + dict(name='button meta link time source'.split()), dict(id=lambda x: x and x.startswith('story-ad-')), dict(name='head'), dict(role='toolbar'), @@ -113,17 +113,20 @@ class NewYorkTimes(BasicNewsRecipe): def preprocess_html(self, soup): article = soup.find(id='story') - # The NYT is apparently A/B testing a new page layout - has_supplemental = article is not None and article.find(**classes('story-body-supplemental')) is not None - if has_supplemental: - keep_only_tags = [ - dict(id='story-header'), - classes('story-body-supplemental story-interrupter'), - ] + if article is None: + keep_only_tags = [dict(attrs={'aria-label': 'Main content'})] else: - keep_only_tags = [ - dict(id='story'), - ] + # The NYT is apparently A/B testing a new page layout + has_supplemental = article is not None and article.find(**classes('story-body-supplemental')) is not None + if has_supplemental: + keep_only_tags = [ + dict(id='story-header'), + classes('story-body-supplemental story-interrupter'), + ] + else: + keep_only_tags = [ + dict(id='story'), + ] body = new_tag(soup, 'body') for spec in keep_only_tags: for tag in soup.find('body').findAll(**spec): @@ -153,6 +156,11 @@ class NewYorkTimes(BasicNewsRecipe): if span is not None and self.tag_to_string(span).strip().lower() == 
'image': span.name = 'img' span['src'] = div['itemid'] + + # Remove live storyline menu + for span in soup.findAll(attrs={'data-storyline-module-name': 'menu'}): + span.parent.extract() + return soup def read_todays_paper(self): diff --git a/recipes/nytimes_sub.recipe b/recipes/nytimes_sub.recipe index 16bf745f52..8f07e7c3c7 100644 --- a/recipes/nytimes_sub.recipe +++ b/recipes/nytimes_sub.recipe @@ -98,7 +98,7 @@ class NewYorkTimes(BasicNewsRecipe): dict(href='#site-content #site-index'.split()), dict(attrs={'aria-hidden':'true'}), dict(attrs={'data-videoid':True}), - dict(name='button meta link'.split()), + dict(name='button meta link time source'.split()), dict(id=lambda x: x and x.startswith('story-ad-')), dict(name='head'), dict(role='toolbar'), @@ -113,17 +113,20 @@ class NewYorkTimes(BasicNewsRecipe): def preprocess_html(self, soup): article = soup.find(id='story') - # The NYT is apparently A/B testing a new page layout - has_supplemental = article is not None and article.find(**classes('story-body-supplemental')) is not None - if has_supplemental: - keep_only_tags = [ - dict(id='story-header'), - classes('story-body-supplemental story-interrupter'), - ] + if article is None: + keep_only_tags = [dict(attrs={'aria-label': 'Main content'})] else: - keep_only_tags = [ - dict(id='story'), - ] + # The NYT is apparently A/B testing a new page layout + has_supplemental = article is not None and article.find(**classes('story-body-supplemental')) is not None + if has_supplemental: + keep_only_tags = [ + dict(id='story-header'), + classes('story-body-supplemental story-interrupter'), + ] + else: + keep_only_tags = [ + dict(id='story'), + ] body = new_tag(soup, 'body') for spec in keep_only_tags: for tag in soup.find('body').findAll(**spec): @@ -153,6 +156,11 @@ class NewYorkTimes(BasicNewsRecipe): if span is not None and self.tag_to_string(span).strip().lower() == 'image': span.name = 'img' span['src'] = div['itemid'] + + # Remove live storyline menu + for span in 
soup.findAll(attrs={'data-storyline-module-name': 'menu'}): + span.parent.extract() + return soup def read_todays_paper(self): @@ -310,7 +318,7 @@ class NewYorkTimes(BasicNewsRecipe): def parse_index(self): # return [('All articles', [ - # {'title': 'XXXXX', 'url': 'https://www.nytimes.com/2018/05/31/theater/best-25-plays-how-we-made-the-list.html'}, + # {'title': 'XXXXX', 'url': 'https://www.nytimes.com/2020/11/27/world/americas/coronavirus-migrants-venezuela.html'}, # ])] if is_web_edition: return self.parse_web_sections()