From ceafc1b05e2024ece3f750b758f3c482a993ee93 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Mon, 12 Sep 2022 07:40:24 +0530
Subject: [PATCH] Update NYTimes

---
 recipes/nytimes.recipe     | 112 ++++----------------------------------
 recipes/nytimes_sub.recipe | 112 ++++----------------------------------
 2 files changed, 24 insertions(+), 200 deletions(-)

diff --git a/recipes/nytimes.recipe b/recipes/nytimes.recipe
index fa862bc105..944502fa93 100644
--- a/recipes/nytimes.recipe
+++ b/recipes/nytimes.recipe
@@ -3,20 +3,20 @@
 # License: GPLv3 Copyright: 2018, Kovid Goyal
 
 from __future__ import absolute_import, division, print_function, unicode_literals
-
 import datetime
-import re
 import json
+import re
 from pprint import pprint  # noqa
 
 from calibre import strftime
+from calibre.ebooks.BeautifulSoup import Tag
 from calibre.utils.date import strptime
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import Tag
 
 is_web_edition = True
 oldest_web_edition_article = 7  # days
 
+
 # The sections to download when downloading the web edition, comment out
 # the section you are not interested in
 web_sections = [
@@ -92,76 +92,15 @@ class NewYorkTimes(BasicNewsRecipe):
     remove_attributes = ['style']
     conversion_options = {'flow_size': 0}
 
-    remove_tags = [
-        dict(attrs={'aria-label':'tools'.split()}),
-        dict(attrs={'aria-label': lambda x: x and 'New York Times Logo' in x}),
-        dict(href='#site-content #site-index'.split()),
-        dict(attrs={'aria-hidden':'true'}),
-        dict(attrs={'data-videoid':True}),
-        dict(name='button meta link time source'.split()),
-        dict(id=lambda x: x and x.startswith('story-ad-')),
-        dict(name='head'),
-        dict(role='toolbar'),
-        dict(name='a', href=lambda x: x and '#story-continues-' in x),
-        dict(name='a', href=lambda x: x and '#whats-next' in x),
-        dict(id=lambda x: x and 'sharetools-' in x),
-        dict(id='newsletter-promo supported-by-ad bottom-wrapper top-wrapper sponsor-wrapper'.split()),
-        classes('story-print-citation supported-by accessibility-ad-header visually-hidden bottom-of-article ad'),
-        dict(attrs={'class': lambda x: x and (
-            'SectionBar' in x or 'recirculation' in x or 'ResponsiveAd' in x or 'accessibility-visuallyHidden' in x or 'RelatedCoverage' in x)}),
-    ]
-
-    def preprocess_html(self, soup):
-        article = soup.find(id='story')
-        if article is None:
-            keep_only_tags = [dict(attrs={'aria-label': 'Main content'})]
-        else:
-            # The NYT is apparently A/B testing a new page layout
-            has_supplemental = article is not None and article.find(**classes('story-body-supplemental')) is not None
-            if has_supplemental:
-                keep_only_tags = [
-                    dict(id='story-header'),
-                    classes('story-body-supplemental story-interrupter'),
-                ]
-            else:
-                keep_only_tags = [
-                    dict(id='story'),
-                ]
-        body = new_tag(soup, 'body')
-        for spec in keep_only_tags:
-            for tag in soup.find('body').findAll(**spec):
-                body.insert(len(body.contents), tag)
-        soup.find('body').replaceWith(body)
-
-        # Add a space to the dateline
-        t = soup.find(**classes('dateline'))
-        if t is not None:
-            t.insert(0, ' ')
-
-        # Remove empty li tags
-        for li in soup.findAll('li', attrs={'class': lambda x: x and x.startswith('css-')}):
-            if not li.contents and not li.string:
-                li.extract()
-
-        # Ensure the headline is first
-        h1 = soup.find('h1', itemprop='headline')
-        if h1 is not None:
-            h1.extract()
-            soup.find('body').contents.insert(0, h1)
-
-        # Find lazy loaded images
-        for div in soup.findAll(itemtype='http://schema.org/ImageObject', itemid=True):
-            if div.find('img') is None:
-                span = div.find('span')
-                if span is not None and self.tag_to_string(span).strip().lower() == 'image':
-                    span.name = 'img'
-                    span['src'] = div['itemid']
-
-        # Remove live storline menu
-        for span in soup.findAll(attrs={'data-storyline-module-name': 'menu'}):
-            span.parent.extract()
-
-        return soup
+    def preprocess_raw_html(self, raw_html, url):
+        if '/live/' in url:
+            self.abort_article('Cant be bothered decoding the JSON for NYT live articles')
+        if not hasattr(self, 'nyt_parser'):
+            from calibre.live import load_module
+            m = load_module('calibre.web.site_parsers.nytimes')
+            self.nyt_parser = m
+        html = self.nyt_parser.extract_html(self.index_to_soup(raw_html))
+        return html
 
     def read_todays_paper(self):
         INDEX = 'https://www.nytimes.com/section/todayspaper'
@@ -323,30 +262,3 @@ class NewYorkTimes(BasicNewsRecipe):
         if is_web_edition:
             return self.parse_web_sections()
         return self.parse_todays_page()
-
-    # The NYT occasionally returns bogus articles for some reason just in case
-    # it is because of cookies, dont store cookies
-    def get_browser(self, *args, **kwargs):
-        return self
-
-    def clone_browser(self, *args, **kwargs):
-        return self.get_browser()
-
-    def open_novisit(self, *args, **kwargs):
-        from calibre import browser, random_user_agent
-        if not hasattr(self, 'rua_stored'):
-            self.rua_stored = random_user_agent(allow_ie=False)
-        br = browser(user_agent=self.rua_stored)
-        response = br.open_novisit(*args, **kwargs)
-        # headers = response.info()
-        # if headers.get('X-PageType') == 'vi-story':
-        #     import tempfile
-        #     with tempfile.NamedTemporaryFile(suffix='.html', dir='/t/n', delete=False) as f:
-        #         f.write(response.read())
-        #     import time
-        #     time.sleep(1)
-        #     br = browser()
-        #     response = br.open_novisit(*args, **kwargs)
-        return response
-
-    open = open_novisit
diff --git a/recipes/nytimes_sub.recipe b/recipes/nytimes_sub.recipe
index 2490eb1c5a..28f5e3582e 100644
--- a/recipes/nytimes_sub.recipe
+++ b/recipes/nytimes_sub.recipe
@@ -3,20 +3,20 @@
 # License: GPLv3 Copyright: 2018, Kovid Goyal
 
 from __future__ import absolute_import, division, print_function, unicode_literals
-
 import datetime
-import re
 import json
+import re
 from pprint import pprint  # noqa
 
 from calibre import strftime
+from calibre.ebooks.BeautifulSoup import Tag
 from calibre.utils.date import strptime
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import Tag
 
 is_web_edition = False
 oldest_web_edition_article = 7  # days
 
+
 # The sections to download when downloading the web edition, comment out
 # the section you are not interested in
 web_sections = [
@@ -92,76 +92,15 @@ class NewYorkTimes(BasicNewsRecipe):
     remove_attributes = ['style']
     conversion_options = {'flow_size': 0}
 
-    remove_tags = [
-        dict(attrs={'aria-label':'tools'.split()}),
-        dict(attrs={'aria-label': lambda x: x and 'New York Times Logo' in x}),
-        dict(href='#site-content #site-index'.split()),
-        dict(attrs={'aria-hidden':'true'}),
-        dict(attrs={'data-videoid':True}),
-        dict(name='button meta link time source'.split()),
-        dict(id=lambda x: x and x.startswith('story-ad-')),
-        dict(name='head'),
-        dict(role='toolbar'),
-        dict(name='a', href=lambda x: x and '#story-continues-' in x),
-        dict(name='a', href=lambda x: x and '#whats-next' in x),
-        dict(id=lambda x: x and 'sharetools-' in x),
-        dict(id='newsletter-promo supported-by-ad bottom-wrapper top-wrapper sponsor-wrapper'.split()),
-        classes('story-print-citation supported-by accessibility-ad-header visually-hidden bottom-of-article ad'),
-        dict(attrs={'class': lambda x: x and (
-            'SectionBar' in x or 'recirculation' in x or 'ResponsiveAd' in x or 'accessibility-visuallyHidden' in x or 'RelatedCoverage' in x)}),
-    ]
-
-    def preprocess_html(self, soup):
-        article = soup.find(id='story')
-        if article is None:
-            keep_only_tags = [dict(attrs={'aria-label': 'Main content'})]
-        else:
-            # The NYT is apparently A/B testing a new page layout
-            has_supplemental = article is not None and article.find(**classes('story-body-supplemental')) is not None
-            if has_supplemental:
-                keep_only_tags = [
-                    dict(id='story-header'),
-                    classes('story-body-supplemental story-interrupter'),
-                ]
-            else:
-                keep_only_tags = [
-                    dict(id='story'),
-                ]
-        body = new_tag(soup, 'body')
-        for spec in keep_only_tags:
-            for tag in soup.find('body').findAll(**spec):
-                body.insert(len(body.contents), tag)
-        soup.find('body').replaceWith(body)
-
-        # Add a space to the dateline
-        t = soup.find(**classes('dateline'))
-        if t is not None:
-            t.insert(0, ' ')
-
-        # Remove empty li tags
-        for li in soup.findAll('li', attrs={'class': lambda x: x and x.startswith('css-')}):
-            if not li.contents and not li.string:
-                li.extract()
-
-        # Ensure the headline is first
-        h1 = soup.find('h1', itemprop='headline')
-        if h1 is not None:
-            h1.extract()
-            soup.find('body').contents.insert(0, h1)
-
-        # Find lazy loaded images
-        for div in soup.findAll(itemtype='http://schema.org/ImageObject', itemid=True):
-            if div.find('img') is None:
-                span = div.find('span')
-                if span is not None and self.tag_to_string(span).strip().lower() == 'image':
-                    span.name = 'img'
-                    span['src'] = div['itemid']
-
-        # Remove live storline menu
-        for span in soup.findAll(attrs={'data-storyline-module-name': 'menu'}):
-            span.parent.extract()
-
-        return soup
+    def preprocess_raw_html(self, raw_html, url):
+        if '/live/' in url:
+            self.abort_article('Cant be bothered decoding the JSON for NYT live articles')
+        if not hasattr(self, 'nyt_parser'):
+            from calibre.live import load_module
+            m = load_module('calibre.web.site_parsers.nytimes')
+            self.nyt_parser = m
+        html = self.nyt_parser.extract_html(self.index_to_soup(raw_html))
+        return html
 
     def read_todays_paper(self):
         INDEX = 'https://www.nytimes.com/section/todayspaper'
@@ -323,30 +262,3 @@ class NewYorkTimes(BasicNewsRecipe):
         if is_web_edition:
             return self.parse_web_sections()
         return self.parse_todays_page()
-
-    # The NYT occasionally returns bogus articles for some reason just in case
-    # it is because of cookies, dont store cookies
-    def get_browser(self, *args, **kwargs):
-        return self
-
-    def clone_browser(self, *args, **kwargs):
-        return self.get_browser()
-
-    def open_novisit(self, *args, **kwargs):
-        from calibre import browser, random_user_agent
-        if not hasattr(self, 'rua_stored'):
-            self.rua_stored = random_user_agent(allow_ie=False)
-        br = browser(user_agent=self.rua_stored)
-        response = br.open_novisit(*args, **kwargs)
-        # headers = response.info()
-        # if headers.get('X-PageType') == 'vi-story':
-        #     import tempfile
-        #     with tempfile.NamedTemporaryFile(suffix='.html', dir='/t/n', delete=False) as f:
-        #         f.write(response.read())
-        #     import time
-        #     time.sleep(1)
-        #     br = browser()
-        #     response = br.open_novisit(*args, **kwargs)
-        return response
-
-    open = open_novisit
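
Note: preprocess_raw_html() runs on each article's raw bytes before calibre
applies its usual cleanup hooks (keep_only_tags, remove_tags,
preprocess_html), which is why this patch can delete all of the scraping
rules above: the recipe now returns finished HTML produced by a parser
module loaded through calibre.live, so extraction fixes can ship without a
full calibre release. Below is a minimal sketch of the same pattern, using
only calls that appear in the patch itself (abort_article, index_to_soup,
calibre.live.load_module, and the parser module's extract_html); the class
name and title are illustrative placeholders, not part of the commit:

    from calibre.live import load_module
    from calibre.web.feeds.news import BasicNewsRecipe


    class NYTimesSketch(BasicNewsRecipe):
        # Hypothetical recipe, for illustration only.
        title = 'NYT (sketch)'

        def preprocess_raw_html(self, raw_html, url):
            # Live-blog pages embed their content as JSON that the shared
            # parser does not decode, so skip them rather than emit junk.
            if '/live/' in url:
                self.abort_article('NYT live articles are not supported')
            # load_module() returns the freshest available copy of the named
            # parser module; cache it on the instance so the lookup happens
            # only once per download run.
            if not hasattr(self, 'nyt_parser'):
                self.nyt_parser = load_module('calibre.web.site_parsers.nytimes')
            # extract_html() takes the parsed soup and returns clean article
            # HTML, standing in for the old keep_only_tags/remove_tags rules.
            return self.nyt_parser.extract_html(self.index_to_soup(raw_html))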