#!/usr/bin/env python
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2018, Kovid Goyal

import datetime
import json
import re
from pprint import pprint

from calibre import strftime
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
from calibre.utils.date import strptime
from calibre.web.feeds.news import BasicNewsRecipe

is_web_edition = False
use_wayback_machine = False

# This is an Apollo persisted query hash which you can get from looking at
# the XHR requests made by: https://www.nytimes.com/section/todayspaper
# or by https://www.nytimes.com/section/world
persistedQuery = '1f99120a11e94dd62a9474f68ee1255537ee3cf7eac20a0377819edb2fa1fef7'

# The sections to download when downloading the web edition; comment out
# the sections you are not interested in
web_sections = [
    'world',
    'us',
    'politics',
    'nyregion',
    'business',
    'technology',
    'sports',
    'science',
    'health',
    'opinion',
    'arts',
    'books',
    'movies',
    'arts/music',
    'arts/television',
    'style',
    'food',
    'fashion',
    'travel',
    'education',
    'multimedia',
    'obituaries',
    'magazine',
]
# web_sections = ['business']

url_date_pat = re.compile(r'/(2\d\d\d)/(\d\d)/(\d\d)/')


def date_from_url(url):
    m = url_date_pat.search(url)
    if m is not None:
        return datetime.date(*map(int, m.groups()))


def format_date(d):
    try:
        return strftime(' [%a, %d %b %Y]', d)
    except Exception:
        return strftime(' [%Y/%m/%d]', d)


def classes(classes):
    q = frozenset(classes.split(' '))
    return dict(attrs={
        'class': lambda x: x and frozenset(x.split()).intersection(q)})


def new_tag(soup, name, attrs=()):
    impl = getattr(soup, 'new_tag', None)
    if impl is not None:
        return impl(name, attrs=dict(attrs))
    return Tag(soup, name, attrs=attrs or None)


def absolutize_href(href):
    if not href.startswith('http'):
        href = 'https://www.nytimes.com/' + href.lstrip('/')
    return href
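
# A minimal illustration of the URL helpers above (the URL is made up,
# purely for the example):
#   date_from_url('https://www.nytimes.com/2024/07/16/world/example.html')
#       -> datetime.date(2024, 7, 16)
#   absolutize_href('2024/07/16/world/example.html')
#       -> 'https://www.nytimes.com/2024/07/16/world/example.html'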


class NewYorkTimes(BasicNewsRecipe):
    if is_web_edition:
        title = 'The New York Times (Web)'
        description = (
            'New York Times (Web). You can edit the recipe to remove sections you are not interested in. '
            "Use the advanced menu to switch to fetching Today's Paper"
        )
    else:
        title = 'The New York Times'
        description = (
            "New York Times. Today's Paper. "
            'Use the advanced menu to switch to fetching the Web Edition'
        )
    encoding = 'utf-8'
    __author__ = 'Kovid Goyal'
    language = 'en_US'
    ignore_duplicate_articles = {'title', 'url'}
    no_stylesheets = True
    oldest_web_edition_article = 7  # days

    extra_css = '''
        .byl, .time { font-size:small; color:#202020; }
        .cap { font-size:small; text-align:center; }
        .cred { font-style:italic; font-size:small; }
        em, blockquote { color: #202020; }
        .sc { font-variant: small-caps; }
        .lbl { font-size:small; color:#404040; }
        img { display:block; margin:0 auto; }
    '''

    @property
    def nyt_parser(self):
        ans = getattr(self, '_nyt_parser', None)
        if ans is None:
            from calibre.live import load_module
            self._nyt_parser = ans = load_module('calibre.web.site_parsers.nytimes')
        return ans

    def get_nyt_page(self, url, skip_wayback=False):
        if use_wayback_machine and not skip_wayback:
            from calibre import browser
            return self.nyt_parser.download_url(url, browser())
        return self.index_to_soup(url, raw=True)

    def preprocess_raw_html(self, raw_html, url):
        return self.nyt_parser.extract_html(self.index_to_soup(raw_html), url)

    articles_are_obfuscated = use_wayback_machine

    if use_wayback_machine:
        def get_obfuscated_article(self, url):
            from calibre.ptempfile import PersistentTemporaryFile
            with PersistentTemporaryFile() as tf:
                tf.write(self.get_nyt_page(url))
            return tf.name

    recipe_specific_options = {
        'web': {
            'short': 'Type in yes, if you want ' + ("Today's Paper" if is_web_edition else 'the Web Edition'),
            'default': 'Web Edition' if is_web_edition else "Today's Paper",
        },
        'days': {
            'short': 'Oldest article to download from this news source. In days ',
            'long': 'For example, 1 gives you articles from the past 24 hours\n(works only for the Web Edition)',
            'default': str(oldest_web_edition_article)
        },
        'date': {
            'short': 'The date of the edition to download (YYYY/MM/DD format)\nUsed to fetch past editions of the NYT newspaper',
            'long': 'For example, 2024/07/16'
        },
        'res': {
            'short': (
                'For hi-res images, select a resolution from the following\noptions: '
                'popup, jumbo, mobileMasterAt3x, superJumbo'
            ),
            'long': (
                'This is useful for non e-ink devices. For a lower file size\nthan '
                'the default, use mediumThreeByTwo440, mediumThreeByTwo225 or articleInline.'
            ),
        },
        'comp': {
            'short': 'Compress News Images?',
            'long': 'Enter yes to compress downloaded images',
            'default': 'no'
        }
    }
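
    # The option values above are filled in by the calibre GUI/CLI at run
    # time; __init__ below reads them back via recipe_specific_options.get()
    # and falls back to the module-level defaults when an option is left unset.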

    def __init__(self, *args, **kwargs):
        BasicNewsRecipe.__init__(self, *args, **kwargs)
        c = self.recipe_specific_options.get('comp')
        d = self.recipe_specific_options.get('days')
        w = self.recipe_specific_options.get('web')
        self.is_web_edition = is_web_edition
        if w and isinstance(w, str):
            if w == 'yes':
                self.is_web_edition = not is_web_edition
        if d and isinstance(d, str):
            self.oldest_web_edition_article = float(d)
        if c and isinstance(c, str):
            if c.lower() == 'yes':
                self.compress_news_images = True

    def todays_paper_url(self):
        pdate = self.recipe_specific_options.get('date')
        if pdate and isinstance(pdate, str):
            return 'https://www.nytimes.com/issue/todayspaper/{}/todays-new-york-times'.format(pdate)
        return 'https://www.nytimes.com/section/todayspaper'

    def parse_todays_page(self):
        url = self.todays_paper_url()
        soup = self.index_to_soup(url)
        return parse_todays_page(soup)

    def parse_web_sections(self):
        feeds = []
        for slug in web_sections:
            url = 'https://www.nytimes.com/section/' + slug
            self.log('Download section index:', url)
            soup = self.index_to_soup(url)
            # with open('/t/raw.html', 'w') as f:
            #     f.write(str(soup))
            section_title, articles = parse_web_section(soup)
            self.log('Section:', section_title)
            if articles:
                feeds.append((section_title, articles))
                for a in articles:
                    self.log('\t', a['title'], a['url'])
            else:
                self.log('  No articles found in section:', section_title)
            if self.test and len(feeds) >= self.test[0]:
                break
        return feeds

    def parse_index(self):
        # return [('All articles', [
        #     {'title': 'XXXXX', 'url': 'https://www.nytimes.com/2020/11/27/world/americas/coronavirus-migrants-venezuela.html'},
        # ])]
        date, feeds = self.parse_todays_page()
        pdate = date.strftime('%Y/%m/%d')
        self.cover_url = 'https://static01.nyt.com/images/{}/nytfrontpage/scan.jpg'.format(pdate)
        self.timefmt = strftime(' [%d %b, %Y]', date)
        if self.is_web_edition:
            return self.parse_web_sections()
        for s, articles in feeds:
            self.log('Section:', s)
            for a in articles:
                self.log('\t', a['title'], a['url'])
        return feeds

    def get_browser(self, *args, **kwargs):
        # The user_agent value is only the header value; it must not repeat
        # the 'User-Agent:' header name.
        kwargs['user_agent'] = 'Mozilla/5.0 (compatible; archive.org_bot; Wayback Machine Live Record; +http://archive.org/details/archive.org_bot)'
        return BasicNewsRecipe.get_browser(self, *args, **kwargs)

    def preprocess_html(self, soup):
        w = self.recipe_specific_options.get('res')
        if w and isinstance(w, str):
            res = '-' + w
            for img in soup.findAll('img', attrs={'src': True}):
                if '-article' in img['src']:
                    ext = img['src'].split('?')[0].split('.')[-1]
                    img['src'] = img['src'].rsplit('-article', 1)[0] + res + '.' + ext
        for c in soup.findAll('div', attrs={'class': 'cap'}):
            for p in c.findAll(['p', 'div']):
                p.name = 'span'
        return soup

    def get_article_url(self, article):
        url = BasicNewsRecipe.get_article_url(self, article)
        if not re.search(r'/video/|/athletic/|/card/', url):
            return url
        self.log('\tSkipping ', url)
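

# preloaded_data() below extracts the Apollo state that NYT pages embed as a
# JavaScript assignment: it slices from the first '{' to the trailing ';' of
# ``window.__preloadedData = {...};`` and decodes the result with a non-strict
# JSON decoder. The same slicing on a made-up script body, for illustration:
#   script = 'window.__preloadedData = {"initialState": {}};'
#   script[script.find('{'):script.rfind(';')]  # -> '{"initialState": {}}'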
def preloaded_data(soup):
    from calibre.web.site_parsers.nytimes import clean_js_json
    candidates = soup.find_all('script', string=lambda x: x and 'window.__preloadedData' in x)
    script = candidates[0]
    script = str(script)
    raw = script[script.find('{'):script.rfind(';')].strip().rstrip(';')  # }
    raw = clean_js_json(raw)
    # with open('/t/raw.json', 'w') as f:
    #     f.write(raw)
    return json.JSONDecoder(strict=False).raw_decode(raw)[0]['initialState']


def asset_to_article(asset):
    title = asset['headline']['default']
    return {'title': title, 'url': asset['url'], 'description': asset['summary']}


def parse_web_section(soup):
    data = preloaded_data(soup)
    article_map = {}
    for k, v in data.items():
        if v['__typename'] == 'Article':
            article_map[k] = asset_to_article(v)
    articles = []
    for k, v in data['ROOT_QUERY'].items():
        if k.startswith('workOrLocation'):
            c = data[v['__ref']]
            section_title = c['name']
            for k, v in c['collectionsPage'].items():
                if k.startswith('stream'):
                    for k, v in v.items():
                        if k.startswith('edges'):
                            for q in v:
                                r = q['node']['__ref']
                                if r.startswith('Article:'):
                                    articles.append(article_map[r])
            if not articles:
                for c in c['collectionsPage']['embeddedCollections']:
                    for e in c['stream']['edges']:
                        for k, v in e.items():
                            if k.startswith('node'):
                                articles.append(article_map[v['__ref']])
    return section_title, articles


def parse_todays_page(soup):
    m = soup.find('meta', attrs={'name': 'nyt-collection:uri'})['content'].split('/')
    pdate = strptime('{}/{}/{}'.format(*m[-4:-1]), '%Y/%m/%d', assume_utc=False, as_utc=False)
    article_map = {}
    data = preloaded_data(soup)
    for k, v in data.items():
        if v['__typename'] == 'Article':
            article_map[k] = asset_to_article(v)
    feeds = []
    for k, v in data['ROOT_QUERY'].items():
        if k.startswith('workOrLocation'):
            for g in data[v['__ref']]['groupings']:
                for c in g['containers']:
                    articles = []
                    for r in c['relations']:
                        ref = r['asset']['__ref']
                        if ref in article_map:
                            articles.append(article_map[ref])
                    if articles:
                        feeds.append((c['label'], articles))
    return pdate, feeds


if __name__ == '__main__':
    import sys
    with open(sys.argv[-1]) as f:
        html = f.read()
    soup = BeautifulSoup(html)
    if is_web_edition:
        section_title, articles = parse_web_section(soup)
        print(section_title)
        pprint(articles)
    else:
        pdate, feeds = parse_todays_page(soup)
        print(pdate)
        pprint(feeds)
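
# When run standalone, this module parses a previously saved NYT HTML page
# passed as the last command line argument and prints the feeds it finds.
# Example invocation (hypothetical path; the calibre imports at the top must
# be resolvable in the Python environment used):
#   python nytimes.recipe /path/to/saved-page.html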