#!/usr/bin/env python2
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2018, Kovid Goyal

from __future__ import absolute_import, division, print_function, unicode_literals

import datetime
import re

from calibre import strftime
from calibre.ebooks.BeautifulSoup import Tag
from calibre.utils.date import strptime
from calibre.web.feeds.news import BasicNewsRecipe

is_web_edition = False
oldest_web_edition_article = 7  # days

# The sections to download when downloading the web edition; comment out
# the sections you are not interested in.
web_sections = [
    ('World', 'world'),
    ('U.S.', 'us'),
    ('Politics', 'politics'),
    ('New York', 'nyregion'),
    ('Business', 'business'),
    ('Technology', 'technology'),
    ('Sports', 'sports'),
    ('Science', 'science'),
    ('Health', 'health'),
    ('Opinion', 'opinion'),
    ('Arts', 'arts'),
    ('Books', 'books'),
    ('Movies', 'movies'),
    ('Music', 'arts/music'),
    ('Television', 'arts/television'),
    ('Style', 'style'),
    ('Dining & Wine', 'dining'),
    ('Fashion & Style', 'fashion'),
    ('Home & Garden', 'garden'),
    ('Travel', 'travel'),
    ('Education', 'education'),
    ('Multimedia', 'multimedia'),
    ('Obituaries', 'obituaries'),
    ('Sunday Magazine', 'magazine'),
]

# Matches the /YYYY/MM/DD/ component of NYT article URLs
url_date_pat = re.compile(r'/(2\d\d\d)/(\d\d)/(\d\d)/')


def date_from_url(url):
    m = url_date_pat.search(url)
    if m is not None:
        return datetime.date(*map(int, m.groups()))


def format_date(d):
    try:
        return strftime(' [%a, %d %b %Y]', d)
    except Exception:
        return strftime(' [%Y/%m/%d]', d)


def classes(classes):
    # Build a BeautifulSoup attrs matcher that matches any tag having at
    # least one of the given (space separated) CSS classes
    q = frozenset(classes.split(' '))
    return dict(attrs={
        'class': lambda x: x and frozenset(x.split()).intersection(q)})
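
# A quick illustration of the two helpers above, with a hypothetical URL and
# class names (the values are made up, the behaviour is what the code does):
#
#   date_from_url('https://www.nytimes.com/2018/05/31/theater/some-story.html')
#   returns datetime.date(2018, 5, 31), and classes('dateline byline') matches
#   any tag whose class attribute shares at least one name with that set,
#   e.g. <p class="dateline css-1a2b3c">.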


class NewYorkTimes(BasicNewsRecipe):

    title = 'The New York Times'
    if is_web_edition:
        description = 'New York Times (Web). You can edit the recipe to remove sections you are not interested in.'
    else:
        description = 'Today\'s New York Times'
    encoding = 'utf-8'
    __author__ = 'Kovid Goyal'
    language = 'en'
    ignore_duplicate_articles = {'title', 'url'}
    no_stylesheets = True
    compress_news_images = True
    compress_news_images_auto_size = 5
    remove_attributes = ['style']
    remove_tags = [
        dict(attrs={'aria-label': 'tools'.split()}),
        dict(attrs={'aria-label': lambda x: x and 'New York Times Logo' in x}),
        dict(href='#site-content #site-index'.split()),
        dict(attrs={'aria-hidden': 'true'}),
        dict(attrs={'data-videoid': True}),
        dict(name='button meta link'.split()),
        dict(id=lambda x: x and x.startswith('story-ad-')),
        dict(name='head'),
        dict(role='toolbar'),
        dict(name='a', href=lambda x: x and '#story-continues-' in x),
        dict(name='a', href=lambda x: x and '#whats-next' in x),
        dict(id=lambda x: x and 'sharetools-' in x),
        dict(id='newsletter-promo supported-by-ad bottom-wrapper'.split()),
        classes('story-print-citation supported-by accessibility-ad-header visually-hidden bottom-of-article ad'),
        dict(attrs={'class': lambda x: x and (
            'SectionBar' in x or
            'recirculation' in x or
            'ResponsiveAd' in x or
            'accessibility-visuallyHidden' in x or
            'RelatedCoverage' in x)}),
    ]

    def preprocess_html(self, soup):
        article = soup.find(id='story')
        # The NYT is apparently A/B testing a new page layout
        has_supplemental = article is not None and article.find(
            **classes('story-body-supplemental')) is not None
        if has_supplemental:
            keep_only_tags = [
                dict(id='story-header'),
                classes('story-body-supplemental story-interrupter'),
            ]
        else:
            keep_only_tags = [
                dict(id='story'),
            ]
        # Rebuild the <body> so that it contains only the parts of the page
        # we want to keep, in the order given by keep_only_tags
        body = Tag(soup, 'body')
        for spec in keep_only_tags:
            for tag in soup.find('body').findAll(**spec):
                body.insert(len(body.contents), tag)
        soup.find('body').replaceWith(body)

        # Add a space to the dateline
        t = soup.find(**classes('dateline'))
        if t is not None:
            t.insert(0, ' ')

        # Remove empty li tags
        for li in soup.findAll('li', attrs={'class': lambda x: x and x.startswith('css-')}):
            if not li.contents and not li.string:
                li.extract()

        # Ensure the headline is first
        h1 = soup.find('h1', itemprop='headline')
        if h1 is not None:
            h1.extract()
            soup.find('body').contents.insert(0, h1)

        return soup

    def read_nyt_metadata(self):
        INDEX = 'https://www.nytimes.com/section/todayspaper'
        # INDEX = 'file:///t/raw.html'
        soup = self.index_to_soup(INDEX)
        pdate = soup.find('meta', attrs={'name': 'pdate', 'content': True})['content']
        date = strptime(pdate, '%Y%m%d', assume_utc=False, as_utc=False)
        self.cover_url = 'https://static01.nyt.com/images/{}/nytfrontpage/scan.jpg'.format(
            date.strftime('%Y/%m/%d'))
        self.timefmt = strftime(' [%d %b, %Y]', date)
        return soup
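
    # For reference: a pdate of '20180531' (hypothetical value) would yield
    # a cover URL of
    #   https://static01.nyt.com/images/2018/05/31/nytfrontpage/scan.jpg
    # i.e. the scan of that day's printed front page.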

    def parse_todays_sections(self, container):
        for h2 in container.findAll('h2', **classes('headline')):
            title = self.tag_to_string(h2)
            a = h2.find('a', href=True)
            url = a['href']
            if '?' in url:
                url = url.split('?')[0]
            p = h2.findParent(**classes('story-body'))
            desc = ''
            if p is not None:
                s = p.find(**classes('summary'))
                if s is not None:
                    desc = self.tag_to_string(s)
            date = ''
            d = date_from_url(url)
            if d is not None:
                date = format_date(d)
            self.log('\t', title + date, ': ', url)
            self.log('\t\t', desc)
            yield {'title': title, 'url': url, 'description': desc, 'date': date}

    def parse_todays_page(self):
        soup = self.read_nyt_metadata()
        section = soup.find(id=lambda x: x and x.startswith('collection-todays-new-york-times'))
        feeds = []
        for i, h1 in enumerate(section.findAll('h1')):
            if i == 0:
                # skip the first h1, which is the page heading, not a section
                continue
            section_title = self.tag_to_string(h1)
            self.log('Found section:', section_title)
            if i == 1:
                # the first section's articles are split across its parent
                # container and the following <div>
                container = h1.parent
                articles = list(self.parse_todays_sections(container))
                articles += list(self.parse_todays_sections(container.findNextSibling('div')))
            else:
                articles = list(self.parse_todays_sections(h1.findNextSibling('ol')))
            if articles:
                feeds.append((section_title, articles))
        return feeds

    def parse_highlights(self, container):
        for article in container.findAll('article', **classes('story')):
            h2 = article.find('h2')
            if h2 is not None:
                title = self.tag_to_string(h2)
                a = h2.find('a', href=True)
                if a is not None:
                    url = a['href']
                    desc = ''
                    p = article.find(**classes('summary'))
                    if p is not None:
                        desc = self.tag_to_string(p)
                    date = ''
                    d = date_from_url(url)
                    if d is not None:
                        date = format_date(d)
                        today = datetime.date.today()
                        delta = today - d
                        if delta.days > oldest_web_edition_article:
                            self.log.debug('\tSkipping article', title, 'as it is too old')
                            continue
                    yield {'title': title, 'url': url, 'description': desc, 'date': date}

    def parse_web_section(self, soup, slug):

        def log(article):
            self.log('\t', article['title'] + article['date'], ':', article['url'])
            if article.get('description'):
                self.log('\t\t', article['description'])

        container = soup.find(itemtype='http://schema.org/CollectionPage')
        highlights = container.find('section', **classes('highlights'))
        for article in self.parse_highlights(highlights):
            log(article)
            yield article
        extra = container.find('section', attrs={'data-collection-type': True})
        if extra is not None:
            title = self.tag_to_string(extra.find('h2'))
            for article in self.parse_highlights(extra):
                article['title'] = '{}: {}'.format(title, article['title'])
                log(article)
                yield article

    def parse_web_sections(self):
        self.read_nyt_metadata()
        feeds = []
        for section_title, slug in web_sections:
            url = 'https://www.nytimes.com/section/' + slug
            try:
                soup = self.index_to_soup(url)
            except Exception:
                self.log.error('Failed to download section:', url)
                continue
            self.log('Found section:', section_title)
            articles = list(self.parse_web_section(soup, slug))
            if articles:
                feeds.append((section_title, articles))
            if self.test and len(feeds) >= self.test[0]:
                break
        return feeds

    def parse_index(self):
        # Uncomment to debug the download of a single, known article:
        # return [('All articles', [
        #     {'title': 'XXXXX', 'url': 'https://www.nytimes.com/2018/05/31/theater/best-25-plays-how-we-made-the-list.html'},
        # ])]
        if is_web_edition:
            return self.parse_web_sections()
        return self.parse_todays_page()
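
    # A minimal sketch of how to exercise this recipe from the command line
    # (assuming it is saved as nytimes.recipe; any filename works), using
    # calibre's standard recipe test invocation:
    #   ebook-convert nytimes.recipe output.epub --test -vv
    # --test downloads only a couple of articles from at most two feeds,
    # which is enough to check parse_index() and preprocess_html() without
    # fetching the whole paper.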

    # The NYT occasionally returns bogus articles, possibly because of
    # cookies, so don't store cookies
    def get_browser(self, *args, **kwargs):
        return self

    def clone_browser(self, *args, **kwargs):
        return self.get_browser()

    def open_novisit(self, *args, **kwargs):
        from calibre import browser
        br = browser()
        response = br.open_novisit(*args, **kwargs)
        # Debugging code, to dump the raw HTML of mis-served articles:
        # headers = response.info()
        # if headers.get('X-PageType') == 'vi-story':
        #     import tempfile
        #     with tempfile.NamedTemporaryFile(suffix='.html', dir='/t/n', delete=False) as f:
        #         f.write(response.read())
        #     import time
        #     time.sleep(1)
        #     br = browser()
        #     response = br.open_novisit(*args, **kwargs)
        return response

    open = open_novisit
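
# Note on the browser overrides above: get_browser() returns the recipe
# object itself, and open_novisit() builds a brand-new calibre browser for
# every request, so each fetch starts with an empty cookie jar; that is the
# whole point of the cookie workaround.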