nytimes.com is apparently doing a lot of bot detection. Let's see if getting it from the Internet Archive works better. This goes via a caching proxy so that any article is only downloaded once from the archive regardless of how many users download using the recipe. This ensures the archive is not unduly loaded.
#!/usr/bin/env python
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2018, Kovid Goyal <kovid at kovidgoyal.net>

from __future__ import absolute_import, division, print_function, unicode_literals

import datetime
import json
import re
from pprint import pprint  # noqa

from calibre import strftime
from calibre.ebooks.BeautifulSoup import Tag
from calibre.utils.date import strptime
from calibre.web.feeds.news import BasicNewsRecipe
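
# Configuration: set is_web_edition to True to build the book from the
# website's section pages instead of today's paper. use_wayback_machine
# fetches pages via the Internet Archive (through a caching proxy) to work
# around bot detection on nytimes.com.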
is_web_edition = False
oldest_web_edition_article = 7  # days
use_wayback_machine = True

# The sections to download when downloading the web edition; comment out
# the sections you are not interested in
web_sections = [
    ('World', 'world'),
    ('U.S.', 'us'),
    ('Politics', 'politics'),
    ('New York', 'nyregion'),
    ('Business', 'business'),
    ('Technology', 'technology'),
    ('Sports', 'sports'),
    ('Science', 'science'),
    ('Health', 'health'),
    ('Opinion', 'opinion'),
    ('Arts', 'arts'),
    ('Books', 'books'),
    ('Movies', 'movies'),
    ('Music', 'arts/music'),
    ('Television', 'arts/television'),
    ('Style', 'style'),
    ('Dining & Wine', 'dining'),
    ('Fashion & Style', 'fashion'),
    ('Home & Garden', 'garden'),
    ('Travel', 'travel'),
    ('Education', 'education'),
    ('Multimedia', 'multimedia'),
    ('Obituaries', 'obituaries'),
    ('Sunday Magazine', 'magazine'),
]

url_date_pat = re.compile(r'/(2\d\d\d)/(\d\d)/(\d\d)/')
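

# NYT article URLs embed the publication date as /YYYY/MM/DD/; return it as
# a datetime.date, or None when the URL has no date component.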
def date_from_url(url):
    m = url_date_pat.search(url)
    if m is not None:
        return datetime.date(*map(int, m.groups()))
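

# Render a date suffix for article titles, falling back to a numeric format
# if the platform's strftime rejects the named-date format.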
def format_date(d):
    try:
        return strftime(' [%a, %d %b %Y]', d)
    except Exception:
        return strftime(' [%Y/%m/%d]', d)
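

# Build a BeautifulSoup attribute matcher that selects tags carrying any of
# the given space-separated CSS classes.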
def classes(classes):
    q = frozenset(classes.split(' '))
    return dict(attrs={
        'class': lambda x: x and frozenset(x.split()).intersection(q)})
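

# Create a tag in a way that works with both old and new BeautifulSoup APIs
# (soup.new_tag when available, the Tag constructor otherwise).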
def new_tag(soup, name, attrs=()):
    impl = getattr(soup, 'new_tag', None)
    if impl is not None:
        return impl(name, attrs=dict(attrs))
    return Tag(soup, name, attrs=attrs or None)


class NewYorkTimes(BasicNewsRecipe):

    if is_web_edition:
        title = 'The New York Times (Web)'
        description = 'New York Times (Web). You can edit the recipe to remove sections you are not interested in.'
    else:
        title = 'The New York Times'
        description = 'Today\'s New York Times'

    encoding = 'utf-8'
    __author__ = 'Kovid Goyal'
    language = 'en'
    ignore_duplicate_articles = {'title', 'url'}
    no_stylesheets = True
    compress_news_images = True
    compress_news_images_auto_size = 5
    conversion_options = {'flow_size': 0}
    delay = 0 if use_wayback_machine else 1
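
    # The NYT parser lives in a module loaded via calibre.live, so it can be
    # updated without waiting for a new calibre release; load it lazily and
    # cache it on the instance.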
    @property
    def nyt_parser(self):
        ans = getattr(self, '_nyt_parser', None)
        if ans is None:
            from calibre.live import load_module
            self._nyt_parser = ans = load_module('calibre.web.site_parsers.nytimes')
        return ans
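
    # Fetch a page either through the Wayback Machine (when enabled) or
    # directly from nytimes.com.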
    def get_nyt_page(self, url):
        if use_wayback_machine:
            from calibre import browser
            return self.nyt_parser.download_url(url, browser())
        return self.browser.open_novisit(url).read()

    def preprocess_raw_html(self, raw_html, url):
        html = self.nyt_parser.extract_html(self.index_to_soup(raw_html))
        return html

    articles_are_obfuscated = use_wayback_machine
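
    # When downloading via the Wayback Machine, fetch each article ourselves
    # and hand calibre a local file instead of letting it fetch the URL
    # directly.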
    if use_wayback_machine:
        def get_obfuscated_article(self, url):
            from calibre.ptempfile import PersistentTemporaryFile
            with PersistentTemporaryFile() as tf:
                tf.write(self.get_nyt_page(url))
            return tf.name

    def read_todays_paper(self):
        INDEX = 'https://www.nytimes.com/section/todayspaper'
        # INDEX = 'file:///t/raw.html'
        return self.index_to_soup(self.get_nyt_page(INDEX))
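
    # Read the publication date from today's paper page and use it to set the
    # cover (the scanned front page) and the timestamp in the book title.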
    def read_nyt_metadata(self):
        soup = self.read_todays_paper()
        pdate = soup.find('meta', attrs={'name': 'pdate', 'content': True})['content']
        date = strptime(pdate, '%Y%m%d', assume_utc=False, as_utc=False)
        self.cover_url = 'https://static01.nyt.com/images/{}/nytfrontpage/scan.jpg'.format(date.strftime('%Y/%m/%d'))
        self.timefmt = strftime(' [%d %b, %Y]', date)
        return soup
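
    # Today's paper page embeds its data as JSON in window.__preloadedData;
    # parse it to recover the section containers and the articles each one
    # holds.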
    def parse_todays_page(self):
        soup = self.read_nyt_metadata()
        script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0]
        script = type(u'')(script)
        json_data = script[script.find('{'):script.rfind(';')].strip().rstrip(';')
        data = json.loads(json_data.replace(':undefined', ':null'))['initialState']
        containers, sections = {}, {}
        article_map = {}
        gc_pat = re.compile(r'groupings.(\d+).containers.(\d+)')
        pat = re.compile(r'groupings.(\d+).containers.(\d+).relations.(\d+)')
        for key in data:
            if 'Article' in key:
                adata = data[key]
                if adata.get('__typename') == 'Article':
                    url = adata.get('url')
                    summary = adata.get('summary')
                    headline = adata.get('headline')
                    if url and headline:
                        title = data[headline['id']]['default']
                        article_map[adata['id']] = {
                            'title': title, 'url': url, 'description': summary or ''}
            elif 'Legacy' in key:
                sdata = data[key]
                tname = sdata.get('__typename')
                if tname == 'LegacyCollectionContainer':
                    m = gc_pat.search(key)
                    containers[int(m.group(2))] = sdata['label'] or sdata['name']
                elif tname == 'LegacyCollectionRelation':
                    m = pat.search(key)
                    grouping, container, relation = map(int, m.groups())
                    asset = sdata['asset']
                    if asset and asset['typename'] == 'Article' and grouping == 0:
                        if container not in sections:
                            sections[container] = []
                        sections[container].append(asset['id'].split(':', 1)[1])

        feeds = []
        for container_num in sorted(containers):
            section_title = containers[container_num]
            if container_num in sections:
                articles = sections[container_num]
                if articles:
                    feeds.append((section_title, []))
                    for artid in articles:
                        if artid in article_map:
                            art = article_map[artid]
                            feeds[-1][1].append(art)

        def skey(x):
            name = x[0].strip()
            if name == 'The Front Page':
                return 0, ''
            return 1, name.lower()
        feeds.sort(key=skey)
        for section, articles in feeds:
            self.log('\n' + section)
            for article in articles:
                self.log(article['title'] + ' - ' + article['url'])
        # raise SystemExit(1)
        return feeds
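
    # Walk the <li> article cards in a web edition section listing, yielding
    # title/url/description/date for each and skipping articles older than
    # oldest_web_edition_article days.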
    def parse_article_group(self, container):
        for li in container.findAll('li'):
            article = li.find('article')
            if article is None:  # skip list items that are not article cards
                continue
            h2 = article.find('h2')
            if h2 is not None:
                title = self.tag_to_string(h2)
                a = h2.find('a', href=True)
                if a is not None:
                    url = a['href']
                    if url.startswith('/'):
                        url = 'https://www.nytimes.com' + url
                    desc = ''
                    p = h2.findNextSibling('p')
                    if p is not None:
                        desc = self.tag_to_string(p)
                    date = ''
                    d = date_from_url(url)
                    if d is not None:
                        date = format_date(d)
                        today = datetime.date.today()
                        delta = today - d
                        if delta.days > oldest_web_edition_article:
                            self.log.debug('\tSkipping article', title, 'as it is too old')
                            continue
                    yield {'title': title, 'url': url, 'description': desc, 'date': date}

    def parse_web_section(self, soup, slug):

        def log(article):
            self.log('\t', article['title'] + article['date'], ':', article['url'])
            if article.get('description'):
                self.log('\t\t', article['description'])

        cid = slug.split('/')[-1]
        if cid == 'dining':
            cid = 'food'
        try:
            container = soup.find(id='collection-{}'.format(cid)).find('section')
        except AttributeError:
            container = None
        if container is None:
            raise ValueError('Failed to find articles container for slug: {}'.format(slug))
        for ol in container.findAll('ol'):
            for article in self.parse_article_group(ol):
                log(article)
                yield article
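
    # Build the feed list for the web edition by downloading each configured
    # section page; sections that fail to download are logged and skipped.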
    def parse_web_sections(self):
        self.read_nyt_metadata()
        feeds = []
        for section_title, slug in web_sections:
            url = 'https://www.nytimes.com/section/' + slug
            try:
                soup = self.index_to_soup(self.get_nyt_page(url))
            except Exception:
                self.log.error('Failed to download section:', url)
                continue
            self.log('Found section:', section_title)
            articles = list(self.parse_web_section(soup, slug))
            if articles:
                feeds.append((section_title, articles))
            if self.test and len(feeds) >= self.test[0]:
                break
        return feeds

    def parse_index(self):
        # return [('All articles', [
        #     {'title': 'XXXXX', 'url': 'https://www.nytimes.com/2020/11/27/world/americas/coronavirus-migrants-venezuela.html'},
        # ])]
        if is_web_edition:
            return self.parse_web_sections()
        return self.parse_todays_page()