Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-09 03:04:10 -04:00)
Update NYTimes Web Edition for website changes
Commit 6cdd57289b (parent 1bf1265b1b)
@@ -8,13 +8,20 @@ import datetime
 import json
 import re
 
+import mechanize
+
 from calibre import strftime
 from calibre.ebooks.BeautifulSoup import Tag
 from calibre.utils.date import strptime
 from calibre.web.feeds.news import BasicNewsRecipe
+from polyglot.urllib import urlencode
 
 use_wayback_machine = False
 
+# This is an Apollo persisted query hash which you can get
+# from looking at the XHR requests made by: https://www.nytimes.com/section/todayspaper
+# or by https://www.nytimes.com/section/world
+persistedQuery = '1f99120a11e94dd62a9474f68ee1255537ee3cf7eac20a0377819edb2fa1fef7'
 
 # The sections to download when downloading the web edition, comment out
 # the section you are not interested in
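An Apollo persisted query identifies the GraphQL document by its SHA-256 hash instead of shipping the query text with every request: the client sends only operationName, the JSON-encoded variables, and an extensions object carrying the hash, and the server looks the stored query up by that hash. Below is a minimal sketch of the GET URL that the new nyt_graphql_query method (added further down) assembles; the endpoint here is a placeholder and the variables are abridged, since the recipe reads the real endpoint and request headers from the page config at runtime.

import json
from urllib.parse import urlencode

# The hash introduced above; the endpoint is a stand-in -- the recipe reads the
# real one from the page's window.__preloadedData config ('gqlUrlClient').
persistedQuery = '1f99120a11e94dd62a9474f68ee1255537ee3cf7eac20a0377819edb2fa1fef7'
GQL_URL = 'https://example.com/graphql'


def persisted_query_url(qid, operation_name='CollectionsQuery'):
    # Only the hash of the GraphQL document travels with the request, inside the
    # 'extensions' parameter; the variables identify the collection to fetch.
    params = {
        'operationName': operation_name,
        'variables': json.dumps({'id': qid, 'first': 10}, separators=(',', ':')),  # abridged
        'extensions': json.dumps(
            {'persistedQuery': {'version': 1, 'sha256Hash': persistedQuery}},
            separators=(',', ':')),
    }
    return GQL_URL + '?' + urlencode(params)


# For example: persisted_query_url('/section/world') or
# persisted_query_url('/issue/todayspaper/2025/01/01/todays-new-york-times')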
@@ -76,7 +83,7 @@ def new_tag(soup, name, attrs=()):
 class NewYorkTimes(BasicNewsRecipe):
     title = 'The New York Times (Web)'
     description = (
         'New York Times (Web). You can edit the recipe to remove sections you are not interested in. '
         'Use advanced menu to make changes to fetch Todays Paper'
     )
     encoding = 'utf-8'
@@ -169,169 +176,83 @@ class NewYorkTimes(BasicNewsRecipe):
             self.compress_news_images = True
 
     def read_todays_paper(self):
-        INDEX = 'https://www.nytimes.com/section/todayspaper'
-        # INDEX = 'file:///t/raw.html'
-        d = self.recipe_specific_options.get('date')
-        if d and isinstance(d, str):
-            INDEX = 'https://www.nytimes.com/issue/todayspaper/' + d + '/todays-new-york-times'
-        return self.index_to_soup(self.get_nyt_page(INDEX, skip_wayback=True))
+        pdate = self.recipe_specific_options.get('date')
+        templ = 'https://www.nytimes.com/issue/todayspaper/{}/todays-new-york-times'
+        if pdate and isinstance(pdate, str):
+            return pdate, self.index_to_soup(templ.format(pdate))
+        # Cant figure out how to get the date so just try todays and yesterdays dates
+        date = datetime.date.today()
+        pdate = date.strftime('%Y/%m/%d')
+        try:
+            soup = self.index_to_soup(templ.format(pdate))
+        except Exception as e:
+            if getattr(e, 'code', None) == 404:
+                date -= datetime.timedelta(days=1)
+                pdate = date.strftime('%Y/%m/%d')
+                soup = self.index_to_soup(templ.format(pdate))
+            else:
+                raise
+        self.log("Using today's paper from:", pdate)
+        return pdate, soup
 
     def read_nyt_metadata(self):
-        soup = self.read_todays_paper()
-        pdate = soup.find('meta', attrs={'name':'pdate', 'content': True})['content']
-        date = strptime(pdate, '%Y%m%d', assume_utc=False, as_utc=False)
-        self.cover_url = 'https://static01.nyt.com/images/{}/nytfrontpage/scan.jpg'.format(date.strftime('%Y/%m/%d'))
+        pdate, soup = self.read_todays_paper()
+        date = strptime(pdate, '%Y/%m/%d', assume_utc=False, as_utc=False)
+        self.cover_url = 'https://static01.nyt.com/images/{}/nytfrontpage/scan.jpg'.format(pdate)
         self.timefmt = strftime(' [%d %b, %Y]', date)
-        return soup
-
-    def parse_todays_page(self):
-        soup = self.read_nyt_metadata()
+        self.nytimes_publication_date = pdate
         script = soup.findAll('script', text=lambda x: x and 'window.__preloadedData' in x)[0]
         script = type(u'')(script)
-        json_data = script[script.find('{'):script.rfind(';')].strip().rstrip(';')
-        data = json.loads(json_data.replace(':undefined', ':null'))['initialState']
-        article_map = {}
-        sections = []
-        for key in data:
-            if 'Article' in key:
-                adata = data[key]
-                if adata.get('__typename') == 'Article':
-                    url = adata.get('url')
-                    summary = adata.get('summary')
-                    headline = adata.get('headline')
-                    if url and headline:
-                        title = headline['default']
-                        article_map[adata['id']] = {
-                            'title': title, 'url': url, 'description': summary or ''}
-            elif 'LegacyCollection:' in key:
-                lc = data[key]
-                if not lc.get('active'):
-                    continue
-                for sdata in lc['groupings']:
-                    tname = sdata.get('__typename')
-                    if tname != 'LegacyCollectionGrouping':
-                        continue
-                    for cont in sdata['containers']:
-                        if cont.get('__typename') == 'LegacyCollectionContainer':
-                            section_name = cont['label@stripHtml']
-                            articles = []
-                            for rel in cont['relations']:
-                                if rel.get('__typename') == 'LegacyCollectionRelation':
-                                    asset = rel['asset']['__ref']
-                                    if asset.startswith('Article:'):
-                                        articles.append(asset.partition(':')[2])
-                            if articles:
-                                sections.append((section_name, articles))
-
-        feeds = []
-        for section_title, article_ids in sections:
-            articles = []
-            for aid in article_ids:
-                if aid in article_map:
-                    art = article_map[aid]
-                    articles.append(art)
-            if articles:
-                feeds.append((section_title, articles))
-
-        def skey(x):
-            name = x[0].strip()
-            if name == 'The Front Page':
-                return 0, ''
-            return 1, name.lower()
-        feeds.sort(key=skey)
-        for section, articles in feeds:
-            self.log('\n' + section)
-            for article in articles:
-                self.log(article['title'] + ' - ' + article['url'])
-        # raise SystemExit(1)
-        return feeds
-
-    def parse_article_group(self, container):
-        for li in container.findAll('li'):
-            article = li.find('article')
-            if article is None:
-                a = li.find('a', href=True)
-                if a is not None:
-                    title = self.tag_to_string(li.find(['h3', 'h2'])).strip()
-                    paras = li.findAll('p')
-                    if not title:
-                        title = self.tag_to_string(paras[0]).strip()
-                    if not title:
-                        raise ValueError('No title found in article')
-                    url = a['href']
-                    if url.startswith('/'):
-                        url = 'https://www.nytimes.com' + url
-                    desc = ''
-                    if len(paras) > 0:
-                        desc = self.tag_to_string(paras[-1])
-                    date = ''
-                    d = date_from_url(url)
-                    if d is not None:
-                        date = format_date(d)
-                        today = datetime.date.today()
-                        delta = today - d
-                        if delta.days > self.oldest_web_edition_article:
-                            self.log.debug('\tSkipping article', title, 'as it is too old')
-                            continue
-                    yield {'title': title, 'url': url, 'description': desc, 'date': date}
-                continue
-            h2 = article.find(['h2', 'h3'])
-            if h2 is not None:
-                title = self.tag_to_string(h2)
-                a = h2.find('a', href=True)
-                if a is not None:
-                    url = a['href']
-                    if url.startswith('/'):
-                        url = 'https://www.nytimes.com' + url
-                    desc = ''
-                    p = h2.findNextSibling('p')
-                    if p is not None:
-                        desc = self.tag_to_string(p)
-                    date = ''
-                    d = date_from_url(url)
-                    if d is not None:
-                        date = format_date(d)
-                        today = datetime.date.today()
-                        delta = today - d
-                        if delta.days > self.oldest_web_edition_article:
-                            self.log.debug('\tSkipping article', title, 'as it is too old')
-                            continue
-                    yield {'title': title, 'url': url, 'description': desc, 'date': date}
-
-    def parse_web_section(self, soup, slug):
-
-        def log(article):
-            self.log('\t', article['title'] + article['date'], ':', article['url'])
-            if article.get('description'):
-                self.log('\t\t', article['description'])
-
-        cid = slug.split('/')[-1]
-        if cid == 'dining':
-            cid = 'food'
-        try:
-            container = soup.find(id='collection-{}'.format(cid)).find('section')
-        except AttributeError:
-            container = None
-        if container is None:
-            raise ValueError('Failed to find articles container for slug: {}'.format(slug))
-        for ol in container.findAll('ol'):
-            for article in self.parse_article_group(ol):
-                log(article)
-                yield article
+        json_data = script[script.find('{'):script.rfind(';')].strip().rstrip(';')  # }}
+        self.nytimes_graphql_config = json.loads(json_data.replace(':undefined', ':null'))['config']
+        return soup
+
+    def nyt_graphql_query(self, qid, operationName='CollectionsQuery'):
+        query = {
+            'operationName': operationName,
+            'variables': json.dumps({
+                'id': qid,
+                'first': 10,
+                'exclusionMode': 'HIGHLIGHTS_AND_EMBEDDED',
+                'isFetchMore':False,
+                'isTranslatable':False,
+                'isEspanol':False,
+                'highlightsListUri':'nyt://per/personalized-list/__null__',
+                'highlightsListFirst':0,
+                'hasHighlightsList':False
+            }, separators=',:'),
+            'extensions': json.dumps({
+                'persistedQuery': {
+                    'version':1,
+                    'sha256Hash': persistedQuery,
+                },
+            }, separators=',:')
+        }
+        url = self.nytimes_graphql_config['gqlUrlClient'] + '?' + urlencode(query)
+        br = self.browser
+        # br.set_debug_http(True)
+        headers = dict(self.nytimes_graphql_config['gqlRequestHeaders'])
+        headers['Accept'] = 'application/json'
+        req = mechanize.Request(url, headers=headers)
+        raw = br.open(req).read()
+        # open('/t/raw.json', 'wb').write(raw)
+        return json.loads(raw)
+
+    def parse_todays_page(self):
+        self.read_nyt_metadata()
+        query_id = '/issue/todayspaper/{}/todays-new-york-times'.format(self.nytimes_publication_date)
+        data = self.nyt_graphql_query(query_id)
+        return parse_todays_page(data, self.log)
 
     def parse_web_sections(self):
         self.read_nyt_metadata()
         feeds = []
         for section_title, slug in web_sections:
-            url = 'https://www.nytimes.com/section/' + slug
-            try:
-                soup = self.index_to_soup(self.get_nyt_page(url))
-            except Exception:
-                self.log.error('Failed to download section:', url)
-                continue
-            self.log('Found section:', section_title)
-            articles = list(self.parse_web_section(soup, slug))
+            query_id = '/section/' + slug
+            data = self.nyt_graphql_query(query_id)
+            articles = parse_web_section(data)
             if articles:
+                self.log('Found section:', section_title)
                 feeds.append((section_title, articles))
             if self.test and len(feeds) >= self.test[0]:
                 break
@@ -372,3 +293,47 @@ class NewYorkTimes(BasicNewsRecipe):
         if not re.search(r'/video/|/athletic/|/card/', url):
             return url
         self.log('\tSkipping ', url)
+
+
+def asset_to_article(asset):
+    title = asset['headline']['default']
+    return {'title': title, 'url': asset['url'], 'description': asset['summary']}
+
+
+def parse_todays_page(data, log=print):
+    containers = data['data']['legacyCollection']['groupings'][0]['containers']
+    feeds = []
+    for cont in containers:
+        if cont['__typename'] != 'LegacyCollectionContainer':
+            continue
+        section_name = cont['label'].strip()
+        if not section_name:
+            continue
+        log(section_name)
+        articles = []
+        for rel in cont['relations']:
+            if rel.get('__typename') == 'LegacyCollectionRelation':
+                asset = rel['asset']
+                if asset['__typename'] == 'Article':
+                    articles.append(asset_to_article(asset))
+                    log(' ', articles[-1]['title'] + ':', articles[-1]['url'])
+        if articles:
+            feeds.append((section_name, articles))
+    return feeds
+
+
+def parse_web_section(data, log=print):
+    articles = []
+    containers = data['data']['legacyCollection']['collectionsPage']['embeddedCollections']
+    for cont in containers:
+        for s in cont['stream']['edges']:
+            asset = s['node']
+            if asset['__typename'] == 'Article':
+                articles.append(asset_to_article(asset))
+                log(' ', articles[-1]['title'] + ':', articles[-1]['url'])
+    return articles
+
+
+if __name__ == '__main__':
+    import sys
+    parse_web_section(json.loads(open(sys.argv[-1], 'rb').read()))
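The module-level parse_todays_page and parse_web_section helpers added above each walk a different response shape. Below is a hypothetical, heavily abridged illustration of both, using only the keys the parsers actually read; every value is invented.

# Shape consumed by parse_todays_page(data): a LegacyCollection grouping whose
# containers hold labelled relations to Article assets.
todays_paper_data = {'data': {'legacyCollection': {'groupings': [{'containers': [{
    '__typename': 'LegacyCollectionContainer',
    'label': 'The Front Page',
    'relations': [{
        '__typename': 'LegacyCollectionRelation',
        'asset': {
            '__typename': 'Article',
            'headline': {'default': 'An example headline'},
            'url': 'https://www.nytimes.com/2025/01/01/example.html',
            'summary': 'An example summary',
        },
    }],
}]}]}}}

# Shape consumed by parse_web_section(data): embedded collections whose stream
# edges wrap the same Article assets.
web_section_data = {'data': {'legacyCollection': {'collectionsPage': {'embeddedCollections': [{
    'stream': {'edges': [{'node': {
        '__typename': 'Article',
        'headline': {'default': 'An example headline'},
        'url': 'https://www.nytimes.com/2025/01/01/example.html',
        'summary': 'An example summary',
    }}]},
}]}}}}

# parse_todays_page(todays_paper_data) -> [('The Front Page', [{'title': ..., 'url': ..., 'description': ...}])]
# parse_web_section(web_section_data)  -> [{'title': ..., 'url': ..., 'description': ...}]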
@@ -18,6 +18,10 @@ from polyglot.urllib import urlencode
 
 use_wayback_machine = False
 
+# This is an Apollo persisted query hash which you can get
+# from looking at the XHR requests made by: https://www.nytimes.com/section/todayspaper
+# or by https://www.nytimes.com/section/world
+persistedQuery = '1f99120a11e94dd62a9474f68ee1255537ee3cf7eac20a0377819edb2fa1fef7'
 
 # The sections to download when downloading the web edition, comment out
 # the section you are not interested in
@@ -203,12 +207,11 @@ class NewYorkTimes(BasicNewsRecipe):
         self.nytimes_graphql_config = json.loads(json_data.replace(':undefined', ':null'))['config']
         return soup
 
-    def parse_todays_page(self):
-        self.read_nyt_metadata()
+    def nyt_graphql_query(self, qid, operationName='CollectionsQuery'):
         query = {
-            'operationName': 'CollectionsQuery',
+            'operationName': operationName,
             'variables': json.dumps({
-                'id': '/issue/todayspaper/{}/todays-new-york-times'.format(self.nytimes_publication_date),
+                'id': qid,
                 'first': 10,
                 'exclusionMode': 'HIGHLIGHTS_AND_EMBEDDED',
                 'isFetchMore':False,
@@ -221,9 +224,7 @@ class NewYorkTimes(BasicNewsRecipe):
             'extensions': json.dumps({
                 'persistedQuery': {
                     'version':1,
-                    # This is an Apollo persisted query hash which you can get
-                    # from looking at the XHR requests made by: https://www.nytimes.com/section/todayspaper
-                    'sha256Hash': '1f99120a11e94dd62a9474f68ee1255537ee3cf7eac20a0377819edb2fa1fef7',
+                    'sha256Hash': persistedQuery,
                 },
             }, separators=',:')
         }
@@ -235,94 +236,23 @@ class NewYorkTimes(BasicNewsRecipe):
         req = mechanize.Request(url, headers=headers)
         raw = br.open(req).read()
         # open('/t/raw.json', 'wb').write(raw)
-        return parse_todays_page(json.loads(raw), self.log)
-
-    def parse_article_group(self, container):
-        for li in container.findAll('li'):
-            article = li.find('article')
-            if article is None:
-                a = li.find('a', href=True)
-                if a is not None:
-                    title = self.tag_to_string(li.find(['h3', 'h2'])).strip()
-                    paras = li.findAll('p')
-                    if not title:
-                        title = self.tag_to_string(paras[0]).strip()
-                    if not title:
-                        raise ValueError('No title found in article')
-                    url = a['href']
-                    if url.startswith('/'):
-                        url = 'https://www.nytimes.com' + url
-                    desc = ''
-                    if len(paras) > 0:
-                        desc = self.tag_to_string(paras[-1])
-                    date = ''
-                    d = date_from_url(url)
-                    if d is not None:
-                        date = format_date(d)
-                        today = datetime.date.today()
-                        delta = today - d
-                        if delta.days > self.oldest_web_edition_article:
-                            self.log.debug('\tSkipping article', title, 'as it is too old')
-                            continue
-                    yield {'title': title, 'url': url, 'description': desc, 'date': date}
-                continue
-            h2 = article.find(['h2', 'h3'])
-            if h2 is not None:
-                title = self.tag_to_string(h2)
-                a = h2.find('a', href=True)
-                if a is not None:
-                    url = a['href']
-                    if url.startswith('/'):
-                        url = 'https://www.nytimes.com' + url
-                    desc = ''
-                    p = h2.findNextSibling('p')
-                    if p is not None:
-                        desc = self.tag_to_string(p)
-                    date = ''
-                    d = date_from_url(url)
-                    if d is not None:
-                        date = format_date(d)
-                        today = datetime.date.today()
-                        delta = today - d
-                        if delta.days > self.oldest_web_edition_article:
-                            self.log.debug('\tSkipping article', title, 'as it is too old')
-                            continue
-                    yield {'title': title, 'url': url, 'description': desc, 'date': date}
-
-    def parse_web_section(self, soup, slug):
-
-        def log(article):
-            self.log('\t', article['title'] + article['date'], ':', article['url'])
-            if article.get('description'):
-                self.log('\t\t', article['description'])
-
-        cid = slug.split('/')[-1]
-        if cid == 'dining':
-            cid = 'food'
-        try:
-            container = soup.find(id='collection-{}'.format(cid)).find('section')
-        except AttributeError:
-            container = None
-        if container is None:
-            raise ValueError('Failed to find articles container for slug: {}'.format(slug))
-        for ol in container.findAll('ol'):
-            for article in self.parse_article_group(ol):
-                log(article)
-                yield article
+        return json.loads(raw)
+
+    def parse_todays_page(self):
+        self.read_nyt_metadata()
+        query_id = '/issue/todayspaper/{}/todays-new-york-times'.format(self.nytimes_publication_date)
+        data = self.nyt_graphql_query(query_id)
+        return parse_todays_page(data, self.log)
 
     def parse_web_sections(self):
         self.read_nyt_metadata()
         feeds = []
         for section_title, slug in web_sections:
-            url = 'https://www.nytimes.com/section/' + slug
-            try:
-                soup = self.index_to_soup(self.get_nyt_page(url))
-            except Exception:
-                self.log.error('Failed to download section:', url)
-                continue
-            self.log('Found section:', section_title)
-            articles = list(self.parse_web_section(soup, slug))
+            query_id = '/section/' + slug
+            data = self.nyt_graphql_query(query_id)
+            articles = parse_web_section(data)
             if articles:
+                self.log('Found section:', section_title)
                 feeds.append((section_title, articles))
             if self.test and len(feeds) >= self.test[0]:
                 break
@@ -365,6 +295,11 @@ class NewYorkTimes(BasicNewsRecipe):
         self.log('\tSkipping ', url)
 
 
+def asset_to_article(asset):
+    title = asset['headline']['default']
+    return {'title': title, 'url': asset['url'], 'description': asset['summary']}
+
+
 def parse_todays_page(data, log=print):
     containers = data['data']['legacyCollection']['groupings'][0]['containers']
     feeds = []
@@ -380,14 +315,25 @@ def parse_todays_page(data, log=print):
             if rel.get('__typename') == 'LegacyCollectionRelation':
                 asset = rel['asset']
                 if asset['__typename'] == 'Article':
-                    title = asset['headline']['default']
-                    articles.append({'title': title, 'url': asset['url'], 'description': asset['summary']})
-                    log(' ', title + ':', asset['url'])
+                    articles.append(asset_to_article(asset))
+                    log(' ', articles[-1]['title'] + ':', articles[-1]['url'])
         if articles:
             feeds.append((section_name, articles))
     return feeds
 
 
+def parse_web_section(data, log=print):
+    articles = []
+    containers = data['data']['legacyCollection']['collectionsPage']['embeddedCollections']
+    for cont in containers:
+        for s in cont['stream']['edges']:
+            asset = s['node']
+            if asset['__typename'] == 'Article':
+                articles.append(asset_to_article(asset))
+                log(' ', articles[-1]['title'] + ':', articles[-1]['url'])
+    return articles
+
+
 if __name__ == '__main__':
     import sys
     parse_todays_page(json.loads(open(sys.argv[-1], 'rb').read()))
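The __main__ hook above lets the new parsers be exercised offline against a captured response: temporarily uncomment the open('/t/raw.json', 'wb').write(raw) line in nyt_graphql_query, run one fetch, then run the recipe file directly with the saved JSON as its last argument. A rough sketch of the same loop, assuming a response has already been saved to the path used in that commented-out line:

import json

# Load a previously captured GraphQL response and see what the parser makes of it.
with open('/t/raw.json', 'rb') as f:
    data = json.loads(f.read())

for section_name, articles in parse_todays_page(data):
    print(section_name, '->', len(articles), 'articles')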