Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-10-24 23:38:55 -04:00)
Use JSON data for nytimes web sections as well
This commit is contained in:
parent 99f8c3cfec
commit b43f02fc82
@@ -23,32 +23,31 @@ persistedQuery = '1f99120a11e94dd62a9474f68ee1255537ee3cf7eac20a0377819edb2fa1fe
 # The sections to download when downloading the web edition, comment out
 # the section you are not interested in
 web_sections = [
-    ('World', 'world'),
-    ('U.S.', 'us'),
-    ('Politics', 'politics'),
-    ('New York', 'nyregion'),
-    ('Business', 'business'),
-    ('Technology', 'technology'),
-    ('Sports', 'sports'),
-    ('Science', 'science'),
-    ('Health', 'health'),
-    ('Opinion', 'opinion'),
-    ('Arts', 'arts'),
-    # ('Books', 'books'),
-    ('Movies', 'movies'),
-    ('Music', 'arts/music'),
-    ('Television', 'arts/television'),
-    ('Style', 'style'),
-    ('Dining & Wine', 'food'),
-    ('Fashion & Style', 'fashion'),
-    # ('Home & Garden', 'garden'),
-    ('Travel', 'travel'),
-    ('Education', 'education'),
-    ('Multimedia', 'multimedia'),
-    ('Obituaries', 'obituaries'),
-    ('Sunday Magazine', 'magazine')
+    'world',
+    'us',
+    'politics',
+    'nyregion',
+    'business',
+    'technology',
+    'sports',
+    'science',
+    'health',
+    'opinion',
+    'arts',
+    'books',
+    'movies',
+    'arts/music',
+    'arts/television',
+    'style',
+    'food',
+    'fashion',
+    'travel',
+    'education',
+    'multimedia',
+    'obituaries',
+    'magazine',
 ]
-# web_sections = [ ('Business', 'business'), ]
+# web_sections = [ 'business' ]
 url_date_pat = re.compile(r'/(2\d\d\d)/(\d\d)/(\d\d)/')


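With this hunk a web_sections entry is just the URL slug; the display title now comes from the section page's own embedded JSON (see parse_web_section further down). A trimmed list, along the lines of the updated comment in the diff, would look like this hypothetical example:

# Hypothetical customisation: download only two web sections.
# Titles are no longer hard-coded here; they are read from each
# section page's preloaded JSON.
web_sections = [
    'business',
    'technology',
]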
@@ -200,14 +199,14 @@ class NewYorkTimes(BasicNewsRecipe):

     def parse_web_sections(self):
         feeds = []
-        for section_title, slug in web_sections:
+        for slug in web_sections:
             url = 'https://www.nytimes.com/section/' + slug
             self.log('Download section index:', url)
             soup = self.index_to_soup(url)
             # with open('/t/raw.html', 'w') as f:
             #     f.write(str(soup))
+            section_title, articles = parse_web_section(soup)
             self.log('Section:', section_title)
-            articles = parse_web_section(soup)
             if articles:
                 feeds.append((section_title, articles))
                 for a in articles:
@@ -222,16 +221,16 @@ class NewYorkTimes(BasicNewsRecipe):
         # return [('All articles', [
         #     {'title': 'XXXXX', 'url': 'https://www.nytimes.com/2020/11/27/world/americas/coronavirus-migrants-venezuela.html'},
         # ])]
+        date, feeds = self.parse_todays_page()
+        pdate = date.strftime('%Y/%m/%d')
+        self.cover_url = 'https://static01.nyt.com/images/{}/nytfrontpage/scan.jpg'.format(pdate)
+        self.timefmt = strftime(' [%d %b, %Y]', date)
         if self.is_web_edition:
             return self.parse_web_sections()
-        date, feeds = self.parse_todays_page()
         for s, articles in feeds:
             self.log('Section:', s)
             for a in articles:
                 self.log('\t', a['title'], a['url'])
-        pdate = date.strftime('%Y/%m/%d')
-        self.cover_url = 'https://static01.nyt.com/images/{}/nytfrontpage/scan.jpg'.format(pdate)
-        self.timefmt = strftime(' [%d %b, %Y]', date)
         return feeds

     def get_browser(self, *args, **kwargs):
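The reordering above sets the print date, cover and timefmt before the is_web_edition early return, so the web edition now also gets the scanned front page as its cover. A rough illustration of what that format string produces, using the date that appears in the commented test URL (illustrative only, not part of the commit):

from datetime import date

# The cover is the scanned front page for the print date; for 27 Nov 2020
# the URL pattern used in the recipe expands to the value printed below.
pdate = date(2020, 11, 27).strftime('%Y/%m/%d')
cover_url = 'https://static01.nyt.com/images/{}/nytfrontpage/scan.jpg'.format(pdate)
print(cover_url)  # https://static01.nyt.com/images/2020/11/27/nytfrontpage/scan.jpg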
@@ -259,39 +258,7 @@ class NewYorkTimes(BasicNewsRecipe):
             self.log('\tSkipping ', url)


-def parse_web_section(soup):
-    seen = set()
-    ans = []
-
-    def handle_h3(h3):
-        if h3.parent.name == 'a':
-            href = h3.parent['href']
-            parent = h3.parent.parent
-        else:
-            href = h3.find('a')['href']
-            parent = h3.parent
-        if href.startswith('/video/') or href in seen:
-            return
-        seen.add(href)
-        title = h3.get_text(separator=' ', strip=True)
-        desc = ''
-        for p in parent.find_all('p'):
-            desc += p.get_text(separator=' ', strip=True)
-        ans.append({'title': title, 'url': absolutize_href(href), 'description': desc})
-
-    tuple(map(handle_h3, soup.find(id='collection-highlights-container').find_all('h3')))
-    tuple(map(handle_h3, soup.find(attrs={'data-testid': 'main-collection'}).find_all('h3')))
-    return ans
-
-
-def asset_to_article(asset):
-    title = asset['headline']['default']
-    return {'title': title, 'url': asset['url'], 'description': asset['summary']}
-
-
-def parse_todays_page(soup):
-    m = soup.find('meta', attrs={'name':'nyt-collection:uri'})['content'].split('/')
-    pdate = strptime('{}/{}/{}'.format(*m[-4:-1]), '%Y/%m/%d', assume_utc=False, as_utc=False)
+def preloaded_data(soup):
     from calibre.web.site_parsers.nytimes import clean_js_json
     candidates = soup.find_all('script', string=lambda x: x and 'window.__preloadedData' in x)
     script = candidates[0]
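preloaded_data() becomes the single place that pulls the window.__preloadedData blob out of a page. A rough stdlib-only sketch of the same idea, handy for inspecting a saved section page outside calibre; the regex and the 'initialState' key simply mirror the recipe code above, and clean_js_json is what the recipe actually uses to make the JavaScript literal parseable:

import json
import re

def preloaded_state(html):
    # Find the assignment the recipe looks for in the page's <script> tags.
    m = re.search(r'window\.__preloadedData\s*=\s*', html)
    if m is None:
        raise ValueError('window.__preloadedData not found')
    end = html.index('</script>', m.end())
    raw = html[m.end():end].strip().rstrip(';')
    # The recipe runs calibre's clean_js_json() here because the blob is a
    # JavaScript literal rather than strict JSON; json.loads() may still work
    # for quick inspection once the trailing semicolon is stripped.
    return json.loads(raw)['initialState']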
@@ -300,14 +267,53 @@ def parse_todays_page(soup):
     raw = clean_js_json(raw)
     # with open('/t/raw.json', 'w') as f:
     #     f.write(raw)
-    data = json.loads(raw)['initialState']
+    return json.loads(raw)['initialState']
+
+
+def asset_to_article(asset):
+    title = asset['headline']['default']
+    return {'title': title, 'url': asset['url'], 'description': asset['summary']}
+
+
+def parse_web_section(soup):
+    data = preloaded_data(soup)
     article_map = {}
+    for k, v in data.items():
+        if v['__typename'] == 'Article':
+            article_map[k] = asset_to_article(v)
+    articles = []
+    for k, v in data['ROOT_QUERY'].items():
+        if k.startswith('workOrLocation'):
+            c = data[v['__ref']]
+    section_title = c['name']
+    for k, v in c['collectionsPage'].items():
+        if k.startswith('stream'):
+            for k, v in v.items():
+                if k.startswith('edges'):
+                    for q in v:
+                        r = q['node']['__ref']
+                        if r.startswith('Article:'):
+                            articles.append(article_map[r])
+    if not articles:
+        for c in c['collectionsPage']['embeddedCollections']:
+            for e in c['stream']['edges']:
+                for k, v in e.items():
+                    if k.startswith('node'):
+                        articles.append(article_map[v['__ref']])
+    return section_title, articles
+
+
+def parse_todays_page(soup):
+    m = soup.find('meta', attrs={'name':'nyt-collection:uri'})['content'].split('/')
+    pdate = strptime('{}/{}/{}'.format(*m[-4:-1]), '%Y/%m/%d', assume_utc=False, as_utc=False)
+    article_map = {}
+    data = preloaded_data(soup)
     for k, v in data.items():
         if v['__typename'] == 'Article':
             article_map[k] = asset_to_article(v)
     feeds = []
-    for v in data['ROOT_QUERY'].values():
-        if isinstance(v, dict):
+    for k, v in data['ROOT_QUERY'].items():
+        if k.startswith('workOrLocation'):
             for g in data[v['__ref']]['groupings']:
                 for c in g['containers']:
                     articles = []
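parse_web_section() walks an Apollo-style normalized cache: every asset sits under an id key and is cross-referenced through {'__ref': ...} pointers, so resolving a section means chasing refs from ROOT_QUERY. A hypothetical miniature of that shape follows; all ids and argument strings are invented, and only the key prefixes and field names match what the code above reads:

# Hypothetical miniature of the cache carried by window.__preloadedData.
sample_state = {
    'ROOT_QUERY': {
        '__typename': 'Query',
        'workOrLocation({"id":"/section/world"})': {'__ref': 'LegacyCollection:1'},
    },
    'LegacyCollection:1': {
        '__typename': 'LegacyCollection',
        'name': 'World',
        'collectionsPage': {
            'stream({"first":10})': {
                'edges({"first":10})': [
                    {'node': {'__ref': 'Article:abc'}},
                ],
            },
            'embeddedCollections': [],
        },
    },
    'Article:abc': {
        '__typename': 'Article',
        'headline': {'default': 'Example headline'},
        'url': 'https://www.nytimes.com/2020/11/27/world/example.html',
        'summary': 'Example summary.',
    },
}

# Chasing the refs the way parse_web_section() does would yield:
#   ('World', [{'title': 'Example headline',
#               'url': 'https://www.nytimes.com/2020/11/27/world/example.html',
#               'description': 'Example summary.'}])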
@@ -326,7 +332,9 @@ if __name__ == '__main__':
         html = f.read()
     soup = BeautifulSoup(html)
     if is_web_edition:
-        pprint(parse_web_section(soup))
+        section_title, articles = parse_web_section(soup)
+        print(section_title)
+        pprint(articles)
     else:
         pdate, feeds = parse_todays_page(soup)
         print(pdate)