mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Update The New York Times
This commit is contained in:
parent
8b1ae42869
commit
fdbf44e3bd
@ -98,7 +98,7 @@ class NewYorkTimes(BasicNewsRecipe):
|
|||||||
dict(href='#site-content #site-index'.split()),
|
dict(href='#site-content #site-index'.split()),
|
||||||
dict(attrs={'aria-hidden':'true'}),
|
dict(attrs={'aria-hidden':'true'}),
|
||||||
dict(attrs={'data-videoid':True}),
|
dict(attrs={'data-videoid':True}),
|
||||||
dict(name='button meta link'.split()),
|
dict(name='button meta link time source'.split()),
|
||||||
dict(id=lambda x: x and x.startswith('story-ad-')),
|
dict(id=lambda x: x and x.startswith('story-ad-')),
|
||||||
dict(name='head'),
|
dict(name='head'),
|
||||||
dict(role='toolbar'),
|
dict(role='toolbar'),
|
||||||
@ -113,17 +113,20 @@ class NewYorkTimes(BasicNewsRecipe):
|
|||||||
|
|
||||||
def preprocess_html(self, soup):
|
def preprocess_html(self, soup):
|
||||||
article = soup.find(id='story')
|
article = soup.find(id='story')
|
||||||
# The NYT is apparently A/B testing a new page layout
|
if article is None:
|
||||||
has_supplemental = article is not None and article.find(**classes('story-body-supplemental')) is not None
|
keep_only_tags = [dict(attrs={'aria-label': 'Main content'})]
|
||||||
if has_supplemental:
|
|
||||||
keep_only_tags = [
|
|
||||||
dict(id='story-header'),
|
|
||||||
classes('story-body-supplemental story-interrupter'),
|
|
||||||
]
|
|
||||||
else:
|
else:
|
||||||
keep_only_tags = [
|
# The NYT is apparently A/B testing a new page layout
|
||||||
dict(id='story'),
|
has_supplemental = article is not None and article.find(**classes('story-body-supplemental')) is not None
|
||||||
]
|
if has_supplemental:
|
||||||
|
keep_only_tags = [
|
||||||
|
dict(id='story-header'),
|
||||||
|
classes('story-body-supplemental story-interrupter'),
|
||||||
|
]
|
||||||
|
else:
|
||||||
|
keep_only_tags = [
|
||||||
|
dict(id='story'),
|
||||||
|
]
|
||||||
body = new_tag(soup, 'body')
|
body = new_tag(soup, 'body')
|
||||||
for spec in keep_only_tags:
|
for spec in keep_only_tags:
|
||||||
for tag in soup.find('body').findAll(**spec):
|
for tag in soup.find('body').findAll(**spec):
|
||||||
@ -153,6 +156,11 @@ class NewYorkTimes(BasicNewsRecipe):
|
|||||||
if span is not None and self.tag_to_string(span).strip().lower() == 'image':
|
if span is not None and self.tag_to_string(span).strip().lower() == 'image':
|
||||||
span.name = 'img'
|
span.name = 'img'
|
||||||
span['src'] = div['itemid']
|
span['src'] = div['itemid']
|
||||||
|
|
||||||
|
# Remove live storyline menu
|
||||||
|
for span in soup.findAll(attrs={'data-storyline-module-name': 'menu'}):
|
||||||
|
span.parent.extract()
|
||||||
|
|
||||||
return soup
|
return soup
|
||||||
|
|
||||||
def read_todays_paper(self):
|
def read_todays_paper(self):
|
||||||
|
@ -98,7 +98,7 @@ class NewYorkTimes(BasicNewsRecipe):
|
|||||||
dict(href='#site-content #site-index'.split()),
|
dict(href='#site-content #site-index'.split()),
|
||||||
dict(attrs={'aria-hidden':'true'}),
|
dict(attrs={'aria-hidden':'true'}),
|
||||||
dict(attrs={'data-videoid':True}),
|
dict(attrs={'data-videoid':True}),
|
||||||
dict(name='button meta link'.split()),
|
dict(name='button meta link time source'.split()),
|
||||||
dict(id=lambda x: x and x.startswith('story-ad-')),
|
dict(id=lambda x: x and x.startswith('story-ad-')),
|
||||||
dict(name='head'),
|
dict(name='head'),
|
||||||
dict(role='toolbar'),
|
dict(role='toolbar'),
|
||||||
@ -113,17 +113,20 @@ class NewYorkTimes(BasicNewsRecipe):
|
|||||||
|
|
||||||
def preprocess_html(self, soup):
|
def preprocess_html(self, soup):
|
||||||
article = soup.find(id='story')
|
article = soup.find(id='story')
|
||||||
# The NYT is apparently A/B testing a new page layout
|
if article is None:
|
||||||
has_supplemental = article is not None and article.find(**classes('story-body-supplemental')) is not None
|
keep_only_tags = [dict(attrs={'aria-label': 'Main content'})]
|
||||||
if has_supplemental:
|
|
||||||
keep_only_tags = [
|
|
||||||
dict(id='story-header'),
|
|
||||||
classes('story-body-supplemental story-interrupter'),
|
|
||||||
]
|
|
||||||
else:
|
else:
|
||||||
keep_only_tags = [
|
# The NYT is apparently A/B testing a new page layout
|
||||||
dict(id='story'),
|
has_supplemental = article is not None and article.find(**classes('story-body-supplemental')) is not None
|
||||||
]
|
if has_supplemental:
|
||||||
|
keep_only_tags = [
|
||||||
|
dict(id='story-header'),
|
||||||
|
classes('story-body-supplemental story-interrupter'),
|
||||||
|
]
|
||||||
|
else:
|
||||||
|
keep_only_tags = [
|
||||||
|
dict(id='story'),
|
||||||
|
]
|
||||||
body = new_tag(soup, 'body')
|
body = new_tag(soup, 'body')
|
||||||
for spec in keep_only_tags:
|
for spec in keep_only_tags:
|
||||||
for tag in soup.find('body').findAll(**spec):
|
for tag in soup.find('body').findAll(**spec):
|
||||||
@ -153,6 +156,11 @@ class NewYorkTimes(BasicNewsRecipe):
|
|||||||
if span is not None and self.tag_to_string(span).strip().lower() == 'image':
|
if span is not None and self.tag_to_string(span).strip().lower() == 'image':
|
||||||
span.name = 'img'
|
span.name = 'img'
|
||||||
span['src'] = div['itemid']
|
span['src'] = div['itemid']
|
||||||
|
|
||||||
|
# Remove live storyline menu
|
||||||
|
for span in soup.findAll(attrs={'data-storyline-module-name': 'menu'}):
|
||||||
|
span.parent.extract()
|
||||||
|
|
||||||
return soup
|
return soup
|
||||||
|
|
||||||
def read_todays_paper(self):
|
def read_todays_paper(self):
|
||||||
@ -310,7 +318,7 @@ class NewYorkTimes(BasicNewsRecipe):
|
|||||||
|
|
||||||
def parse_index(self):
|
def parse_index(self):
|
||||||
# return [('All articles', [
|
# return [('All articles', [
|
||||||
# {'title': 'XXXXX', 'url': 'https://www.nytimes.com/2018/05/31/theater/best-25-plays-how-we-made-the-list.html'},
|
# {'title': 'XXXXX', 'url': 'https://www.nytimes.com/2020/11/27/world/americas/coronavirus-migrants-venezuela.html'},
|
||||||
# ])]
|
# ])]
|
||||||
if is_web_edition:
|
if is_web_edition:
|
||||||
return self.parse_web_sections()
|
return self.parse_web_sections()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user