Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-09 03:04:10 -04:00)
Update New York Times (Web)
Fixes #1816305 [New York Time news not complete](https://bugs.launchpad.net/calibre/+bug/1816305)
This commit is contained in:
parent 692214e589
commit 3abd63304f
@ -226,16 +226,19 @@ class NewYorkTimes(BasicNewsRecipe):
|
|||||||
# raise SystemExit(1)
|
# raise SystemExit(1)
|
||||||
return feeds
|
return feeds
|
||||||
|
|
||||||
def parse_highlights(self, container):
|
def parse_article_group(self, container):
|
||||||
for article in container.findAll('article', **classes('story')):
|
for li in container.findAll('li'):
|
||||||
|
article = li.find('article')
|
||||||
h2 = article.find('h2')
|
h2 = article.find('h2')
|
||||||
if h2 is not None:
|
if h2 is not None:
|
||||||
title = self.tag_to_string(h2)
|
title = self.tag_to_string(h2)
|
||||||
a = h2.find('a', href=True)
|
a = h2.find('a', href=True)
|
||||||
if a is not None:
|
if a is not None:
|
||||||
url = a['href']
|
url = a['href']
|
||||||
|
if url.startswith('/'):
|
||||||
|
url = 'https://www.nytimes.com' + url
|
||||||
desc = ''
|
desc = ''
|
||||||
p = article.find(**classes('summary'))
|
p = h2.findNextSibling('p')
|
||||||
if p is not None:
|
if p is not None:
|
||||||
desc = self.tag_to_string(p)
|
desc = self.tag_to_string(p)
|
||||||
date = ''
|
date = ''
|
||||||
@ -257,16 +260,11 @@ class NewYorkTimes(BasicNewsRecipe):
|
|||||||
self.log('\t\t', article['description'])
|
self.log('\t\t', article['description'])
|
||||||
|
|
||||||
container = soup.find(itemtype='http://schema.org/CollectionPage')
|
container = soup.find(itemtype='http://schema.org/CollectionPage')
|
||||||
highlights = container.find('section', **classes('highlights'))
|
container.find('header').extract()
|
||||||
if highlights is not None:
|
div = container.find('div')
|
||||||
for article in self.parse_highlights(highlights):
|
for section in div.findAll('section'):
|
||||||
log(article)
|
for ol in section.findAll('ol'):
|
||||||
yield article
|
for article in self.parse_article_group(ol):
|
||||||
extra = container.find('section', attrs={'data-collection-type': True})
|
|
||||||
if extra is not None:
|
|
||||||
title = self.tag_to_string(extra.find('h2'))
|
|
||||||
for article in self.parse_highlights(extra):
|
|
||||||
article['title'] = '{}: {}'.format(title, article['title'])
|
|
||||||
log(article)
|
log(article)
|
||||||
yield article
|
yield article
|
||||||
|
|
||||||
|
@ -226,16 +226,19 @@ class NewYorkTimes(BasicNewsRecipe):
|
|||||||
# raise SystemExit(1)
|
# raise SystemExit(1)
|
||||||
return feeds
|
return feeds
|
||||||
|
|
||||||
def parse_highlights(self, container):
|
def parse_article_group(self, container):
|
||||||
for article in container.findAll('article', **classes('story')):
|
for li in container.findAll('li'):
|
||||||
|
article = li.find('article')
|
||||||
h2 = article.find('h2')
|
h2 = article.find('h2')
|
||||||
if h2 is not None:
|
if h2 is not None:
|
||||||
title = self.tag_to_string(h2)
|
title = self.tag_to_string(h2)
|
||||||
a = h2.find('a', href=True)
|
a = h2.find('a', href=True)
|
||||||
if a is not None:
|
if a is not None:
|
||||||
url = a['href']
|
url = a['href']
|
||||||
|
if url.startswith('/'):
|
||||||
|
url = 'https://www.nytimes.com' + url
|
||||||
desc = ''
|
desc = ''
|
||||||
p = article.find(**classes('summary'))
|
p = h2.findNextSibling('p')
|
||||||
if p is not None:
|
if p is not None:
|
||||||
desc = self.tag_to_string(p)
|
desc = self.tag_to_string(p)
|
||||||
date = ''
|
date = ''
|
||||||
@ -257,16 +260,11 @@ class NewYorkTimes(BasicNewsRecipe):
|
|||||||
self.log('\t\t', article['description'])
|
self.log('\t\t', article['description'])
|
||||||
|
|
||||||
container = soup.find(itemtype='http://schema.org/CollectionPage')
|
container = soup.find(itemtype='http://schema.org/CollectionPage')
|
||||||
highlights = container.find('section', **classes('highlights'))
|
container.find('header').extract()
|
||||||
if highlights is not None:
|
div = container.find('div')
|
||||||
for article in self.parse_highlights(highlights):
|
for section in div.findAll('section'):
|
||||||
log(article)
|
for ol in section.findAll('ol'):
|
||||||
yield article
|
for article in self.parse_article_group(ol):
|
||||||
extra = container.find('section', attrs={'data-collection-type': True})
|
|
||||||
if extra is not None:
|
|
||||||
title = self.tag_to_string(extra.find('h2'))
|
|
||||||
for article in self.parse_highlights(extra):
|
|
||||||
article['title'] = '{}: {}'.format(title, article['title'])
|
|
||||||
log(article)
|
log(article)
|
||||||
yield article
|
yield article
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user