mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-07 10:14:46 -04:00
Update NYTimes
Merge branch 'nytimes' of https://github.com/jtmcdole/calibre
This commit is contained in:
commit
63ea8aa1f4
@ -128,7 +128,6 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
|
|
||||||
tech_feeds = [
|
tech_feeds = [
|
||||||
(u'Tech - News', u'http://pogue.blogs.nytimes.com/feed/'),
|
(u'Tech - News', u'http://pogue.blogs.nytimes.com/feed/'),
|
||||||
(u'Tech - Bits', u'http://bits.blogs.nytimes.com/feed/'),
|
|
||||||
(u'Tech - Gadgetwise', u'http://gadgetwise.blogs.nytimes.com/feed/'),
|
(u'Tech - Gadgetwise', u'http://gadgetwise.blogs.nytimes.com/feed/'),
|
||||||
(u'Tech - Open', u'http://open.blogs.nytimes.com/feed/')
|
(u'Tech - Open', u'http://open.blogs.nytimes.com/feed/')
|
||||||
]
|
]
|
||||||
@ -499,6 +498,21 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
thumbnail = div.find('div', 'thumbnail')
|
thumbnail = div.find('div', 'thumbnail')
|
||||||
if thumbnail:
|
if thumbnail:
|
||||||
thumbnail.extract()
|
thumbnail.extract()
|
||||||
|
return self.handle_base_article(div)
|
||||||
|
|
||||||
|
# Handle '<article>' in world, u.s., etc
|
||||||
|
def handle_article_tag(self, div):
|
||||||
|
thumbnail = div.find('figure', 'media photo')
|
||||||
|
if not thumbnail:
|
||||||
|
thumbnail = div.find('div', 'thumb')
|
||||||
|
if thumbnail:
|
||||||
|
thumbnail.extract()
|
||||||
|
div = div.find('div', 'story-body')
|
||||||
|
if not div:
|
||||||
|
return
|
||||||
|
return self.handle_base_article(div)
|
||||||
|
|
||||||
|
def handle_base_article(self, div):
|
||||||
a = div.find('a', href=True)
|
a = div.find('a', href=True)
|
||||||
if not a:
|
if not a:
|
||||||
return
|
return
|
||||||
@ -648,10 +662,12 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
'https://www.nytimes.com/pages/' + index_url + '/index.html')
|
'https://www.nytimes.com/pages/' + index_url + '/index.html')
|
||||||
except:
|
except:
|
||||||
continue
|
continue
|
||||||
print 'Index URL: ' + 'http://www.nytimes.com/pages/' + index_url + '/index.html'
|
print 'Index URL: ' + 'https://www.nytimes.com/pages/' + index_url + '/index.html'
|
||||||
|
|
||||||
self.key = sec_title
|
self.key = sec_title
|
||||||
# Find each article
|
# Find each article
|
||||||
|
for div in soup.findAll('article'):
|
||||||
|
self.handle_article_tag(div)
|
||||||
for div in soup.findAll(True, attrs={
|
for div in soup.findAll(True, attrs={
|
||||||
'class': ['section-headline', 'ledeStory', 'story', 'story headline', 'sectionHeader', 'headlinesOnly multiline flush']}):
|
'class': ['section-headline', 'ledeStory', 'story', 'story headline', 'sectionHeader', 'headlinesOnly multiline flush']}):
|
||||||
if div['class'] in ['story', 'story headline', 'storyHeader']:
|
if div['class'] in ['story', 'story headline', 'storyHeader']:
|
||||||
|
@ -128,7 +128,6 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
|
|
||||||
tech_feeds = [
|
tech_feeds = [
|
||||||
(u'Tech - News', u'http://pogue.blogs.nytimes.com/feed/'),
|
(u'Tech - News', u'http://pogue.blogs.nytimes.com/feed/'),
|
||||||
(u'Tech - Bits', u'http://bits.blogs.nytimes.com/feed/'),
|
|
||||||
(u'Tech - Gadgetwise', u'http://gadgetwise.blogs.nytimes.com/feed/'),
|
(u'Tech - Gadgetwise', u'http://gadgetwise.blogs.nytimes.com/feed/'),
|
||||||
(u'Tech - Open', u'http://open.blogs.nytimes.com/feed/')
|
(u'Tech - Open', u'http://open.blogs.nytimes.com/feed/')
|
||||||
]
|
]
|
||||||
@ -507,6 +506,21 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
thumbnail = div.find('div', 'thumbnail')
|
thumbnail = div.find('div', 'thumbnail')
|
||||||
if thumbnail:
|
if thumbnail:
|
||||||
thumbnail.extract()
|
thumbnail.extract()
|
||||||
|
return self.handle_base_article(div)
|
||||||
|
|
||||||
|
# Handle '<article>' in world, u.s., etc
|
||||||
|
def handle_article_tag(self, div):
|
||||||
|
thumbnail = div.find('figure', 'media photo')
|
||||||
|
if not thumbnail:
|
||||||
|
thumbnail = div.find('div', 'thumb')
|
||||||
|
if thumbnail:
|
||||||
|
thumbnail.extract()
|
||||||
|
div = div.find('div', 'story-body')
|
||||||
|
if not div:
|
||||||
|
return
|
||||||
|
return self.handle_base_article(div)
|
||||||
|
|
||||||
|
def handle_base_article(self, div):
|
||||||
a = div.find('a', href=True)
|
a = div.find('a', href=True)
|
||||||
if not a:
|
if not a:
|
||||||
return
|
return
|
||||||
@ -656,12 +670,14 @@ class NYTimes(BasicNewsRecipe):
|
|||||||
'https://www.nytimes.com/pages/' + index_url + '/index.html')
|
'https://www.nytimes.com/pages/' + index_url + '/index.html')
|
||||||
except:
|
except:
|
||||||
continue
|
continue
|
||||||
print 'Index URL: ' + 'http://www.nytimes.com/pages/' + index_url + '/index.html'
|
print 'Index URL: ' + 'https://www.nytimes.com/pages/' + index_url + '/index.html'
|
||||||
|
|
||||||
self.key = sec_title
|
self.key = sec_title
|
||||||
# Find each article
|
# Find each article
|
||||||
for div in soup.findAll(True, attrs={'class': [
|
for div in soup.findAll('article'):
|
||||||
'section-headline', 'ledeStory', 'story', 'story headline', 'sectionHeader', 'headlinesOnly multiline flush']}):
|
self.handle_article_tag(div)
|
||||||
|
for div in soup.findAll(True, attrs={
|
||||||
|
'class': ['section-headline', 'ledeStory', 'story', 'story headline', 'sectionHeader', 'headlinesOnly multiline flush']}):
|
||||||
if div['class'] in ['story', 'story headline', 'storyHeader']:
|
if div['class'] in ['story', 'story headline', 'storyHeader']:
|
||||||
self.handle_article(div)
|
self.handle_article(div)
|
||||||
elif div['class'] == 'ledeStory':
|
elif div['class'] == 'ledeStory':
|
||||||
|
Loading…
x
Reference in New Issue
Block a user