Update NY Times Headlines

Kovid Goyal 2017-02-11 13:00:12 +05:30
parent cc48842398
commit 06b4445307
2 changed files with 46 additions and 94 deletions

View File

@@ -707,59 +707,35 @@ class NYTimes(BasicNewsRecipe):
         return self.filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans)))
 
     def parse_headline_index(self):
         soup = self.index_to_soup(
             'http://www.nytimes.com/pages/todaysheadlines/')
-        section_name = 'Unknown Section'
         pubdate = strftime('%a, %d %b')
-        for td_col in soup.findAll('td'):
-            h6_sec_name = td_col.find('h6')
-            if h6_sec_name is not None:
-                new_section_name = self.tag_to_string(
-                    h6_sec_name, use_alt=False)
-                new_section_name = re.sub(r'^ *$', '', new_section_name)
-                if new_section_name == '':
-                    continue
-                section_name = new_section_name
-                continue
-            atag = td_col.find('a')
-            if atag is not None:
-                h4tag = None
-                for h4tag in atag.findNextSiblings('h4'):
-                    break
-                if h4tag is None:
-                    continue
-                author = self.tag_to_string(h4tag, use_alt=False)
-                try:
-                    url = re.sub(r'\?.*', '', atag['href'])
-                except:
-                    continue
-                if self.exclude_url(url):
-                    continue
-                if '?' in url:
-                    url += '&pagewanted=all'
-                else:
-                    url += '?pagewanted=all'
-                if self.filterDuplicates:
-                    if url in self.url_list:
-                        continue
-                self.url_list.append(url)
-                title = self.tag_to_string(atag, use_alt=False).strip()
-                desc = atag.parent.find('p')
-                if desc is not None:
-                    description = self.tag_to_string(desc, use_alt=False)
-                else:
-                    description = ''
-                if section_name not in self.articles:
-                    self.ans.append(section_name)
-                    self.articles[section_name] = []
-                print('Title ' + title + ' author ' + author)
-                self.articles[section_name].append(dict(
-                    title=title, url=url, date=pubdate, description=description, author=author, content=''))
-        self.ans = [(k, self.articles[k])
-                    for k in self.ans if k in self.articles]
+        section = None
+        articles = []
+        feeds = []
+        for h6 in soup.findAll('h6'):
+            section = self.tag_to_string(h6).strip()
+            articles = []
+            table = h6.parent.findNextSibling('table')
+            if table is None:
+                continue
+            for a in table.findAll('a', attrs={'class':'headURL'}):
+                title = self.tag_to_string(a)
+                url = a['href'].partition('?')[0]
+                if self.exclude_url(url) or (self.filterDuplicates and url in self.url_list):
+                    continue
+                self.url_list.append(url)
+                desc = ''
+                h4 = a.findNextSibling('h4')
+                if h4 is not None:
+                    desc += self.tag_to_string(h4)
+                p = a.findNextSibling('p')
+                if p is not None:
+                    desc += ' ' + self.tag_to_string(p)
+                articles.append({'title':title, 'url':url + '?pagewanted=all', 'date':pubdate, 'description':desc})
+            if articles:
+                feeds.append((section, articles))
+        self.ans = feeds
         return self.filter_ans(self.ans)
 
     def parse_index(self):

View File

@@ -715,59 +715,35 @@ class NYTimes(BasicNewsRecipe):
         return self.filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans)))
 
     def parse_headline_index(self):
        soup = self.index_to_soup(
             'http://www.nytimes.com/pages/todaysheadlines/')
-        section_name = 'Unknown Section'
         pubdate = strftime('%a, %d %b')
-        for td_col in soup.findAll('td'):
-            h6_sec_name = td_col.find('h6')
-            if h6_sec_name is not None:
-                new_section_name = self.tag_to_string(
-                    h6_sec_name, use_alt=False)
-                new_section_name = re.sub(r'^ *$', '', new_section_name)
-                if new_section_name == '':
-                    continue
-                section_name = new_section_name
-                continue
-            atag = td_col.find('a')
-            if atag is not None:
-                h4tag = None
-                for h4tag in atag.findNextSiblings('h4'):
-                    break
-                if h4tag is None:
-                    continue
-                author = self.tag_to_string(h4tag, use_alt=False)
-                try:
-                    url = re.sub(r'\?.*', '', atag['href'])
-                except:
-                    continue
-                if self.exclude_url(url):
-                    continue
-                if '?' in url:
-                    url += '&pagewanted=all'
-                else:
-                    url += '?pagewanted=all'
-                if self.filterDuplicates:
-                    if url in self.url_list:
-                        continue
-                self.url_list.append(url)
-                title = self.tag_to_string(atag, use_alt=False).strip()
-                desc = atag.parent.find('p')
-                if desc is not None:
-                    description = self.tag_to_string(desc, use_alt=False)
-                else:
-                    description = ''
-                if section_name not in self.articles:
-                    self.ans.append(section_name)
-                    self.articles[section_name] = []
-                print('Title ' + title + ' author ' + author)
-                self.articles[section_name].append(dict(
-                    title=title, url=url, date=pubdate, description=description, author=author, content=''))
-        self.ans = [(k, self.articles[k])
-                    for k in self.ans if k in self.articles]
+        section = None
+        articles = []
+        feeds = []
+        for h6 in soup.findAll('h6'):
+            section = self.tag_to_string(h6).strip()
+            articles = []
+            table = h6.parent.findNextSibling('table')
+            if table is None:
+                continue
+            for a in table.findAll('a', attrs={'class':'headURL'}):
+                title = self.tag_to_string(a)
+                url = a['href'].partition('?')[0]
+                if self.exclude_url(url) or (self.filterDuplicates and url in self.url_list):
+                    continue
+                self.url_list.append(url)
+                desc = ''
+                h4 = a.findNextSibling('h4')
+                if h4 is not None:
+                    desc += self.tag_to_string(h4)
+                p = a.findNextSibling('p')
+                if p is not None:
+                    desc += ' ' + self.tag_to_string(p)
+                articles.append({'title':title, 'url':url + '?pagewanted=all', 'date':pubdate, 'description':desc})
+            if articles:
+                feeds.append((section, articles))
+        self.ans = feeds
         return self.filter_ans(self.ans)
 
     def parse_index(self):
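
Both hunks replace the old td-by-td state machine, which carried section_name from cell to cell, with a direct traversal: each h6 heading names a section, and the table that follows it holds that section's headURL article links. A minimal standalone sketch of that traversal, for readers without a calibre checkout: headline_feeds is a hypothetical helper, it uses modern bs4 snake_case calls rather than the BeautifulSoup 3 camelCase API (findAll, findNextSibling) that calibre recipes use, and the Today's Headlines page it fetches has long since been retired.

    import requests
    from bs4 import BeautifulSoup

    def headline_feeds(url='http://www.nytimes.com/pages/todaysheadlines/'):
        # Hypothetical standalone version of the new parse_headline_index
        # traversal; the recipe itself goes through self.index_to_soup and
        # also applies exclude_url / filterDuplicates checks omitted here.
        soup = BeautifulSoup(requests.get(url).text, 'html.parser')
        feeds = []
        for h6 in soup.find_all('h6'):
            section = h6.get_text().strip()           # h6 text is the section name
            table = h6.parent.find_next_sibling('table')
            if table is None:                         # heading with no article table
                continue
            articles = []
            for a in table.find_all('a', class_='headURL'):
                link = a.get('href', '').partition('?')[0]  # strip tracking params
                if not link:
                    continue
                articles.append({'title': a.get_text().strip(),
                                 'url': link + '?pagewanted=all'})
            if articles:                              # skip sections with no links
                feeds.append((section, articles))
        return feeds

One pass over the h6 headings also removes the old code's per-article author lookup and debug print, and empty sections fall out naturally via the final `if articles:` guard instead of the section_name bookkeeping in self.articles.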