Mirror of https://github.com/kovidgoyal/calibre.git
Update NY Times Headlines
commit 06b4445307
parent cc48842398
@@ -707,59 +707,35 @@ class NYTimes(BasicNewsRecipe):
         return self.filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans)))
 
     def parse_headline_index(self):
 
         soup = self.index_to_soup(
             'http://www.nytimes.com/pages/todaysheadlines/')
 
-        section_name = 'Unknown Section'
         pubdate = strftime('%a, %d %b')
-        for td_col in soup.findAll('td'):
-            h6_sec_name = td_col.find('h6')
-            if h6_sec_name is not None:
-                new_section_name = self.tag_to_string(
-                    h6_sec_name, use_alt=False)
-                new_section_name = re.sub(r'^ *$', '', new_section_name)
-                if new_section_name == '':
-                    continue
-                section_name = new_section_name
-                continue
-            atag = td_col.find('a')
-            if atag is not None:
-                h4tag = None
-                for h4tag in atag.findNextSiblings('h4'):
-                    break
-                if h4tag is None:
-                    continue
-                author = self.tag_to_string(h4tag, use_alt=False)
-                try:
-                    url = re.sub(r'\?.*', '', atag['href'])
-                except:
-                    continue
-                if self.exclude_url(url):
-                    continue
-                if '?' in url:
-                    url += '&pagewanted=all'
-                else:
-                    url += '?pagewanted=all'
-                if self.filterDuplicates:
-                    if url in self.url_list:
-                        continue
-                self.url_list.append(url)
-                title = self.tag_to_string(atag, use_alt=False).strip()
-                desc = atag.parent.find('p')
-                if desc is not None:
-                    description = self.tag_to_string(desc, use_alt=False)
-                else:
-                    description = ''
-                if section_name not in self.articles:
-                    self.ans.append(section_name)
-                    self.articles[section_name] = []
-                print('Title ' + title + ' author ' + author)
-                self.articles[section_name].append(dict(
-                    title=title, url=url, date=pubdate, description=description, author=author, content=''))
-
-        self.ans = [(k, self.articles[k])
-                    for k in self.ans if k in self.articles]
+        section = None
+        articles = []
+        feeds = []
+        for h6 in soup.findAll('h6'):
+            section = self.tag_to_string(h6).strip()
+            articles = []
+            table = h6.parent.findNextSibling('table')
+            if table is None:
+                continue
+            for a in table.findAll('a', attrs={'class':'headURL'}):
+                title = self.tag_to_string(a)
+                url = a['href'].partition('?')[0]
+                if self.exclude_url(url) or (self.filterDuplicates and url in self.url_list):
+                    continue
+                self.url_list.append(url)
+                desc = ''
+                h4 = a.findNextSibling('h4')
+                if h4 is not None:
+                    desc += self.tag_to_string(h4)
+                p = a.findNextSibling('p')
+                if p is not None:
+                    desc += ' ' + self.tag_to_string(p)
+                articles.append({'title':title, 'url':url + '?pagewanted=all', 'date':pubdate, 'description':desc})
+            if articles:
+                feeds.append((section, articles))
+        self.ans = feeds
         return self.filter_ans(self.ans)
 
     def parse_index(self):
@@ -715,59 +715,35 @@ class NYTimes(BasicNewsRecipe):
         return self.filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans)))
 
     def parse_headline_index(self):
 
         soup = self.index_to_soup(
             'http://www.nytimes.com/pages/todaysheadlines/')
 
-        section_name = 'Unknown Section'
         pubdate = strftime('%a, %d %b')
-        for td_col in soup.findAll('td'):
-            h6_sec_name = td_col.find('h6')
-            if h6_sec_name is not None:
-                new_section_name = self.tag_to_string(
-                    h6_sec_name, use_alt=False)
-                new_section_name = re.sub(r'^ *$', '', new_section_name)
-                if new_section_name == '':
-                    continue
-                section_name = new_section_name
-                continue
-            atag = td_col.find('a')
-            if atag is not None:
-                h4tag = None
-                for h4tag in atag.findNextSiblings('h4'):
-                    break
-                if h4tag is None:
-                    continue
-                author = self.tag_to_string(h4tag, use_alt=False)
-                try:
-                    url = re.sub(r'\?.*', '', atag['href'])
-                except:
-                    continue
-                if self.exclude_url(url):
-                    continue
-                if '?' in url:
-                    url += '&pagewanted=all'
-                else:
-                    url += '?pagewanted=all'
-                if self.filterDuplicates:
-                    if url in self.url_list:
-                        continue
-                self.url_list.append(url)
-                title = self.tag_to_string(atag, use_alt=False).strip()
-                desc = atag.parent.find('p')
-                if desc is not None:
-                    description = self.tag_to_string(desc, use_alt=False)
-                else:
-                    description = ''
-                if section_name not in self.articles:
-                    self.ans.append(section_name)
-                    self.articles[section_name] = []
-                print('Title ' + title + ' author ' + author)
-                self.articles[section_name].append(dict(
-                    title=title, url=url, date=pubdate, description=description, author=author, content=''))
-
-        self.ans = [(k, self.articles[k])
-                    for k in self.ans if k in self.articles]
+        section = None
+        articles = []
+        feeds = []
+        for h6 in soup.findAll('h6'):
+            section = self.tag_to_string(h6).strip()
+            articles = []
+            table = h6.parent.findNextSibling('table')
+            if table is None:
+                continue
+            for a in table.findAll('a', attrs={'class':'headURL'}):
+                title = self.tag_to_string(a)
+                url = a['href'].partition('?')[0]
+                if self.exclude_url(url) or (self.filterDuplicates and url in self.url_list):
+                    continue
+                self.url_list.append(url)
+                desc = ''
+                h4 = a.findNextSibling('h4')
+                if h4 is not None:
+                    desc += self.tag_to_string(h4)
+                p = a.findNextSibling('p')
+                if p is not None:
+                    desc += ' ' + self.tag_to_string(p)
+                articles.append({'title':title, 'url':url + '?pagewanted=all', 'date':pubdate, 'description':desc})
+            if articles:
+                feeds.append((section, articles))
+        self.ans = feeds
        return self.filter_ans(self.ans)
 
     def parse_index(self):