From 06b444530705cb13f580c2768db40071dc6d9ad2 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 11 Feb 2017 13:00:12 +0530 Subject: [PATCH] Update NY Times Headlines --- recipes/nytimes.recipe | 70 +++++++++++++------------------------- recipes/nytimes_sub.recipe | 70 +++++++++++++------------------------- 2 files changed, 46 insertions(+), 94 deletions(-) diff --git a/recipes/nytimes.recipe b/recipes/nytimes.recipe index b7b490d208..96f89e02db 100644 --- a/recipes/nytimes.recipe +++ b/recipes/nytimes.recipe @@ -707,59 +707,35 @@ class NYTimes(BasicNewsRecipe): return self.filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans))) def parse_headline_index(self): - soup = self.index_to_soup( 'http://www.nytimes.com/pages/todaysheadlines/') - - section_name = 'Unknown Section' pubdate = strftime('%a, %d %b') - for td_col in soup.findAll('td'): - h6_sec_name = td_col.find('h6') - if h6_sec_name is not None: - new_section_name = self.tag_to_string( - h6_sec_name, use_alt=False) - new_section_name = re.sub(r'^ *$', '', new_section_name) - if new_section_name == '': - continue - section_name = new_section_name + section = None + articles = [] + feeds = [] + for h6 in soup.findAll('h6'): + section = self.tag_to_string(h6).strip() + articles = [] + table = h6.parent.findNextSibling('table') + if table is None: continue - atag = td_col.find('a') - if atag is not None: - h4tag = None - for h4tag in atag.findNextSiblings('h4'): - break - if h4tag is None: + for a in table.findAll('a', attrs={'class':'headURL'}): + title = self.tag_to_string(a) + url = a['href'].partition('?')[0] + if self.exclude_url(url) or (self.filterDuplicates and url in self.url_list): continue - author = self.tag_to_string(h4tag, use_alt=False) - try: - url = re.sub(r'\?.*', '', atag['href']) - except: - continue - if self.exclude_url(url): - continue - if '?' in url: - url += '&pagewanted=all' - else: - url += '?pagewanted=all' - if self.filterDuplicates: - if url in self.url_list: - continue self.url_list.append(url) - title = self.tag_to_string(atag, use_alt=False).strip() - desc = atag.parent.find('p') - if desc is not None: - description = self.tag_to_string(desc, use_alt=False) - else: - description = '' - if section_name not in self.articles: - self.ans.append(section_name) - self.articles[section_name] = [] - print('Title ' + title + ' author ' + author) - self.articles[section_name].append(dict( - title=title, url=url, date=pubdate, description=description, author=author, content='')) - - self.ans = [(k, self.articles[k]) - for k in self.ans if k in self.articles] + desc = '' + h4 = a.findNextSibling('h4') + if h4 is not None: + desc += self.tag_to_string(h4) + p = a.findNextSibling('p') + if p is not None: + desc += ' ' + self.tag_to_string(p) + articles.append({'title':title, 'url':url + '?pagewanted=all', 'date':pubdate, 'description':desc}) + if articles: + feeds.append((section, articles)) + self.ans = feeds return self.filter_ans(self.ans) def parse_index(self): diff --git a/recipes/nytimes_sub.recipe b/recipes/nytimes_sub.recipe index 0e9c1cce22..fa9321f0d4 100644 --- a/recipes/nytimes_sub.recipe +++ b/recipes/nytimes_sub.recipe @@ -715,59 +715,35 @@ class NYTimes(BasicNewsRecipe): return self.filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans))) def parse_headline_index(self): - soup = self.index_to_soup( 'http://www.nytimes.com/pages/todaysheadlines/') - - section_name = 'Unknown Section' pubdate = strftime('%a, %d %b') - for td_col in soup.findAll('td'): - h6_sec_name = td_col.find('h6') - if h6_sec_name is not None: - new_section_name = self.tag_to_string( - h6_sec_name, use_alt=False) - new_section_name = re.sub(r'^ *$', '', new_section_name) - if new_section_name == '': - continue - section_name = new_section_name + section = None + articles = [] + feeds = [] + for h6 in soup.findAll('h6'): + section = self.tag_to_string(h6).strip() + articles = [] + table = h6.parent.findNextSibling('table') + if table is None: continue - atag = td_col.find('a') - if atag is not None: - h4tag = None - for h4tag in atag.findNextSiblings('h4'): - break - if h4tag is None: + for a in table.findAll('a', attrs={'class':'headURL'}): + title = self.tag_to_string(a) + url = a['href'].partition('?')[0] + if self.exclude_url(url) or (self.filterDuplicates and url in self.url_list): continue - author = self.tag_to_string(h4tag, use_alt=False) - try: - url = re.sub(r'\?.*', '', atag['href']) - except: - continue - if self.exclude_url(url): - continue - if '?' in url: - url += '&pagewanted=all' - else: - url += '?pagewanted=all' - if self.filterDuplicates: - if url in self.url_list: - continue self.url_list.append(url) - title = self.tag_to_string(atag, use_alt=False).strip() - desc = atag.parent.find('p') - if desc is not None: - description = self.tag_to_string(desc, use_alt=False) - else: - description = '' - if section_name not in self.articles: - self.ans.append(section_name) - self.articles[section_name] = [] - print('Title ' + title + ' author ' + author) - self.articles[section_name].append(dict( - title=title, url=url, date=pubdate, description=description, author=author, content='')) - - self.ans = [(k, self.articles[k]) - for k in self.ans if k in self.articles] + desc = '' + h4 = a.findNextSibling('h4') + if h4 is not None: + desc += self.tag_to_string(h4) + p = a.findNextSibling('p') + if p is not None: + desc += ' ' + self.tag_to_string(p) + articles.append({'title':title, 'url':url + '?pagewanted=all', 'date':pubdate, 'description':desc}) + if articles: + feeds.append((section, articles)) + self.ans = feeds return self.filter_ans(self.ans) def parse_index(self):