Update NYTimes recipe

Author: Kovid Goyal
Date:   2013-03-17 09:39:22 +05:30
Commit: 3fd23ceadd (parent: a3ee07a2da)

2 changed files with 149 additions and 137 deletions

Changed file 1 of 2:

@@ -41,7 +41,7 @@ class NYTimes(BasicNewsRecipe):
     # number of days old an article can be for inclusion. If oldest_web_article = None all articles
     # will be included. Note: oldest_web_article is ignored if webEdition = False
     webEdition = False
-    oldest_web_article = 7
+    oldest_web_article = None

     # download higher resolution images than the small thumbnails typically included in the article
     # the down side of having large beautiful images is the file size is much larger, on the order of 7MB per paper
@@ -188,6 +188,8 @@ class NYTimes(BasicNewsRecipe):
            'relatedSearchesModule',
            'side_tool',
            'singleAd',
+           'postCategory column',
+           'refer tagRefer', # added for bits blog post
            'entry entry-utility', #added for DealBook
            'entry-tags', #added for DealBook
            'footer promos clearfix', #added for DealBook
@@ -324,6 +326,8 @@ class NYTimes(BasicNewsRecipe):
            return True
        if '/video/' in url:
            return True
+       if '/multimedia/' in url:
+           return True
        if '/slideshow/' in url:
            return True
        if '/magazine/index' in url:
@@ -334,6 +338,15 @@ class NYTimes(BasicNewsRecipe):
            return True
        if '/premium/' in url:
            return True
+       if '#comment' in url:
+           return True
+       if '#postComment' in url:
+           return True
+       if '#postcomment' in url:
+           return True
+       if re.search('/\d\d\d\d/\d\d/\d\d/',url) is None:
+           print("NO DATE IN "+url)
+           return True
        return False

    def fixChars(self,string):
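
The new date guard above is the broadest of these exclusion rules: any link whose path lacks a /YYYY/MM/DD/ segment is treated as a non-article and skipped. A minimal standalone sketch of the check, with invented sample URLs:

    import re

    # Same pattern the recipe adds: article URLs are expected to carry a
    # /YYYY/MM/DD/ path segment; anything without one is excluded.
    date_in_url = re.compile(r'/\d\d\d\d/\d\d/\d\d/')

    def is_dated_article(url):
        return date_in_url.search(url) is not None

    # Invented sample URLs, for illustration only:
    print(is_dated_article('http://www.nytimes.com/2013/03/17/world/example.html'))  # True
    print(is_dated_article('http://www.nytimes.com/pages/todaysheadlines/'))         # False, excluded
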
@@ -363,6 +376,7 @@ class NYTimes(BasicNewsRecipe):
    cover_tag = 'NY_NYT'

    def get_cover_url(self):
+       from datetime import timedelta, date
        cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.cover_tag+'.jpg'
        br = BasicNewsRecipe.get_browser(self)
        daysback=1
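
The surrounding method (mostly outside this hunk) probes the Newseum site for the day's front-page image and steps back a day on failure. A rough self-contained sketch of that idea; the function name, retry limit, and error handling here are assumptions, not the recipe's exact code:

    from datetime import date, timedelta
    import urllib2

    def probe_cover(cover_tag='NY_NYT', max_days_back=7):  # limit is assumed
        # The Newseum URL is keyed on the day of the month; if today's front
        # page is not up yet, fall back one day at a time.
        day = date.today()
        for _ in range(max_days_back):
            url = ('http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'
                   + str(day.day) + '/lg/' + cover_tag + '.jpg')
            try:
                urllib2.urlopen(url)
                return url
            except urllib2.URLError:
                day = day - timedelta(days=1)
        return None
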
@@ -385,7 +399,6 @@ class NYTimes(BasicNewsRecipe):
    masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
-
    def short_title(self):
        return self.title
@@ -647,75 +660,53 @@ class NYTimes(BasicNewsRecipe):
        soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')
-
-       # Fetch the content table
-       content_table = soup.find('table',{'id':'content'})
-       if content_table is None:
-           self.log("FATAL ERROR: CANNOT FIND CONTENT TABLE")
-           return None
-
-       # Within this table are <td id=".*Column.*"> entries, each containing one or more h6 tags which represent sections
-
-       for td_col in content_table.findAll('td', {'id' : re.compile('Column')}):
-           for div_sec in td_col.findAll('div',recursive=False):
-               for h6_sec_name in div_sec.findAll('h6',{'style' : re.compile('text-transform: *uppercase')}):
-
-                   section_name = self.tag_to_string(h6_sec_name,use_alt=False)
-                   section_name = re.sub(r'^ *$','',section_name)
-
-                   if section_name == '':
-                       continue
-                   if self.includeSections != []:
-                       if section_name not in self.includeSections:
-                           print "SECTION NOT INCLUDED: ",section_name
-                           continue
-                   if section_name in self.excludeSections:
-                       print "SECTION EXCLUDED: ",section_name
-                       continue
-
-                   section_name=string.capwords(section_name)
-                   section_name = section_name.replace('Op-ed','Op-Ed')
-                   section_name = section_name.replace('U.s.','U.S.')
-                   section_name = section_name.replace('N.y.','N.Y.')
-                   pubdate = strftime('%a, %d %b')
-
-                   search_div = div_sec
-                   for next_tag in h6_sec_name.findNextSiblings(True):
-                       if next_tag.__class__.__name__ == 'Tag':
-                           if next_tag.name == 'div':
-                               search_div = next_tag
-                           break
-
-                   # Get the articles
-                   for h3_item in search_div.findAll('h3'):
-                       byline = h3_item.h6
-                       if byline is not None:
-                           author = self.tag_to_string(byline,use_alt=False)
-                       else:
-                           author = ''
-                       a = h3_item.find('a', href=True)
-                       if not a:
-                           continue
-                       url = re.sub(r'\?.*', '', a['href'])
-                       if self.exclude_url(url):
-                           continue
-                       url += '?pagewanted=all'
-                       if self.filterDuplicates:
-                           if url in self.url_list:
-                               continue
-                       self.url_list.append(url)
-                       title = self.tag_to_string(a, use_alt=True).strip()
-                       desc = h3_item.find('p')
-                       if desc is not None:
-                           description = self.tag_to_string(desc,use_alt=False)
-                       else:
-                           description = ''
-                       if not self.articles.has_key(section_name):
-                           self.ans.append(section_name)
-                           self.articles[section_name] = []
-                       self.articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content=''))
-
+       section_name='Unknown Section'
+       pubdate = strftime('%a, %d %b')
+       for td_col in soup.findAll('td'):
+           h6_sec_name = td_col.find('h6')
+           if h6_sec_name is not None:
+               new_section_name = self.tag_to_string(h6_sec_name,use_alt=False)
+               new_section_name = re.sub(r'^ *$','',new_section_name)
+               if new_section_name == '':
+                   continue
+               section_name = new_section_name
+               continue
+           atag = td_col.find('a')
+           if atag is not None:
+               h4tag = None
+               for h4tag in atag.findNextSiblings('h4'):
+                   break
+               if h4tag is None:
+                   continue
+               author = self.tag_to_string(h4tag,use_alt=False)
+               try:
+                   url = re.sub(r'\?.*', '', atag['href'])
+               except:
+                   continue
+               if self.exclude_url(url):
+                   continue
+               if '?' in url:
+                   url += '&pagewanted=all'
+               else:
+                   url += '?pagewanted=all'
+               if self.filterDuplicates:
+                   if url in self.url_list:
+                       continue
+               self.url_list.append(url)
+               title = self.tag_to_string(atag, use_alt=False).strip()
+               desc = atag.parent.find('p')
+               if desc is not None:
+                   description = self.tag_to_string(desc,use_alt=False)
+               else:
+                   description = ''
+               if not self.articles.has_key(section_name):
+                   self.ans.append(section_name)
+                   self.articles[section_name] = []
+               print('Title '+title+' author '+author)
+               self.articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content=''))

        self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
-       return self.filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans)))
+       return self.filter_ans(self.ans)

    def parse_index(self):
        if self.headlinesOnly:
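
The rewrite above replaces the old table-and-column traversal with a flat walk over every <td> on the headlines page: a cell containing an <h6> sets the current section, and a cell whose <a> is followed by an <h4> byline yields an article. A toy illustration of that walk against made-up miniature markup, using the BeautifulSoup 3 API that recipes of this era rely on:

    from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3, assumed

    # Invented miniature of the todaysheadlines page structure:
    html = '''<table>
    <tr><td><h6>World</h6></td></tr>
    <tr><td><a href="http://www.nytimes.com/2013/03/17/world/example.html">A headline</a>
    <h4>BY A REPORTER</h4><p>Summary text.</p></td></tr>
    </table>'''

    soup = BeautifulSoup(html)
    section = 'Unknown Section'
    for td in soup.findAll('td'):
        h6 = td.find('h6')
        if h6 is not None:                # a cell holding <h6> names the section
            section = h6.string.strip()
            continue
        a = td.find('a')
        if a is None:                     # neither a section nor an article cell
            continue
        h4 = a.findNextSibling('h4')      # the byline follows the headline link
        if h4 is None:
            continue
        print(section + ' | ' + a.string.strip() + ' | ' + h4.string.strip())
    # prints: World | A headline | BY A REPORTER
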
@@ -825,8 +816,9 @@ class NYTimes(BasicNewsRecipe):
        for divr in soup.findAll('div',attrs={'class':re.compile('w190 right')}):
            if divr.find(text=re.compile('Sign up')):
                divr.extract()
-       divr = soup.find('div',attrs={'id':re.compile('related-content')})
+       divr = soup.find('div',attrs={'class':re.compile('^relatedArticlesModule')})
        if divr is not None:
+           print("PROCESSING RELATED: "+self.tag_to_string(soup.title,False))
            # handle related articles
            rlist = []
            ul = divr.find('ul')
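
The switch above from an id lookup to a class lookup relies on BeautifulSoup 3 running the compiled pattern against the element's full class string, which is why the pattern is anchored with ^. A small illustration (markup invented):

    import re
    from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3, assumed

    soup = BeautifulSoup('<div class="relatedArticlesModule box"></div>'
                         '<div class="not relatedArticlesModule"></div>')
    # The regex is searched against the whole class attribute value, so the
    # ^ anchor keeps the second div (name not at the start) from matching.
    div = soup.find('div', attrs={'class': re.compile('^relatedArticlesModule')})
    print(div['class'])  # relatedArticlesModule box
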
@@ -856,6 +848,8 @@ class NYTimes(BasicNewsRecipe):
            asidediv.append(Tag(soup,'hr'))
            smain = soup.find('body')
            smain.append(asidediv)
+       else:
+           print("CANNOT FIND RELATED: "+self.tag_to_string(soup.title,False))
        for atag in soup.findAll('a'):
            img = atag.find('img')
            if img is not None:
@@ -898,6 +892,18 @@ class NYTimes(BasicNewsRecipe):
                            first_outer = outerdiv
                        else:
                            litag.extract()
+               for h6tag in rdiv.findAll('h6'):
+                   if h6tag.find('a') is not None:
+                       if h6tag.find('a')['href'].startswith('http://www.nytimes.com'):
+                           url = re.sub(r'\?.*', '', h6tag.find('a')['href'])
+                           h6tag.find('a')['href'] = url+'?pagewanted=all'
+                           h6tag.extract()
+                           related.append(h6tag)
+                           if first_related is None:
+                               first_related = rdiv
+                               first_outer = outerdiv
+                       else:
+                           h6tag.extract()
        if related != []:
            for r in related:
                if r.h6: # don't want the anchor inside a h6 tag
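
The h6 handling added above applies the recipe's usual URL normalisation to related links: strip any existing query string, then request the single-page view. As a standalone helper (hypothetical; the recipe inlines this logic):

    import re

    def single_page(href):
        # Drop any existing query string, then ask for the one-page view.
        return re.sub(r'\?.*', '', href) + '?pagewanted=all'

    print(single_page('http://www.nytimes.com/2013/03/17/business/example.html?hp'))
    # http://www.nytimes.com/2013/03/17/business/example.html?pagewanted=all
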

Changed file 2 of 2 (same changes at different offsets):

@@ -188,6 +188,8 @@ class NYTimes(BasicNewsRecipe):
            'relatedSearchesModule',
            'side_tool',
            'singleAd',
+           'postCategory column',
+           'refer tagRefer', # added for bits blog post
            'entry entry-utility', #added for DealBook
            'entry-tags', #added for DealBook
            'footer promos clearfix', #added for DealBook
@@ -324,6 +326,8 @@ class NYTimes(BasicNewsRecipe):
            return True
        if '/video/' in url:
            return True
+       if '/multimedia/' in url:
+           return True
        if '/slideshow/' in url:
            return True
        if '/magazine/index' in url:
@@ -334,6 +338,15 @@ class NYTimes(BasicNewsRecipe):
            return True
        if '/premium/' in url:
            return True
+       if '#comment' in url:
+           return True
+       if '#postComment' in url:
+           return True
+       if '#postcomment' in url:
+           return True
+       if re.search('/\d\d\d\d/\d\d/\d\d/',url) is None:
+           print("NO DATE IN "+url)
+           return True
        return False

    def fixChars(self,string):
@@ -371,6 +384,7 @@ class NYTimes(BasicNewsRecipe):
    cover_tag = 'NY_NYT'

    def get_cover_url(self):
+       from datetime import timedelta, date
        cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.cover_tag+'.jpg'
        br = BasicNewsRecipe.get_browser(self)
        daysback=1
@@ -393,7 +407,6 @@ class NYTimes(BasicNewsRecipe):
    masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
-
    def short_title(self):
        return self.title
@@ -655,75 +668,53 @@ class NYTimes(BasicNewsRecipe):
        soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')
-
-       # Fetch the content table
-       content_table = soup.find('table',{'id':'content'})
-       if content_table is None:
-           self.log("FATAL ERROR: CANNOT FIND CONTENT TABLE")
-           return None
-
-       # Within this table are <td id=".*Column.*"> entries, each containing one or more h6 tags which represent sections
-
-       for td_col in content_table.findAll('td', {'id' : re.compile('Column')}):
-           for div_sec in td_col.findAll('div',recursive=False):
-               for h6_sec_name in div_sec.findAll('h6',{'style' : re.compile('text-transform: *uppercase')}):
-
-                   section_name = self.tag_to_string(h6_sec_name,use_alt=False)
-                   section_name = re.sub(r'^ *$','',section_name)
-
-                   if section_name == '':
-                       continue
-                   if self.includeSections != []:
-                       if section_name not in self.includeSections:
-                           print "SECTION NOT INCLUDED: ",section_name
-                           continue
-                   if section_name in self.excludeSections:
-                       print "SECTION EXCLUDED: ",section_name
-                       continue
-
-                   section_name=string.capwords(section_name)
-                   section_name = section_name.replace('Op-ed','Op-Ed')
-                   section_name = section_name.replace('U.s.','U.S.')
-                   section_name = section_name.replace('N.y.','N.Y.')
-                   pubdate = strftime('%a, %d %b')
-
-                   search_div = div_sec
-                   for next_tag in h6_sec_name.findNextSiblings(True):
-                       if next_tag.__class__.__name__ == 'Tag':
-                           if next_tag.name == 'div':
-                               search_div = next_tag
-                           break
-
-                   # Get the articles
-                   for h3_item in search_div.findAll('h3'):
-                       byline = h3_item.h6
-                       if byline is not None:
-                           author = self.tag_to_string(byline,use_alt=False)
-                       else:
-                           author = ''
-                       a = h3_item.find('a', href=True)
-                       if not a:
-                           continue
-                       url = re.sub(r'\?.*', '', a['href'])
-                       if self.exclude_url(url):
-                           continue
-                       url += '?pagewanted=all'
-                       if self.filterDuplicates:
-                           if url in self.url_list:
-                               continue
-                       self.url_list.append(url)
-                       title = self.tag_to_string(a, use_alt=True).strip()
-                       desc = h3_item.find('p')
-                       if desc is not None:
-                           description = self.tag_to_string(desc,use_alt=False)
-                       else:
-                           description = ''
-                       if not self.articles.has_key(section_name):
-                           self.ans.append(section_name)
-                           self.articles[section_name] = []
-                       self.articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content=''))
-
+       section_name='Unknown Section'
+       pubdate = strftime('%a, %d %b')
+       for td_col in soup.findAll('td'):
+           h6_sec_name = td_col.find('h6')
+           if h6_sec_name is not None:
+               new_section_name = self.tag_to_string(h6_sec_name,use_alt=False)
+               new_section_name = re.sub(r'^ *$','',new_section_name)
+               if new_section_name == '':
+                   continue
+               section_name = new_section_name
+               continue
+           atag = td_col.find('a')
+           if atag is not None:
+               h4tag = None
+               for h4tag in atag.findNextSiblings('h4'):
+                   break
+               if h4tag is None:
+                   continue
+               author = self.tag_to_string(h4tag,use_alt=False)
+               try:
+                   url = re.sub(r'\?.*', '', atag['href'])
+               except:
+                   continue
+               if self.exclude_url(url):
+                   continue
+               if '?' in url:
+                   url += '&pagewanted=all'
+               else:
+                   url += '?pagewanted=all'
+               if self.filterDuplicates:
+                   if url in self.url_list:
+                       continue
+               self.url_list.append(url)
+               title = self.tag_to_string(atag, use_alt=False).strip()
+               desc = atag.parent.find('p')
+               if desc is not None:
+                   description = self.tag_to_string(desc,use_alt=False)
+               else:
+                   description = ''
+               if not self.articles.has_key(section_name):
+                   self.ans.append(section_name)
+                   self.articles[section_name] = []
+               print('Title '+title+' author '+author)
+               self.articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content=''))

        self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
-       return self.filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans)))
+       return self.filter_ans(self.ans)

    def parse_index(self):
        if self.headlinesOnly:
@@ -833,8 +824,9 @@ class NYTimes(BasicNewsRecipe):
        for divr in soup.findAll('div',attrs={'class':re.compile('w190 right')}):
            if divr.find(text=re.compile('Sign up')):
                divr.extract()
-       divr = soup.find('div',attrs={'id':re.compile('related-content')})
+       divr = soup.find('div',attrs={'class':re.compile('^relatedArticlesModule')})
        if divr is not None:
+           print("PROCESSING RELATED: "+self.tag_to_string(soup.title,False))
            # handle related articles
            rlist = []
            ul = divr.find('ul')
@@ -864,6 +856,8 @@ class NYTimes(BasicNewsRecipe):
            asidediv.append(Tag(soup,'hr'))
            smain = soup.find('body')
            smain.append(asidediv)
+       else:
+           print("CANNOT FIND RELATED: "+self.tag_to_string(soup.title,False))
        for atag in soup.findAll('a'):
            img = atag.find('img')
            if img is not None:
@@ -906,6 +900,18 @@ class NYTimes(BasicNewsRecipe):
                            first_outer = outerdiv
                        else:
                            litag.extract()
+               for h6tag in rdiv.findAll('h6'):
+                   if h6tag.find('a') is not None:
+                       if h6tag.find('a')['href'].startswith('http://www.nytimes.com'):
+                           url = re.sub(r'\?.*', '', h6tag.find('a')['href'])
+                           h6tag.find('a')['href'] = url+'?pagewanted=all'
+                           h6tag.extract()
+                           related.append(h6tag)
+                           if first_related is None:
+                               first_related = rdiv
+                               first_outer = outerdiv
+                       else:
+                           h6tag.extract()
        if related != []:
            for r in related:
                if r.h6: # don't want the anchor inside a h6 tag