Fix #4009 (NYT Top Stories fails)

Kovid Goyal 2009-11-13 15:36:17 -07:00
parent 339df810b8
commit 8e004db71b


@@ -6,6 +6,7 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 nytimes.com
 '''
 import re
+import time
 from calibre import entity_to_unicode
 from calibre.web.feeds.recipes import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment
@@ -14,9 +15,9 @@ class NYTimes(BasicNewsRecipe):
     title = 'New York Times Top Stories'
     __author__ = 'GRiker'
-    language = 'en'
+    language = _('English')
     description = 'Top Stories from the New York Times'
 
     # List of sections typically included in Top Stories. Use a keyword from the
     # right column in the excludeSectionKeywords[] list to skip downloading that section
     sections = {
@@ -39,7 +40,7 @@ class NYTimes(BasicNewsRecipe):
                  'world'    : 'World'
                }
 
     # By default, no sections are skipped.
     excludeSectionKeywords = []
 
     # Add section keywords from the right column above to skip that section
@@ -49,7 +50,7 @@ class NYTimes(BasicNewsRecipe):
     # excludeSectionKeywords = ['Arts','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Top Stories','Travel','U.S.','World']
     # Fetch only Top Stories
     # excludeSectionKeywords = ['Arts','Business','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Technology','Travel','U.S.','World']
 
     # The maximum number of articles that will be downloaded
     max_articles_per_feed = 40
@@ -63,7 +64,7 @@ class NYTimes(BasicNewsRecipe):
                     dict(attrs={ 'id':['toolsRight','inlineBox','sidebarArticles',
                                        'portfolioInline','articleInline','readerscomment',
                                        'nytRating']}) ]
 
     encoding = 'cp1252'
     no_stylesheets = True
     extra_css = '.headline {text-align: left;}\n \
@@ -79,6 +80,14 @@ class NYTimes(BasicNewsRecipe):
                 .authorId {text-align: left; \
                 font-style: italic;}\n '
 
+#    def get_cover_url(self):
+#        st = time.localtime()
+#        year = str(st.tm_year)
+#        month = "%.2d" % st.tm_mon
+#        day = "%.2d" % st.tm_mday
+#        cover = 'http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/nytfrontpage/' + 'scan.jpg'
+#        return cover
+
     def get_browser(self):
         br = BasicNewsRecipe.get_browser()
         if self.username is not None and self.password is not None:
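The new import time pairs with the commented-out get_cover_url above: if it were enabled, the recipe's cover would be the date-stamped scan of the day's front page. A minimal sketch of the URL it builds, reusing the construction from the commented code (the date values are whatever time.localtime() returns):

    import time

    # Same construction as the commented-out get_cover_url above
    st = time.localtime()
    cover = ('http://graphics8.nytimes.com/images/' + str(st.tm_year) + '/' +
             "%.2d" % st.tm_mon + '/' + "%.2d" % st.tm_mday +
             '/nytfrontpage/' + 'scan.jpg')
    # On this commit's date that evaluates to:
    # http://graphics8.nytimes.com/images/2009/11/13/nytfrontpage/scan.jpg
    print cover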
@@ -105,13 +114,13 @@ class NYTimes(BasicNewsRecipe):
             _raw = url_or_raw
             if raw:
                 return _raw
 
             if not isinstance(_raw, unicode) and self.encoding:
                 _raw = _raw.decode(docEncoding, 'replace')
             massage = list(BeautifulSoup.MARKUP_MASSAGE)
             massage.append((re.compile(r'&(\S+?);'), lambda match: entity_to_unicode(match, encoding=self.encoding)))
             return BeautifulSoup(_raw, markupMassage=massage)
 
         # Entry point
         soup = get_the_soup( self.encoding, url_or_raw )
         contentType = soup.find(True,attrs={'http-equiv':'Content-Type'})
@@ -122,7 +131,7 @@ class NYTimes(BasicNewsRecipe):
         if self.verbose > 2:
             self.log( " document encoding: '%s'" % docEncoding)
         if docEncoding != self.encoding :
             soup = get_the_soup(docEncoding, url_or_raw)
 
         return soup
@@ -133,7 +142,7 @@ class NYTimes(BasicNewsRecipe):
         feed = key = 'All Top Stories'
         articles[key] = []
         ans.append(key)
 
         soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')
 
         # Fetch the outer table
@@ -179,7 +188,7 @@ class NYTimes(BasicNewsRecipe):
             bylines = []
             descriptions = []
             pubdate = None
 
             # Get the Section title
             for (x,i) in enumerate(sectionblock.contents) :
                 skipThisSection = False
@@ -201,22 +210,26 @@ class NYTimes(BasicNewsRecipe):
                         break
 
             # Get the bylines and descriptions
             if not skipThisSection :
-                for (x,i) in enumerate(sectionblock.contents) :
-                    # Extract the bylines and descriptions
-                    if (i.string is not None) and \
-                       (i.string.strip() > "") and \
-                       not isinstance(i,Comment):
-                        contentString = i.strip().encode('utf-8')
-                        if contentString[0:3] == 'By ' and contentString[4].isupper() :
-                            bylines.append(contentString)
-                        else :
-                            descriptions.append(contentString)
+                lines = sectionblock.contents
+                contentStrings = []
+                for line in lines:
+                    if not isinstance(line, Comment) and line.strip and line.strip() > "":
+                        contentStrings.append(line.strip())
+
+                # Gather the byline/description pairs
+                bylines = []
+                descriptions = []
+                for contentString in contentStrings:
+                    if contentString[0:3] == 'By ' and contentString[3].isupper() :
+                        bylines.append(contentString)
+                    else:
+                        descriptions.append(contentString)
 
             # Fetch the article titles and URLs
             articleCount = len(sectionblock.findAll('span'))
-            for (i,span) in enumerate(sectionblock.findAll('span')) :
+            for (i,span) in enumerate(sectionblock.findAll(attrs={'class':'headlineWrapper'})) :
                 a = span.find('a', href=True)
                 url = re.sub(r'\?.*', '', a['href'])
                 url += '?pagewanted=all'
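Two behavioural changes land in this hunk. Headlines are now located by their headlineWrapper class instead of by every <span> in the section block, which is the core of the #4009 fix, and the byline test now checks the character at index 3, the first character after the 'By ' prefix, where the old test at index 4 looked one character too far. (The line.strip truthiness test appears to rely on BeautifulSoup 3 behaviour: attribute lookup on a Tag falls back to a child-tag search and returns None, so only NavigableStrings survive the filter.) A minimal sketch of the classification rule, using invented byline strings:

    def classify(contentString):
        # 'By ' occupies indices 0-2, so index 3 is the first letter
        # of the name; the old index-4 test skipped it
        if contentString[0:3] == 'By ' and contentString[3].isupper():
            return 'byline'
        return 'description'

    print classify('By JOHN DOE')    # byline
    print classify('By Al Baker')    # byline; the old index-4 test saw 'l' and failed
    print classify('By the way...')  # description: 't' is lowercase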
@@ -228,7 +241,11 @@ class NYTimes(BasicNewsRecipe):
                 if not isinstance(title, unicode):
                     title = title.decode('utf-8', 'replace')
-                description = descriptions[i]
+                # Allow for unattributed, undescribed entries "Editor's Note"
+                if i >= len(descriptions) :
+                    description = None
+                else :
+                    description = descriptions[i]
 
                 if len(bylines) == articleCount :
                     author = bylines[i]
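The added bounds check keeps descriptions[i] from raising an IndexError when a section carries more headlines than description strings, e.g. an unattributed "Editor's Note" entry; missing descriptions are padded with None. A sketch with invented counts:

    # Invented data: three headlines but only two descriptions
    descriptions = ['Summary one', 'Summary two']
    for i in range(3):
        if i >= len(descriptions):
            description = None   # undescribed entry, as in the diff
        else:
            description = descriptions[i]
        print i, description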
@@ -242,10 +259,10 @@ class NYTimes(BasicNewsRecipe):
                         if url == article['url'] :
                             duplicateFound = True
                             break
 
                 if duplicateFound:
                     # Continue fetching, don't add this article
                     continue
 
                 if not articles.has_key(feed):
                     articles[feed] = []
@@ -254,7 +271,7 @@ class NYTimes(BasicNewsRecipe):
                                 description=description, author=author, content=''))
 
         ans = self.sort_index_by(ans, {'Top Stories':-1})
         ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
         return ans
 
     def strip_anchors(self,soup):
@@ -270,7 +287,7 @@ class NYTimes(BasicNewsRecipe):
 #        refresh = soup.find('meta', {'http-equiv':'refresh'})
 #        if refresh is None:
 #            return self.strip_anchors(soup)
 #
 #        content = refresh.get('content').partition('=')[2]
 #        raw = self.browser.open('http://www.nytimes.com'+content).read()
 #        soup = BeautifulSoup(raw.decode('cp1252', 'replace'))
@@ -280,7 +297,7 @@ class NYTimes(BasicNewsRecipe):
             content = refresh.get('content').partition('=')[2]
             raw = self.browser.open('http://www.nytimes.com'+content).read()
             soup = BeautifulSoup(raw.decode('cp1252', 'replace'))
 
         soup = self.strip_anchors(soup)
 
         # Test for empty content
@@ -291,7 +308,7 @@ class NYTimes(BasicNewsRecipe):
             return soup
         else:
             print "no allowed content found, removing article"
-            raise Exception()
+            raise StringError
 
     def postprocess_html(self,soup, True):
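One caveat on the new raise: StringError is neither a Python builtin nor defined anywhere in this recipe, so evaluating the name itself raises a NameError at runtime. An exception still propagates and the empty article is dropped, just not under the type the code names. A sketch of the effective behaviour:

    # StringError is undefined, so the raise statement itself fails
    # with NameError; either way the article download is abandoned
    try:
        raise StringError
    except NameError, e:
        print "article removed:", e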
@@ -334,7 +351,7 @@ class NYTimes(BasicNewsRecipe):
                 bTag = Tag(soup, "b")
                 bTag.insert(0, subhead.contents[0])
                 subhead.replaceWith(bTag)
 
         # Synthesize a section header
         dsk = soup.find('meta', attrs={'name':'dsk'})
         if dsk is not None and dsk.has_key('content'):
@@ -343,12 +360,12 @@ class NYTimes(BasicNewsRecipe):
             hTag.insert(0,NavigableString(dsk['content']))
             articleTag = soup.find(True, attrs={'id':'article'})
             articleTag.insert(0,hTag)
 
         # Add class="articleBody" to <div> so we can format with CSS
         divTag = soup.find('div',attrs={'id':'articleBody'})
         if divTag is not None :
             divTag['class'] = divTag['id']
 
         # Add class="authorId" to <div> so we can format with CSS
         divTag = soup.find('div',attrs={'id':'authorId'})
         if divTag is not None :