Fix #4009 (NYT Top Stories fails)
This commit is contained in:
parent 339df810b8
commit 8e004db71b
@@ -6,6 +6,7 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
nytimes.com
'''
import re
import time
from calibre import entity_to_unicode
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment
@@ -14,9 +15,9 @@ class NYTimes(BasicNewsRecipe):

    title = 'New York Times Top Stories'
    __author__ = 'GRiker'
    language = 'en'
    language = _('English')
    description = 'Top Stories from the New York Times'

    # List of sections typically included in Top Stories. Use a keyword from the
    # right column in the excludeSectionKeywords[] list to skip downloading that section
    sections = {
@@ -39,7 +40,7 @@ class NYTimes(BasicNewsRecipe):
        'world'    : 'World'
        }

    # By default, no sections are skipped.
    excludeSectionKeywords = []

    # Add section keywords from the right column above to skip that section
@@ -49,7 +50,7 @@ class NYTimes(BasicNewsRecipe):
    # excludeSectionKeywords = ['Arts','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Top Stories','Travel','U.S.','World']
    # Fetch only Top Stories
    # excludeSectionKeywords = ['Arts','Business','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Technology','Travel','U.S.','World']

    # The maximum number of articles that will be downloaded
    max_articles_per_feed = 40
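For context, the comments above describe a keyword-based skip mechanism: any section whose title matches an entry in excludeSectionKeywords[] is not downloaded. The matching code itself is elided from this hunk, but the skipThisSection flag visible further down suggests a simple substring test per keyword. A hypothetical sketch of that check; section_title is an illustrative stand-in for the scraped title, not a name from the recipe:

    # Hypothetical sketch of the keyword skip; 'section_title' stands in for
    # the title text the recipe scrapes from the Today's Headlines page.
    excludeSectionKeywords = ['Sports', 'Dining']
    section_title = 'Dining & Wine'

    skipThisSection = False
    for keyword in excludeSectionKeywords:
        if keyword in section_title:
            skipThisSection = True
            break

    print skipThisSection   # True: this section's articles are not fetched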
@@ -63,7 +64,7 @@ class NYTimes(BasicNewsRecipe):
                    dict(attrs={ 'id':['toolsRight','inlineBox','sidebarArticles',
                                       'portfolioInline','articleInline','readerscomment',
                                       'nytRating']}) ]

    encoding = 'cp1252'
    no_stylesheets = True
    extra_css = '.headline {text-align: left;}\n \
@@ -79,6 +80,14 @@ class NYTimes(BasicNewsRecipe):
                .authorId {text-align: left; \
                           font-style: italic;}\n '

    # def get_cover_url(self):
    #     st = time.localtime()
    #     year = str(st.tm_year)
    #     month = "%.2d" % st.tm_mon
    #     day = "%.2d" % st.tm_mday
    #     cover = 'http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/nytfrontpage/' + 'scan.jpg'
    #     return cover
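The commented-out get_cover_url above synthesizes a date-stamped URL for the NYT front-page scan. A runnable sketch of the same construction, using a single format string; the URL pattern is taken verbatim from the commented code and may no longer resolve:

    import time

    def nyt_frontpage_scan_url():
        # Build the dated URL get_cover_url assembles above, e.g.
        # http://graphics8.nytimes.com/images/2009/11/15/nytfrontpage/scan.jpg
        st = time.localtime()
        return 'http://graphics8.nytimes.com/images/%d/%.2d/%.2d/nytfrontpage/scan.jpg' % (
            st.tm_year, st.tm_mon, st.tm_mday)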

    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
        if self.username is not None and self.password is not None:
@@ -105,13 +114,13 @@ class NYTimes(BasicNewsRecipe):
                _raw = url_or_raw
            if raw:
                return _raw

            if not isinstance(_raw, unicode) and self.encoding:
                _raw = _raw.decode(docEncoding, 'replace')
            massage = list(BeautifulSoup.MARKUP_MASSAGE)
            massage.append((re.compile(r'&(\S+?);'), lambda match: entity_to_unicode(match, encoding=self.encoding)))
            return BeautifulSoup(_raw, markupMassage=massage)

        # Entry point
        soup = get_the_soup( self.encoding, url_or_raw )
        contentType = soup.find(True,attrs={'http-equiv':'Content-Type'})
@@ -122,7 +131,7 @@ class NYTimes(BasicNewsRecipe):
        if self.verbose > 2:
            self.log( " document encoding: '%s'" % docEncoding)
        if docEncoding != self.encoding :
            soup = get_the_soup(docEncoding, url_or_raw)

        return soup
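The get_the_soup helper above decodes the raw bytes with the page's own charset, then converts stray HTML entities to unicode through a markupMassage hook while parsing. A minimal standalone sketch of the same idea, assuming calibre's entity_to_unicode and the bundled BeautifulSoup 3 API as used in this diff; soup_with_entities is an illustrative name:

    import re
    from calibre import entity_to_unicode
    from calibre.ebooks.BeautifulSoup import BeautifulSoup

    def soup_with_entities(raw, encoding='cp1252'):
        # Decode with the detected/declared encoding, replacing bad bytes...
        if not isinstance(raw, unicode):
            raw = raw.decode(encoding, 'replace')
        # ...then let BeautifulSoup rewrite &entity; references to unicode
        # characters as it parses (the same massage the recipe installs).
        massage = list(BeautifulSoup.MARKUP_MASSAGE)
        massage.append((re.compile(r'&(\S+?);'),
                        lambda match: entity_to_unicode(match, encoding=encoding)))
        return BeautifulSoup(raw, markupMassage=massage)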
@@ -133,7 +142,7 @@ class NYTimes(BasicNewsRecipe):
        feed = key = 'All Top Stories'
        articles[key] = []
        ans.append(key)

        soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')

        # Fetch the outer table
@@ -179,7 +188,7 @@ class NYTimes(BasicNewsRecipe):
            bylines = []
            descriptions = []
            pubdate = None

            # Get the Section title
            for (x,i) in enumerate(sectionblock.contents) :
                skipThisSection = False
@@ -201,22 +210,26 @@ class NYTimes(BasicNewsRecipe):
                    break

            # Get the bylines and descriptions
            if not skipThisSection :
                for (x,i) in enumerate(sectionblock.contents) :

                    # Extract the bylines and descriptions
                    if (i.string is not None) and \
                       (i.string.strip() > "") and \
                       not isinstance(i,Comment):
                        contentString = i.strip().encode('utf-8')
                        if contentString[0:3] == 'By ' and contentString[4].isupper() :
                            bylines.append(contentString)
                        else :
                            descriptions.append(contentString)

            if not skipThisSection :
                lines = sectionblock.contents
                contentStrings = []

                for line in lines:
                    if not isinstance(line, Comment) and line.strip and line.strip() > "":
                        contentStrings.append(line.strip())

                # Gather the byline/description pairs
                bylines = []
                descriptions = []
                for contentString in contentStrings:
                    if contentString[0:3] == 'By ' and contentString[3].isupper() :
                        bylines.append(contentString)
                    else:
                        descriptions.append(contentString)
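Note the off-by-one fix between the two versions of this block: in a byline like 'By JOHN DOE', the first author character sits at index 3, not index 4, so contentString[3].isupper() is the correct test. A quick check of the classifier as rewritten above; classify is an illustrative wrapper, not a recipe method:

    # Strings starting 'By ' followed by an uppercase letter are bylines;
    # everything else is treated as an article description.
    def classify(contentString):
        if contentString[0:3] == 'By ' and contentString[3].isupper():
            return 'byline'
        return 'description'

    print classify('By JOHN DOE')         # byline
    print classify('By the numbers, ...') # description: index 3 is lowercase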

                # Fetch the article titles and URLs
                articleCount = len(sectionblock.findAll('span'))
                for (i,span) in enumerate(sectionblock.findAll('span')) :
                for (i,span) in enumerate(sectionblock.findAll(attrs={'class':'headlineWrapper'})) :
                    a = span.find('a', href=True)
                    url = re.sub(r'\?.*', '', a['href'])
                    url += '?pagewanted=all'
@@ -228,7 +241,11 @@ class NYTimes(BasicNewsRecipe):
                    if not isinstance(title, unicode):
                        title = title.decode('utf-8', 'replace')

                    description = descriptions[i]
                    # Allow for unattributed, undescribed entries "Editor's Note"
                    if i >= len(descriptions) :
                        description = None
                    else :
                        description = descriptions[i]

                    if len(bylines) == articleCount :
                        author = bylines[i]
@@ -242,10 +259,10 @@ class NYTimes(BasicNewsRecipe):
                        if url == article['url'] :
                            duplicateFound = True
                            break

                    if duplicateFound:
                        # Continue fetching, don't add this article
                        continue

                    if not articles.has_key(feed):
                        articles[feed] = []
@@ -254,7 +271,7 @@ class NYTimes(BasicNewsRecipe):
                                        description=description, author=author, content=''))

        ans = self.sort_index_by(ans, {'Top Stories':-1})
        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
        return ans
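The two lines before return ans order and prune the index: the {'Top Stories':-1} weight pulls that section to the front, and the comprehension drops any section that collected no articles. A hedged sketch of the equivalent logic, assuming sort_index_by sorts ascending by per-key weight with a default of 0 and a stable sort; order_sections is an illustrative name:

    # Lower weights sort first (default 0); a stable sort keeps equal-weight
    # sections in their existing order. Sections without articles are dropped.
    def order_sections(ans, articles, weights):
        ans.sort(key=lambda section: weights.get(section, 0))
        return [(section, articles[section]) for section in ans if section in articles]

    # e.g. order_sections(['World', 'Top Stories', 'Arts'],
    #                     {'Top Stories': [...], 'Arts': [...]},
    #                     {'Top Stories': -1})
    # -> [('Top Stories', [...]), ('Arts', [...])]   ('World' had no articles)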

    def strip_anchors(self,soup):
@@ -270,7 +287,7 @@ class NYTimes(BasicNewsRecipe):
        # refresh = soup.find('meta', {'http-equiv':'refresh'})
        # if refresh is None:
        #     return self.strip_anchors(soup)
        #
        # content = refresh.get('content').partition('=')[2]
        # raw = self.browser.open('http://www.nytimes.com'+content).read()
        # soup = BeautifulSoup(raw.decode('cp1252', 'replace'))
@@ -280,7 +297,7 @@ class NYTimes(BasicNewsRecipe):
            content = refresh.get('content').partition('=')[2]
            raw = self.browser.open('http://www.nytimes.com'+content).read()
            soup = BeautifulSoup(raw.decode('cp1252', 'replace'))

        soup = self.strip_anchors(soup)

        # Test for empty content
@@ -291,7 +308,7 @@ class NYTimes(BasicNewsRecipe):
            return soup
        else:
            print "no allowed content found, removing article"
            raise Exception()
            raise StringError

    def postprocess_html(self,soup, True):
@@ -334,7 +351,7 @@ class NYTimes(BasicNewsRecipe):
                bTag = Tag(soup, "b")
                bTag.insert(0, subhead.contents[0])
                subhead.replaceWith(bTag)

        # Synthesize a section header
        dsk = soup.find('meta', attrs={'name':'dsk'})
        if dsk is not None and dsk.has_key('content'):
@@ -343,12 +360,12 @@ class NYTimes(BasicNewsRecipe):
            hTag.insert(0,NavigableString(dsk['content']))
            articleTag = soup.find(True, attrs={'id':'article'})
            articleTag.insert(0,hTag)

        # Add class="articleBody" to <div> so we can format with CSS
        divTag = soup.find('div',attrs={'id':'articleBody'})
        if divTag is not None :
            divTag['class'] = divTag['id']

        # Add class="authorId" to <div> so we can format with CSS
        divTag = soup.find('div',attrs={'id':'authorId'})
        if divTag is not None :
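The two blocks above share one pattern: copy a <div>'s id onto its class so the class selectors in extra_css (.articleBody, .authorId) can target it. A generic sketch of that pattern against the bundled BeautifulSoup 3; the helper name is illustrative, not part of the recipe:

    from calibre.ebooks.BeautifulSoup import BeautifulSoup

    def promote_id_to_class(soup, div_id):
        # Find <div id="..."> and set class="..." so the recipe's
        # extra_css class rules apply to it.
        divTag = soup.find('div', attrs={'id': div_id})
        if divTag is not None:
            divTag['class'] = divTag['id']
        return divTag

    # Usage, as in postprocess_html above:
    # promote_id_to_class(soup, 'articleBody')
    # promote_id_to_class(soup, 'authorId')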