Updated NY Times

This commit is contained in:
Kovid Goyal 2011-01-18 13:49:12 -07:00
parent 54fb874621
commit ebda738c81

View File

@ -159,6 +159,11 @@ class NYTimes(BasicNewsRecipe):
'relatedSearchesModule', 'relatedSearchesModule',
'side_tool', 'side_tool',
'singleAd', 'singleAd',
'entry entry-utility', #added for DealBook
'entry-tags', #added for DealBook
'footer promos clearfix', #added for DealBook
'footer links clearfix', #added for DealBook
'inlineImage module', #added for DealBook
re.compile('^subNavigation'), re.compile('^subNavigation'),
re.compile('^leaderboard'), re.compile('^leaderboard'),
re.compile('^module'), re.compile('^module'),
@ -192,6 +197,9 @@ class NYTimes(BasicNewsRecipe):
'side_index', 'side_index',
'side_tool', 'side_tool',
'toolsRight', 'toolsRight',
'skybox', #added for DealBook
'TopAd', #added for DealBook
'related-content', #added for DealBook
]), ]),
dict(name=['script', 'noscript', 'style','form','hr'])] dict(name=['script', 'noscript', 'style','form','hr'])]
no_stylesheets = True no_stylesheets = True
@ -246,7 +254,7 @@ class NYTimes(BasicNewsRecipe):
def exclude_url(self,url): def exclude_url(self,url):
if not url.startswith("http"): if not url.startswith("http"):
return True return True
if not url.endswith(".html"): if not url.endswith(".html") and 'dealbook.nytimes.com' not in url: #added for DealBook
return True return True
if 'nytimes.com' not in url: if 'nytimes.com' not in url:
return True return True
@ -569,7 +577,6 @@ class NYTimes(BasicNewsRecipe):
def preprocess_html(self, soup): def preprocess_html(self, soup):
if self.webEdition & (self.oldest_article>0): if self.webEdition & (self.oldest_article>0):
date_tag = soup.find(True,attrs={'class': ['dateline','date']}) date_tag = soup.find(True,attrs={'class': ['dateline','date']})
if date_tag: if date_tag:
@ -592,128 +599,168 @@ class NYTimes(BasicNewsRecipe):
img_div = soup.find('div','inlineImage module') img_div = soup.find('div','inlineImage module')
if img_div: if img_div:
img_div.extract() img_div.extract()
return self.strip_anchors(soup) return self.strip_anchors(soup)
def postprocess_html(self, soup, first_fetch):
    """Tidy the parsed HTML of one downloaded article and return it.

    Every step runs in its own ``try`` block. If a step fails, an
    ``ERROR: ...`` line is written through ``self.log`` and processing
    continues with the next step, so a single malformed element never
    aborts the whole clean-up.

    Steps, in order:

    1. When ``self.one_picture_per_article`` is set, keep at most one
       picture: drop every ``inlineImage module`` element if an
       ``articleSpanImage`` exists, otherwise keep only the first inline
       image and move it to just after the ``articleHeadline`` element.
    2. Replace every ``class="caption"`` element with a ``<p>`` holding
       its text, cut at "More Photos".
    3. Turn the ``<h1>`` headline (``<nyt_headline>`` or DealBook
       ``entry-title``) into ``<h2 class="headline">``; when there is no
       ``<h1>`` at all, build the headline from ``<title>``.
    4. Re-wrap the DealBook byline and picture credit in ``<h6>`` tags.
    5. Turn a remaining ``<h1>`` into ``<h3>``.
    6. Turn ``class="bold"`` elements into ``<b>``.
    7. Remove the ``<span class="update">`` marker.
    8. Copy the id of ``div#articleBody`` into its class, and replace
       ``div#authorId`` with ``<p class="authorId">``.

    :param soup: parsed document exposing ``find``/``findAll``/``insert``
        (BeautifulSoup-style API -- assumed, the import is outside this
        block). It is modified in place.
    :param first_fetch: not used by this method. It was previously spelled
        ``True``, which is a syntax error on Python 3 and shadows the
        builtin on Python 2; positional callers are unaffected by the
        rename. NOTE(review): the name follows the framework hook as I
        understand it -- confirm against the base class.
    :return: the same ``soup`` object that was passed in.
    """
    try:
        if self.one_picture_per_article:
            # Remove all images after first
            largeImg = soup.find(True, {'class':'articleSpanImage'})
            inlineImgs = soup.findAll(True, {'class':'inlineImage module'})
            if largeImg:
                # The large span image is the one picture; drop the rest.
                for inlineImg in inlineImgs:
                    inlineImg.extract()
            elif inlineImgs:
                firstImg = inlineImgs[0]
                for inlineImg in inlineImgs[1:]:
                    inlineImg.extract()
                # Move firstImg before article body
                cgFirst = soup.find(True, {'class':re.compile('columnGroup *first')})
                if cgFirst:
                    # Strip all sibling NavigableStrings: noise
                    for ns in cgFirst.findAll(text=True, recursive=False):
                        ns.extract()
                    headline_found = False
                    insertLoc = 0
                    # Walk the children until the headline is reached;
                    # insertLoc ends up as the slot right after it.  The
                    # loop also ends quietly when there are no child tags.
                    tag = cgFirst.find(True)
                    while tag:
                        insertLoc += 1
                        if hasattr(tag,'class') and tag['class'] == 'articleHeadline':
                            headline_found = True
                            break
                        tag = tag.nextSibling
                    if headline_found:
                        cgFirst.insert(insertLoc,firstImg)
                else:
                    self.log(">>> No class:'columnGroup first' found <<<")
    except Exception:
        self.log("ERROR: One picture per article in postprocess_html")

    try:
        # Change captions to italic
        for caption in soup.findAll(True, {'class':'caption'}):
            if caption and len(caption) > 0:
                cTag = Tag(soup, "p", [("class", "caption")])
                c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
                # Drop the trailing "More Photos" link text, if any.
                mp_off = c.find("More Photos")
                if mp_off >= 0:
                    c = c[:mp_off]
                cTag.insert(0, c)
                caption.replaceWith(cTag)
    except Exception:
        self.log("ERROR: Problem in change captions to italic")

    try:
        # Change <nyt_headline> to <h2>
        h1 = soup.find('h1')
        if h1:
            headline = h1.find("nyt_headline")
            if headline:
                tag = Tag(soup, "h2")
                tag['class'] = "headline"
                tag.insert(0, self.fixChars(headline.contents[0]))
                h1.replaceWith(tag)
            elif 'entry-title' in str(h1):
                # Added for DealBook.  This must be a membership test:
                # str.find() returns -1 (truthy) when the text is absent,
                # which made every <h1> look like a DealBook title.
                tag = Tag(soup, "h2")
                tag['class'] = "headline"
                tag.insert(0, self.fixChars(h1.contents[0]))
                h1.replaceWith(tag)
        else:
            # Blog entry - replace headline, remove <hr> tags - BCC I think this is no longer functional 1-18-2011
            headline = soup.find('title')
            if headline:
                tag = Tag(soup, "h2")
                tag['class'] = "headline"
                tag.insert(0, self.fixChars(headline.renderContents()))
                soup.insert(0, tag)
                for hr in soup.findAll('hr'):
                    hr.extract()
    except Exception:
        self.log("ERROR: Problem in Change <nyt_headline> to <h2>")

    try:
        # If this is from a blog (DealBook), fix the byline format
        bylineauthor = soup.find('address',attrs={'class':'byline author vcard'})
        if bylineauthor:
            tag = Tag(soup, "h6")
            tag['class'] = "byline"
            tag.insert(0, self.fixChars(bylineauthor.renderContents()))
            bylineauthor.replaceWith(tag)
    except Exception:
        self.log("ERROR: fixing byline author format")

    try:
        # If this is a blog (DealBook), fix the credit style for the pictures
        blogcredit = soup.find('div',attrs={'class':'credit'})
        if blogcredit:
            tag = Tag(soup, "h6")
            tag['class'] = "credit"
            tag.insert(0, self.fixChars(blogcredit.renderContents()))
            blogcredit.replaceWith(tag)
    except Exception:
        self.log("ERROR: fixing credit format")

    try:
        # Change <h1> to <h3> - used in editorial blogs
        masthead = soup.find("h1")
        if masthead:
            # Nuke the href
            if masthead.a:
                del masthead.a['href']
            tag = Tag(soup, "h3")
            tag.insert(0, self.fixChars(masthead.contents[0]))
            masthead.replaceWith(tag)
    except Exception:
        self.log("ERROR: Problem in Change <h1> to <h3> - used in editorial blogs")

    try:
        # Change <span class="bold"> to <b>
        for subhead in soup.findAll(True, {'class':'bold'}):
            if subhead.contents:
                bTag = Tag(soup, "b")
                bTag.insert(0, subhead.contents[0])
                subhead.replaceWith(bTag)
    except Exception:
        # This message used to be a copy of the <h1>/<h3> one above,
        # which pointed at the wrong step when reading the log.
        self.log("ERROR: Problem in Change <span class=\"bold\"> to <b>")

    try:
        # Remove the <strong> update tag
        blogupdated = soup.find('span', {'class':'update'})
        if blogupdated:
            blogupdated.replaceWith("")
    except Exception:
        self.log("ERROR: Removing strong tag")

    try:
        # Mirror the id into the class attribute of the article body div.
        divTag = soup.find('div',attrs={'id':'articleBody'})
        if divTag:
            divTag['class'] = divTag['id']
    except Exception:
        self.log("ERROR: Problem in soup.find(div,attrs={id:articleBody})")

    try:
        # Add class="authorId" to <div> so we can format with CSS
        divTag = soup.find('div',attrs={'id':'authorId'})
        # Check that contents is non-empty before indexing so an empty
        # div is skipped instead of raising IndexError.
        if divTag and divTag.contents and divTag.contents[0]:
            tag = Tag(soup, "p")
            tag['class'] = "authorId"
            tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
                                                           use_alt=False)))
            divTag.replaceWith(tag)
    except Exception:
        self.log("ERROR: Problem in Add class=authorId to <div> so we can format with CSS")

    return soup
def populate_article_metadata(self, article, soup, first): def populate_article_metadata(self, article, soup, first):
shortparagraph = "" shortparagraph = ""
try: try: