Make postprocess_html in the NY Times recipes more robust

This commit is contained in:
Kovid Goyal 2011-01-17 13:10:10 -07:00
parent aa28b37951
commit 84d1dd94d2
2 changed files with 229 additions and 189 deletions

View File

@ -586,105 +586,125 @@ class NYTimes(BasicNewsRecipe):
return self.strip_anchors(soup) return self.strip_anchors(soup)
def postprocess_html(self,soup, True): def postprocess_html(self,soup, True):
try:
if self.one_picture_per_article:
# Remove all images after first
largeImg = soup.find(True, {'class':'articleSpanImage'})
inlineImgs = soup.findAll(True, {'class':'inlineImage module'})
if largeImg:
for inlineImg in inlineImgs:
inlineImg.extract()
else:
if inlineImgs:
firstImg = inlineImgs[0]
for inlineImg in inlineImgs[1:]:
inlineImg.extract()
# Move firstImg before article body
cgFirst = soup.find(True, {'class':re.compile('columnGroup *first')})
if cgFirst:
# Strip all sibling NavigableStrings: noise
navstrings = cgFirst.findAll(text=True, recursive=False)
[ns.extract() for ns in navstrings]
headline_found = False
tag = cgFirst.find(True)
insertLoc = 0
while True:
insertLoc += 1
if hasattr(tag,'class') and tag['class'] == 'articleHeadline':
headline_found = True
break
tag = tag.nextSibling
if not tag:
headline_found = False
break
if headline_found:
cgFirst.insert(insertLoc,firstImg)
else:
self.log(">>> No class:'columnGroup first' found <<<")
except:
self.log("ERROR: One picture per article in postprocess_html")
if self.one_picture_per_article: try:
# Remove all images after first # Change captions to italic
largeImg = soup.find(True, {'class':'articleSpanImage'}) for caption in soup.findAll(True, {'class':'caption'}) :
inlineImgs = soup.findAll(True, {'class':'inlineImage module'}) if caption and len(caption) > 0:
if largeImg: cTag = Tag(soup, "p", [("class", "caption")])
for inlineImg in inlineImgs: c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
inlineImg.extract() mp_off = c.find("More Photos")
else: if mp_off >= 0:
if inlineImgs: c = c[:mp_off]
firstImg = inlineImgs[0] cTag.insert(0, c)
for inlineImg in inlineImgs[1:]: caption.replaceWith(cTag)
inlineImg.extract() except:
# Move firstImg before article body self.log("ERROR: Problem in change captions to italic")
cgFirst = soup.find(True, {'class':re.compile('columnGroup *first')})
if cgFirst:
# Strip all sibling NavigableStrings: noise
navstrings = cgFirst.findAll(text=True, recursive=False)
[ns.extract() for ns in navstrings]
headline_found = False
tag = cgFirst.find(True)
insertLoc = 0
while True:
insertLoc += 1
if hasattr(tag,'class') and tag['class'] == 'articleHeadline':
headline_found = True
break
tag = tag.nextSibling
if not tag:
headline_found = False
break
if headline_found:
cgFirst.insert(insertLoc,firstImg)
else:
self.log(">>> No class:'columnGroup first' found <<<")
# Change captions to italic try:
for caption in soup.findAll(True, {'class':'caption'}) : # Change <nyt_headline> to <h2>
if caption and caption.contents[0]: h1 = soup.find('h1')
cTag = Tag(soup, "p", [("class", "caption")]) if h1:
c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip() headline = h1.find("nyt_headline")
mp_off = c.find("More Photos") if headline:
if mp_off >= 0: tag = Tag(soup, "h2")
c = c[:mp_off] tag['class'] = "headline"
cTag.insert(0, c) tag.insert(0, self.fixChars(headline.contents[0]))
caption.replaceWith(cTag) h1.replaceWith(tag)
else:
# Blog entry - replace headline, remove <hr> tags
headline = soup.find('title')
if headline:
tag = Tag(soup, "h2")
tag['class'] = "headline"
tag.insert(0, self.fixChars(headline.contents[0]))
soup.insert(0, tag)
hrs = soup.findAll('hr')
for hr in hrs:
hr.extract()
except:
self.log("ERROR: Problem in Change <nyt_headline> to <h2>")
# Change <nyt_headline> to <h2> try:
h1 = soup.find('h1') # Change <h1> to <h3> - used in editorial blogs
if h1: masthead = soup.find("h1")
headline = h1.find("nyt_headline") if masthead:
if headline: # Nuke the href
tag = Tag(soup, "h2") if masthead.a:
tag['class'] = "headline" del(masthead.a['href'])
tag.insert(0, self.fixChars(headline.contents[0])) tag = Tag(soup, "h3")
h1.replaceWith(tag) tag.insert(0, self.fixChars(masthead.contents[0]))
else: masthead.replaceWith(tag)
# Blog entry - replace headline, remove <hr> tags except:
headline = soup.find('title') self.log("ERROR: Problem in Change <h1> to <h3> - used in editorial blogs")
if headline:
tag = Tag(soup, "h2")
tag['class'] = "headline"
tag.insert(0, self.fixChars(headline.contents[0]))
soup.insert(0, tag)
hrs = soup.findAll('hr')
for hr in hrs:
hr.extract()
# Change <h1> to <h3> - used in editorial blogs try:
masthead = soup.find("h1") # Change <span class="bold"> to <b>
if masthead: for subhead in soup.findAll(True, {'class':'bold'}) :
# Nuke the href if subhead.contents:
if masthead.a: bTag = Tag(soup, "b")
del(masthead.a['href']) bTag.insert(0, subhead.contents[0])
tag = Tag(soup, "h3") subhead.replaceWith(bTag)
tag.insert(0, self.fixChars(masthead.contents[0])) except:
masthead.replaceWith(tag) self.log("ERROR: Problem in Change <h1> to <h3> - used in editorial blogs")
# Change <span class="bold"> to <b> try:
for subhead in soup.findAll(True, {'class':'bold'}) : divTag = soup.find('div',attrs={'id':'articleBody'})
if subhead.contents: if divTag:
bTag = Tag(soup, "b") divTag['class'] = divTag['id']
bTag.insert(0, subhead.contents[0]) except:
subhead.replaceWith(bTag) self.log("ERROR: Problem in soup.find(div,attrs={id:articleBody})")
divTag = soup.find('div',attrs={'id':'articleBody'}) try:
if divTag: # Add class="authorId" to <div> so we can format with CSS
divTag['class'] = divTag['id'] divTag = soup.find('div',attrs={'id':'authorId'})
if divTag and divTag.contents[0]:
tag = Tag(soup, "p")
tag['class'] = "authorId"
tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
use_alt=False)))
divTag.replaceWith(tag)
except:
self.log("ERROR: Problem in Add class=authorId to <div> so we can format with CSS")
# Add class="authorId" to <div> so we can format with CSS return soup
divTag = soup.find('div',attrs={'id':'authorId'})
if divTag and divTag.contents[0]:
tag = Tag(soup, "p")
tag['class'] = "authorId"
tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
use_alt=False)))
divTag.replaceWith(tag)
return soup
def populate_article_metadata(self, article, soup, first): def populate_article_metadata(self, article, soup, first):
shortparagraph = "" shortparagraph = ""

View File

@ -586,105 +586,125 @@ class NYTimes(BasicNewsRecipe):
return self.strip_anchors(soup) return self.strip_anchors(soup)
def postprocess_html(self,soup, True): def postprocess_html(self,soup, True):
try:
if self.one_picture_per_article:
# Remove all images after first
largeImg = soup.find(True, {'class':'articleSpanImage'})
inlineImgs = soup.findAll(True, {'class':'inlineImage module'})
if largeImg:
for inlineImg in inlineImgs:
inlineImg.extract()
else:
if inlineImgs:
firstImg = inlineImgs[0]
for inlineImg in inlineImgs[1:]:
inlineImg.extract()
# Move firstImg before article body
cgFirst = soup.find(True, {'class':re.compile('columnGroup *first')})
if cgFirst:
# Strip all sibling NavigableStrings: noise
navstrings = cgFirst.findAll(text=True, recursive=False)
[ns.extract() for ns in navstrings]
headline_found = False
tag = cgFirst.find(True)
insertLoc = 0
while True:
insertLoc += 1
if hasattr(tag,'class') and tag['class'] == 'articleHeadline':
headline_found = True
break
tag = tag.nextSibling
if not tag:
headline_found = False
break
if headline_found:
cgFirst.insert(insertLoc,firstImg)
else:
self.log(">>> No class:'columnGroup first' found <<<")
except:
self.log("ERROR: One picture per article in postprocess_html")
try:
# Change captions to italic
for caption in soup.findAll(True, {'class':'caption'}) :
if caption and len(caption) > 0:
cTag = Tag(soup, "p", [("class", "caption")])
c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
mp_off = c.find("More Photos")
if mp_off >= 0:
c = c[:mp_off]
cTag.insert(0, c)
caption.replaceWith(cTag)
except:
self.log("ERROR: Problem in change captions to italic")
try:
# Change <nyt_headline> to <h2>
h1 = soup.find('h1')
if h1:
headline = h1.find("nyt_headline")
if headline:
tag = Tag(soup, "h2")
tag['class'] = "headline"
tag.insert(0, self.fixChars(headline.contents[0]))
h1.replaceWith(tag)
else:
# Blog entry - replace headline, remove <hr> tags
headline = soup.find('title')
if headline:
tag = Tag(soup, "h2")
tag['class'] = "headline"
tag.insert(0, self.fixChars(headline.contents[0]))
soup.insert(0, tag)
hrs = soup.findAll('hr')
for hr in hrs:
hr.extract()
except:
self.log("ERROR: Problem in Change <nyt_headline> to <h2>")
if self.one_picture_per_article: try:
# Remove all images after first # Change <h1> to <h3> - used in editorial blogs
largeImg = soup.find(True, {'class':'articleSpanImage'}) masthead = soup.find("h1")
inlineImgs = soup.findAll(True, {'class':'inlineImage module'}) if masthead:
if largeImg: # Nuke the href
for inlineImg in inlineImgs: if masthead.a:
inlineImg.extract() del(masthead.a['href'])
else: tag = Tag(soup, "h3")
if inlineImgs: tag.insert(0, self.fixChars(masthead.contents[0]))
firstImg = inlineImgs[0] masthead.replaceWith(tag)
for inlineImg in inlineImgs[1:]: except:
inlineImg.extract() self.log("ERROR: Problem in Change <h1> to <h3> - used in editorial blogs")
# Move firstImg before article body
cgFirst = soup.find(True, {'class':re.compile('columnGroup *first')})
if cgFirst:
# Strip all sibling NavigableStrings: noise
navstrings = cgFirst.findAll(text=True, recursive=False)
[ns.extract() for ns in navstrings]
headline_found = False
tag = cgFirst.find(True)
insertLoc = 0
while True:
insertLoc += 1
if hasattr(tag,'class') and tag['class'] == 'articleHeadline':
headline_found = True
break
tag = tag.nextSibling
if not tag:
headline_found = False
break
if headline_found:
cgFirst.insert(insertLoc,firstImg)
else:
self.log(">>> No class:'columnGroup first' found <<<")
# Change captions to italic try:
for caption in soup.findAll(True, {'class':'caption'}) : # Change <span class="bold"> to <b>
if caption and caption.contents[0]: for subhead in soup.findAll(True, {'class':'bold'}) :
cTag = Tag(soup, "p", [("class", "caption")]) if subhead.contents:
c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip() bTag = Tag(soup, "b")
mp_off = c.find("More Photos") bTag.insert(0, subhead.contents[0])
if mp_off >= 0: subhead.replaceWith(bTag)
c = c[:mp_off] except:
cTag.insert(0, c) self.log("ERROR: Problem in Change <h1> to <h3> - used in editorial blogs")
caption.replaceWith(cTag)
try:
# Change <nyt_headline> to <h2> divTag = soup.find('div',attrs={'id':'articleBody'})
h1 = soup.find('h1') if divTag:
if h1: divTag['class'] = divTag['id']
headline = h1.find("nyt_headline") except:
if headline: self.log("ERROR: Problem in soup.find(div,attrs={id:articleBody})")
tag = Tag(soup, "h2")
tag['class'] = "headline" try:
tag.insert(0, self.fixChars(headline.contents[0])) # Add class="authorId" to <div> so we can format with CSS
h1.replaceWith(tag) divTag = soup.find('div',attrs={'id':'authorId'})
else: if divTag and divTag.contents[0]:
# Blog entry - replace headline, remove <hr> tags tag = Tag(soup, "p")
headline = soup.find('title') tag['class'] = "authorId"
if headline: tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
tag = Tag(soup, "h2") use_alt=False)))
tag['class'] = "headline" divTag.replaceWith(tag)
tag.insert(0, self.fixChars(headline.contents[0])) except:
soup.insert(0, tag) self.log("ERROR: Problem in Add class=authorId to <div> so we can format with CSS")
hrs = soup.findAll('hr')
for hr in hrs: return soup
hr.extract()
# Change <h1> to <h3> - used in editorial blogs
masthead = soup.find("h1")
if masthead:
# Nuke the href
if masthead.a:
del(masthead.a['href'])
tag = Tag(soup, "h3")
tag.insert(0, self.fixChars(masthead.contents[0]))
masthead.replaceWith(tag)
# Change <span class="bold"> to <b>
for subhead in soup.findAll(True, {'class':'bold'}) :
if subhead.contents:
bTag = Tag(soup, "b")
bTag.insert(0, subhead.contents[0])
subhead.replaceWith(bTag)
divTag = soup.find('div',attrs={'id':'articleBody'})
if divTag:
divTag['class'] = divTag['id']
# Add class="authorId" to <div> so we can format with CSS
divTag = soup.find('div',attrs={'id':'authorId'})
if divTag and divTag.contents[0]:
tag = Tag(soup, "p")
tag['class'] = "authorId"
tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
use_alt=False)))
divTag.replaceWith(tag)
return soup
def populate_article_metadata(self, article, soup, first): def populate_article_metadata(self, article, soup, first):
shortparagraph = "" shortparagraph = ""
try: try: