Make postprocess_html in the NY Times recipes more robust

This commit is contained in:
Kovid Goyal 2011-01-17 13:10:10 -07:00
parent aa28b37951
commit 84d1dd94d2
2 changed files with 229 additions and 189 deletions

View File

@ -586,7 +586,7 @@ class NYTimes(BasicNewsRecipe):
return self.strip_anchors(soup) return self.strip_anchors(soup)
def postprocess_html(self,soup, True): def postprocess_html(self,soup, True):
try:
if self.one_picture_per_article: if self.one_picture_per_article:
# Remove all images after first # Remove all images after first
largeImg = soup.find(True, {'class':'articleSpanImage'}) largeImg = soup.find(True, {'class':'articleSpanImage'})
@ -621,10 +621,13 @@ class NYTimes(BasicNewsRecipe):
cgFirst.insert(insertLoc,firstImg) cgFirst.insert(insertLoc,firstImg)
else: else:
self.log(">>> No class:'columnGroup first' found <<<") self.log(">>> No class:'columnGroup first' found <<<")
except:
self.log("ERROR: One picture per article in postprocess_html")
try:
# Change captions to italic # Change captions to italic
for caption in soup.findAll(True, {'class':'caption'}) : for caption in soup.findAll(True, {'class':'caption'}) :
if caption and caption.contents[0]: if caption and len(caption) > 0:
cTag = Tag(soup, "p", [("class", "caption")]) cTag = Tag(soup, "p", [("class", "caption")])
c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip() c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
mp_off = c.find("More Photos") mp_off = c.find("More Photos")
@ -632,7 +635,10 @@ class NYTimes(BasicNewsRecipe):
c = c[:mp_off] c = c[:mp_off]
cTag.insert(0, c) cTag.insert(0, c)
caption.replaceWith(cTag) caption.replaceWith(cTag)
except:
self.log("ERROR: Problem in change captions to italic")
try:
# Change <nyt_headline> to <h2> # Change <nyt_headline> to <h2>
h1 = soup.find('h1') h1 = soup.find('h1')
if h1: if h1:
@ -653,7 +659,10 @@ class NYTimes(BasicNewsRecipe):
hrs = soup.findAll('hr') hrs = soup.findAll('hr')
for hr in hrs: for hr in hrs:
hr.extract() hr.extract()
except:
self.log("ERROR: Problem in Change <nyt_headline> to <h2>")
try:
# Change <h1> to <h3> - used in editorial blogs # Change <h1> to <h3> - used in editorial blogs
masthead = soup.find("h1") masthead = soup.find("h1")
if masthead: if masthead:
@ -663,18 +672,27 @@ class NYTimes(BasicNewsRecipe):
tag = Tag(soup, "h3") tag = Tag(soup, "h3")
tag.insert(0, self.fixChars(masthead.contents[0])) tag.insert(0, self.fixChars(masthead.contents[0]))
masthead.replaceWith(tag) masthead.replaceWith(tag)
except:
self.log("ERROR: Problem in Change <h1> to <h3> - used in editorial blogs")
try:
# Change <span class="bold"> to <b> # Change <span class="bold"> to <b>
for subhead in soup.findAll(True, {'class':'bold'}) : for subhead in soup.findAll(True, {'class':'bold'}) :
if subhead.contents: if subhead.contents:
bTag = Tag(soup, "b") bTag = Tag(soup, "b")
bTag.insert(0, subhead.contents[0]) bTag.insert(0, subhead.contents[0])
subhead.replaceWith(bTag) subhead.replaceWith(bTag)
except:
self.log("ERROR: Problem in Change <h1> to <h3> - used in editorial blogs")
try:
divTag = soup.find('div',attrs={'id':'articleBody'}) divTag = soup.find('div',attrs={'id':'articleBody'})
if divTag: if divTag:
divTag['class'] = divTag['id'] divTag['class'] = divTag['id']
except:
self.log("ERROR: Problem in soup.find(div,attrs={id:articleBody})")
try:
# Add class="authorId" to <div> so we can format with CSS # Add class="authorId" to <div> so we can format with CSS
divTag = soup.find('div',attrs={'id':'authorId'}) divTag = soup.find('div',attrs={'id':'authorId'})
if divTag and divTag.contents[0]: if divTag and divTag.contents[0]:
@ -683,6 +701,8 @@ class NYTimes(BasicNewsRecipe):
tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0], tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
use_alt=False))) use_alt=False)))
divTag.replaceWith(tag) divTag.replaceWith(tag)
except:
self.log("ERROR: Problem in Add class=authorId to <div> so we can format with CSS")
return soup return soup

View File

@ -586,7 +586,7 @@ class NYTimes(BasicNewsRecipe):
return self.strip_anchors(soup) return self.strip_anchors(soup)
def postprocess_html(self,soup, True): def postprocess_html(self,soup, True):
try:
if self.one_picture_per_article: if self.one_picture_per_article:
# Remove all images after first # Remove all images after first
largeImg = soup.find(True, {'class':'articleSpanImage'}) largeImg = soup.find(True, {'class':'articleSpanImage'})
@ -621,10 +621,13 @@ class NYTimes(BasicNewsRecipe):
cgFirst.insert(insertLoc,firstImg) cgFirst.insert(insertLoc,firstImg)
else: else:
self.log(">>> No class:'columnGroup first' found <<<") self.log(">>> No class:'columnGroup first' found <<<")
except:
self.log("ERROR: One picture per article in postprocess_html")
try:
# Change captions to italic # Change captions to italic
for caption in soup.findAll(True, {'class':'caption'}) : for caption in soup.findAll(True, {'class':'caption'}) :
if caption and caption.contents[0]: if caption and len(caption) > 0:
cTag = Tag(soup, "p", [("class", "caption")]) cTag = Tag(soup, "p", [("class", "caption")])
c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip() c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
mp_off = c.find("More Photos") mp_off = c.find("More Photos")
@ -632,7 +635,10 @@ class NYTimes(BasicNewsRecipe):
c = c[:mp_off] c = c[:mp_off]
cTag.insert(0, c) cTag.insert(0, c)
caption.replaceWith(cTag) caption.replaceWith(cTag)
except:
self.log("ERROR: Problem in change captions to italic")
try:
# Change <nyt_headline> to <h2> # Change <nyt_headline> to <h2>
h1 = soup.find('h1') h1 = soup.find('h1')
if h1: if h1:
@ -653,7 +659,10 @@ class NYTimes(BasicNewsRecipe):
hrs = soup.findAll('hr') hrs = soup.findAll('hr')
for hr in hrs: for hr in hrs:
hr.extract() hr.extract()
except:
self.log("ERROR: Problem in Change <nyt_headline> to <h2>")
try:
# Change <h1> to <h3> - used in editorial blogs # Change <h1> to <h3> - used in editorial blogs
masthead = soup.find("h1") masthead = soup.find("h1")
if masthead: if masthead:
@ -663,18 +672,27 @@ class NYTimes(BasicNewsRecipe):
tag = Tag(soup, "h3") tag = Tag(soup, "h3")
tag.insert(0, self.fixChars(masthead.contents[0])) tag.insert(0, self.fixChars(masthead.contents[0]))
masthead.replaceWith(tag) masthead.replaceWith(tag)
except:
self.log("ERROR: Problem in Change <h1> to <h3> - used in editorial blogs")
try:
# Change <span class="bold"> to <b> # Change <span class="bold"> to <b>
for subhead in soup.findAll(True, {'class':'bold'}) : for subhead in soup.findAll(True, {'class':'bold'}) :
if subhead.contents: if subhead.contents:
bTag = Tag(soup, "b") bTag = Tag(soup, "b")
bTag.insert(0, subhead.contents[0]) bTag.insert(0, subhead.contents[0])
subhead.replaceWith(bTag) subhead.replaceWith(bTag)
except:
self.log("ERROR: Problem in Change <h1> to <h3> - used in editorial blogs")
try:
divTag = soup.find('div',attrs={'id':'articleBody'}) divTag = soup.find('div',attrs={'id':'articleBody'})
if divTag: if divTag:
divTag['class'] = divTag['id'] divTag['class'] = divTag['id']
except:
self.log("ERROR: Problem in soup.find(div,attrs={id:articleBody})")
try:
# Add class="authorId" to <div> so we can format with CSS # Add class="authorId" to <div> so we can format with CSS
divTag = soup.find('div',attrs={'id':'authorId'}) divTag = soup.find('div',attrs={'id':'authorId'})
if divTag and divTag.contents[0]: if divTag and divTag.contents[0]:
@ -683,6 +701,8 @@ class NYTimes(BasicNewsRecipe):
tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0], tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
use_alt=False))) use_alt=False)))
divTag.replaceWith(tag) divTag.replaceWith(tag)
except:
self.log("ERROR: Problem in Add class=authorId to <div> so we can format with CSS")
return soup return soup
def populate_article_metadata(self, article, soup, first): def populate_article_metadata(self, article, soup, first):