Make postprocess_html in the NY Times recipes more robust

2025-07-09 03:04:10 -04:00 · 2011-01-17 13:10:10 -07:00 · 2011-01-17 13:10:10 -07:00 · 84d1dd94d2
commit 84d1dd94d2
parent aa28b37951
2 changed files with 229 additions and 189 deletions
--- a/resources/recipes/nytimes.recipe
+++ b/resources/recipes/nytimes.recipe
@@ -586,105 +586,125 @@ class NYTimes(BasicNewsRecipe):
        return self.strip_anchors(soup)
    def postprocess_html(self,soup, True):
 		try:
 			if self.one_picture_per_article:
 				# Remove all images after first
 				largeImg = soup.find(True, {'class':'articleSpanImage'})
 				inlineImgs = soup.findAll(True, {'class':'inlineImage module'})
 				if largeImg:
 					for inlineImg in inlineImgs:
 						inlineImg.extract()
 				else:
 					if inlineImgs:
 						firstImg = inlineImgs[0]
 						for inlineImg in inlineImgs[1:]:
 							inlineImg.extract()
 						# Move firstImg before article body
 						cgFirst = soup.find(True, {'class':re.compile('columnGroup  *first')})
 						if cgFirst:
 							# Strip all sibling NavigableStrings: noise
 							navstrings = cgFirst.findAll(text=True, recursive=False)
 							[ns.extract() for ns in navstrings]
 							headline_found = False
 							tag = cgFirst.find(True)
 							insertLoc = 0
 							while True:
 								insertLoc += 1
 								if hasattr(tag,'class') and tag['class'] == 'articleHeadline':
 										headline_found = True
 										break
 								tag = tag.nextSibling
 								if not tag:
 									headline_found = False
 									break
 							if headline_found:
 								cgFirst.insert(insertLoc,firstImg)
 						else:
 							self.log(">>> No class:'columnGroup first' found <<<")
 		except:
 			self.log("ERROR: One picture per article in postprocess_html")
-        if self.one_picture_per_article:
+		try:
-            # Remove all images after first
+			# Change captions to italic
-            largeImg = soup.find(True, {'class':'articleSpanImage'})
+			for caption in soup.findAll(True, {'class':'caption'}) :
-            inlineImgs = soup.findAll(True, {'class':'inlineImage module'})
+				if caption and len(caption) > 0:
-            if largeImg:
+					cTag = Tag(soup, "p", [("class", "caption")])
-                for inlineImg in inlineImgs:
+					c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
-                    inlineImg.extract()
+					mp_off = c.find("More Photos")
-            else:
+					if mp_off >= 0:
-                if inlineImgs:
+						c = c[:mp_off]
-                    firstImg = inlineImgs[0]
+					cTag.insert(0, c)
-                    for inlineImg in inlineImgs[1:]:
+					caption.replaceWith(cTag)
-                        inlineImg.extract()
+		except:
-                    # Move firstImg before article body
+			self.log("ERROR:  Problem in change captions to italic")
                    cgFirst = soup.find(True, {'class':re.compile('columnGroup  *first')})
                    if cgFirst:
                        # Strip all sibling NavigableStrings: noise
                        navstrings = cgFirst.findAll(text=True, recursive=False)
                        [ns.extract() for ns in navstrings]
                        headline_found = False
                        tag = cgFirst.find(True)
                        insertLoc = 0
                        while True:
                            insertLoc += 1
                            if hasattr(tag,'class') and tag['class'] == 'articleHeadline':
                                    headline_found = True
                                    break
                            tag = tag.nextSibling
                            if not tag:
                                headline_found = False
                                break
                        if headline_found:
                            cgFirst.insert(insertLoc,firstImg)
                    else:
                        self.log(">>> No class:'columnGroup first' found <<<")
-        # Change captions to italic
+		try:
-        for caption in soup.findAll(True, {'class':'caption'}) :
+			# Change <nyt_headline> to <h2>
-            if caption and caption.contents[0]:
+			h1 = soup.find('h1')
-                cTag = Tag(soup, "p", [("class", "caption")])
+			if h1:
-                c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
+				headline = h1.find("nyt_headline")
-                mp_off = c.find("More Photos")
+				if headline:
-                if mp_off >= 0:
+					tag = Tag(soup, "h2")
-                    c = c[:mp_off]
+					tag['class'] = "headline"
-                cTag.insert(0, c)
+					tag.insert(0, self.fixChars(headline.contents[0]))
-                caption.replaceWith(cTag)
+					h1.replaceWith(tag)
 			else:
 				# Blog entry - replace headline, remove <hr> tags
 				headline = soup.find('title')
 				if headline:
 					tag = Tag(soup, "h2")
 					tag['class'] = "headline"
 					tag.insert(0, self.fixChars(headline.contents[0]))
 					soup.insert(0, tag)
 					hrs = soup.findAll('hr')
 					for hr in hrs:
 						hr.extract()
 		except:
 			self.log("ERROR:  Problem in Change <nyt_headline> to <h2>")
-        # Change <nyt_headline> to <h2>
+		try:
-        h1 = soup.find('h1')
+			# Change <h1> to <h3> - used in editorial blogs
-        if h1:
+			masthead = soup.find("h1")
-            headline = h1.find("nyt_headline")
+			if masthead:
-            if headline:
+				# Nuke the href
-                tag = Tag(soup, "h2")
+				if masthead.a:
-                tag['class'] = "headline"
+					del(masthead.a['href'])
-                tag.insert(0, self.fixChars(headline.contents[0]))
+				tag = Tag(soup, "h3")
-                h1.replaceWith(tag)
+				tag.insert(0, self.fixChars(masthead.contents[0]))
-        else:
+				masthead.replaceWith(tag)
-            # Blog entry - replace headline, remove <hr> tags
+		except:
-            headline = soup.find('title')
+			self.log("ERROR:  Problem in Change <h1> to <h3> - used in editorial blogs")
            if headline:
                tag = Tag(soup, "h2")
                tag['class'] = "headline"
                tag.insert(0, self.fixChars(headline.contents[0]))
                soup.insert(0, tag)
                hrs = soup.findAll('hr')
                for hr in hrs:
                    hr.extract()
-        # Change <h1> to <h3> - used in editorial blogs
+		try:
-        masthead = soup.find("h1")
+			# Change <span class="bold"> to <b>
-        if masthead:
+			for subhead in soup.findAll(True, {'class':'bold'}) :
-            # Nuke the href
+				if subhead.contents:
-            if masthead.a:
+					bTag = Tag(soup, "b")
-                del(masthead.a['href'])
+					bTag.insert(0, subhead.contents[0])
-            tag = Tag(soup, "h3")
+					subhead.replaceWith(bTag)
-            tag.insert(0, self.fixChars(masthead.contents[0]))
+		except:
-            masthead.replaceWith(tag)
+			self.log("ERROR:  Problem in Change <h1> to <h3> - used in editorial blogs")
-        # Change <span class="bold"> to <b>
+		try:
-        for subhead in soup.findAll(True, {'class':'bold'}) :
+			divTag = soup.find('div',attrs={'id':'articleBody'})
-            if subhead.contents:
+			if divTag:
-                bTag = Tag(soup, "b")
+				divTag['class'] = divTag['id']
-                bTag.insert(0, subhead.contents[0])
+		except:
-                subhead.replaceWith(bTag)
+			self.log("ERROR:  Problem in soup.find(div,attrs={id:articleBody})")
-        divTag = soup.find('div',attrs={'id':'articleBody'})
+		try:
-        if divTag:
+			# Add class="authorId" to <div> so we can format with CSS
-            divTag['class'] = divTag['id']
+			divTag = soup.find('div',attrs={'id':'authorId'})
 			if divTag and divTag.contents[0]:
 				tag = Tag(soup, "p")
 				tag['class'] = "authorId"
 				tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
 								 use_alt=False)))
 				divTag.replaceWith(tag)
 		except:
 			self.log("ERROR:  Problem in Add class=authorId to <div> so we can format with CSS")
-        # Add class="authorId" to <div> so we can format with CSS
+		return soup
        divTag = soup.find('div',attrs={'id':'authorId'})
        if divTag and divTag.contents[0]:
            tag = Tag(soup, "p")
            tag['class'] = "authorId"
            tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
                             use_alt=False)))
            divTag.replaceWith(tag)
        return soup
    def populate_article_metadata(self, article, soup, first):
        shortparagraph = ""
--- a/resources/recipes/nytimes_sub.recipe
+++ b/resources/recipes/nytimes_sub.recipe
@@ -586,105 +586,125 @@ class NYTimes(BasicNewsRecipe):
        return self.strip_anchors(soup)
    def postprocess_html(self,soup, True):
 		try:
 			if self.one_picture_per_article:
 				# Remove all images after first
 				largeImg = soup.find(True, {'class':'articleSpanImage'})
 				inlineImgs = soup.findAll(True, {'class':'inlineImage module'})
 				if largeImg:
 					for inlineImg in inlineImgs:
 						inlineImg.extract()
 				else:
 					if inlineImgs:
 						firstImg = inlineImgs[0]
 						for inlineImg in inlineImgs[1:]:
 							inlineImg.extract()
 						# Move firstImg before article body
 						cgFirst = soup.find(True, {'class':re.compile('columnGroup  *first')})
 						if cgFirst:
 							# Strip all sibling NavigableStrings: noise
 							navstrings = cgFirst.findAll(text=True, recursive=False)
 							[ns.extract() for ns in navstrings]
 							headline_found = False
 							tag = cgFirst.find(True)
 							insertLoc = 0
 							while True:
 								insertLoc += 1
 								if hasattr(tag,'class') and tag['class'] == 'articleHeadline':
 										headline_found = True
 										break
 								tag = tag.nextSibling
 								if not tag:
 									headline_found = False
 									break
 							if headline_found:
 								cgFirst.insert(insertLoc,firstImg)
 						else:
 							self.log(">>> No class:'columnGroup first' found <<<")
 		except:
 			self.log("ERROR: One picture per article in postprocess_html")
 		try:
 			# Change captions to italic
 			for caption in soup.findAll(True, {'class':'caption'}) :
 				if caption and len(caption) > 0:
 					cTag = Tag(soup, "p", [("class", "caption")])
 					c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
 					mp_off = c.find("More Photos")
 					if mp_off >= 0:
 						c = c[:mp_off]
 					cTag.insert(0, c)
 					caption.replaceWith(cTag)
 		except:
 			self.log("ERROR:  Problem in change captions to italic")
 		try:
 			# Change <nyt_headline> to <h2>
 			h1 = soup.find('h1')
 			if h1:
 				headline = h1.find("nyt_headline")
 				if headline:
 					tag = Tag(soup, "h2")
 					tag['class'] = "headline"
 					tag.insert(0, self.fixChars(headline.contents[0]))
 					h1.replaceWith(tag)
 			else:
 				# Blog entry - replace headline, remove <hr> tags
 				headline = soup.find('title')
 				if headline:
 					tag = Tag(soup, "h2")
 					tag['class'] = "headline"
 					tag.insert(0, self.fixChars(headline.contents[0]))
 					soup.insert(0, tag)
 					hrs = soup.findAll('hr')
 					for hr in hrs:
 						hr.extract()
 		except:
 			self.log("ERROR:  Problem in Change <nyt_headline> to <h2>")
-        if self.one_picture_per_article:
+		try:
-            # Remove all images after first
+			# Change <h1> to <h3> - used in editorial blogs
-            largeImg = soup.find(True, {'class':'articleSpanImage'})
+			masthead = soup.find("h1")
-            inlineImgs = soup.findAll(True, {'class':'inlineImage module'})
+			if masthead:
-            if largeImg:
+				# Nuke the href
-                for inlineImg in inlineImgs:
+				if masthead.a:
-                    inlineImg.extract()
+					del(masthead.a['href'])
-            else:
+				tag = Tag(soup, "h3")
-                if inlineImgs:
+				tag.insert(0, self.fixChars(masthead.contents[0]))
-                    firstImg = inlineImgs[0]
+				masthead.replaceWith(tag)
-                    for inlineImg in inlineImgs[1:]:
+		except:
-                        inlineImg.extract()
+			self.log("ERROR:  Problem in Change <h1> to <h3> - used in editorial blogs")
                    # Move firstImg before article body
                    cgFirst = soup.find(True, {'class':re.compile('columnGroup  *first')})
                    if cgFirst:
                        # Strip all sibling NavigableStrings: noise
                        navstrings = cgFirst.findAll(text=True, recursive=False)
                        [ns.extract() for ns in navstrings]
                        headline_found = False
                        tag = cgFirst.find(True)
                        insertLoc = 0
                        while True:
                            insertLoc += 1
                            if hasattr(tag,'class') and tag['class'] == 'articleHeadline':
                                    headline_found = True
                                    break
                            tag = tag.nextSibling
                            if not tag:
                                headline_found = False
                                break
                        if headline_found:
                            cgFirst.insert(insertLoc,firstImg)
                    else:
                        self.log(">>> No class:'columnGroup first' found <<<")
-        # Change captions to italic
+		try:		
-        for caption in soup.findAll(True, {'class':'caption'}) :
+			# Change <span class="bold"> to <b>
-            if caption and caption.contents[0]:
+			for subhead in soup.findAll(True, {'class':'bold'}) :
-                cTag = Tag(soup, "p", [("class", "caption")])
+				if subhead.contents:
-                c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
+					bTag = Tag(soup, "b")
-                mp_off = c.find("More Photos")
+					bTag.insert(0, subhead.contents[0])
-                if mp_off >= 0:
+					subhead.replaceWith(bTag)
-                    c = c[:mp_off]
+		except:
-                cTag.insert(0, c)
+			self.log("ERROR:  Problem in Change <h1> to <h3> - used in editorial blogs")
-                caption.replaceWith(cTag)
+		
-
+		try:		
-        # Change <nyt_headline> to <h2>
+			divTag = soup.find('div',attrs={'id':'articleBody'})
-        h1 = soup.find('h1')
+			if divTag:
-        if h1:
+				divTag['class'] = divTag['id']
-            headline = h1.find("nyt_headline")
+		except:
-            if headline:
+			self.log("ERROR:  Problem in soup.find(div,attrs={id:articleBody})")
-                tag = Tag(soup, "h2")
+			
-                tag['class'] = "headline"
+		try:	
-                tag.insert(0, self.fixChars(headline.contents[0]))
+			# Add class="authorId" to <div> so we can format with CSS
-                h1.replaceWith(tag)
+			divTag = soup.find('div',attrs={'id':'authorId'})
-        else:
+			if divTag and divTag.contents[0]:
-            # Blog entry - replace headline, remove <hr> tags
+				tag = Tag(soup, "p")
-            headline = soup.find('title')
+				tag['class'] = "authorId"
-            if headline:
+				tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
-                tag = Tag(soup, "h2")
+								 use_alt=False)))
-                tag['class'] = "headline"
+				divTag.replaceWith(tag)		
-                tag.insert(0, self.fixChars(headline.contents[0]))
+		except:
-                soup.insert(0, tag)
+			self.log("ERROR:  Problem in Add class=authorId to <div> so we can format with CSS")
-                hrs = soup.findAll('hr')
+		
-                for hr in hrs:
+		return soup
                    hr.extract()
        # Change <h1> to <h3> - used in editorial blogs
        masthead = soup.find("h1")
        if masthead:
            # Nuke the href
            if masthead.a:
                del(masthead.a['href'])
            tag = Tag(soup, "h3")
            tag.insert(0, self.fixChars(masthead.contents[0]))
            masthead.replaceWith(tag)
        # Change <span class="bold"> to <b>
        for subhead in soup.findAll(True, {'class':'bold'}) :
            if subhead.contents:
                bTag = Tag(soup, "b")
                bTag.insert(0, subhead.contents[0])
                subhead.replaceWith(bTag)
        divTag = soup.find('div',attrs={'id':'articleBody'})
        if divTag:
            divTag['class'] = divTag['id']
        # Add class="authorId" to <div> so we can format with CSS
        divTag = soup.find('div',attrs={'id':'authorId'})
        if divTag and divTag.contents[0]:
            tag = Tag(soup, "p")
            tag['class'] = "authorId"
            tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
                             use_alt=False)))
            divTag.replaceWith(tag)
        return soup
    def populate_article_metadata(self, article, soup, first):
        shortparagraph = ""
        try: