Updated NY Times

2025-07-09 03:04:10 -04:00 · 2011-01-18 13:49:12 -07:00 · 2011-01-18 13:49:12 -07:00 · ebda738c81
commit ebda738c81
parent 54fb874621
1 changed files with 161 additions and 114 deletions
--- a/resources/recipes/nytimes_sub.recipe
+++ b/resources/recipes/nytimes_sub.recipe
@@ -159,6 +159,11 @@ class NYTimes(BasicNewsRecipe):
                            'relatedSearchesModule',
                            'side_tool',
                            'singleAd',
                            'entry entry-utility', #added for DealBook
                            'entry-tags', #added for DealBook
                            'footer promos clearfix', #added for DealBook
                            'footer links clearfix', #added for DealBook
                            'inlineImage module', #added for DealBook
                            re.compile('^subNavigation'),
                            re.compile('^leaderboard'),
                            re.compile('^module'),
@@ -192,6 +197,9 @@ class NYTimes(BasicNewsRecipe):
                            'side_index',
                            'side_tool',
                            'toolsRight',
                            'skybox', #added for DealBook
                            'TopAd', #added for DealBook
                            'related-content', #added for DealBook
                            ]),
                   dict(name=['script', 'noscript', 'style','form','hr'])]
    no_stylesheets = True
@@ -246,7 +254,7 @@ class NYTimes(BasicNewsRecipe):
    def exclude_url(self,url):
        if not url.startswith("http"):
            return True
-        if not url.endswith(".html"):
+        if not url.endswith(".html") and 'dealbook.nytimes.com' not in url: #added for DealBook
            return True
        if 'nytimes.com' not in url:
            return True
@@ -569,7 +577,6 @@ class NYTimes(BasicNewsRecipe):
    def preprocess_html(self, soup):
        if self.webEdition & (self.oldest_article>0):
            date_tag = soup.find(True,attrs={'class': ['dateline','date']})
            if date_tag:
@@ -592,128 +599,168 @@ class NYTimes(BasicNewsRecipe):
                img_div = soup.find('div','inlineImage module')
                if img_div:
                    img_div.extract()
        return self.strip_anchors(soup)
    def postprocess_html(self,soup, True):
 		try:
 			if self.one_picture_per_article:
 				# Remove all images after first
 				largeImg = soup.find(True, {'class':'articleSpanImage'})
 				inlineImgs = soup.findAll(True, {'class':'inlineImage module'})
 				if largeImg:
 					for inlineImg in inlineImgs:
 						inlineImg.extract()
 				else:
 					if inlineImgs:
 						firstImg = inlineImgs[0]
 						for inlineImg in inlineImgs[1:]:
 							inlineImg.extract()
 						# Move firstImg before article body
 						cgFirst = soup.find(True, {'class':re.compile('columnGroup  *first')})
 						if cgFirst:
 							# Strip all sibling NavigableStrings: noise
 							navstrings = cgFirst.findAll(text=True, recursive=False)
 							[ns.extract() for ns in navstrings]
 							headline_found = False
 							tag = cgFirst.find(True)
 							insertLoc = 0
 							while True:
 								insertLoc += 1
 								if hasattr(tag,'class') and tag['class'] == 'articleHeadline':
 										headline_found = True
 										break
 								tag = tag.nextSibling
 								if not tag:
 									headline_found = False
 									break
 							if headline_found:
 								cgFirst.insert(insertLoc,firstImg)
 						else:
 							self.log(">>> No class:'columnGroup first' found <<<")
 		except:
 			self.log("ERROR: One picture per article in postprocess_html")
-		try:
+        try:
-			# Change captions to italic
+                if self.one_picture_per_article:
-			for caption in soup.findAll(True, {'class':'caption'}) :
+                        # Remove all images after first
-				if caption and len(caption) > 0:
+                        largeImg = soup.find(True, {'class':'articleSpanImage'})
-					cTag = Tag(soup, "p", [("class", "caption")])
+                        inlineImgs = soup.findAll(True, {'class':'inlineImage module'})
-					c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
+                        if largeImg:
-					mp_off = c.find("More Photos")
+                                for inlineImg in inlineImgs:
-					if mp_off >= 0:
+                                        inlineImg.extract()
-						c = c[:mp_off]
+                        else:
-					cTag.insert(0, c)
+                                if inlineImgs:
-					caption.replaceWith(cTag)
+                                        firstImg = inlineImgs[0]
-		except:
+                                        for inlineImg in inlineImgs[1:]:
-			self.log("ERROR:  Problem in change captions to italic")
+                                                inlineImg.extract()
                                        # Move firstImg before article body
                                        cgFirst = soup.find(True, {'class':re.compile('columnGroup  *first')})
                                        if cgFirst:
                                                # Strip all sibling NavigableStrings: noise
                                                navstrings = cgFirst.findAll(text=True, recursive=False)
                                                [ns.extract() for ns in navstrings]
                                                headline_found = False
                                                tag = cgFirst.find(True)
                                                insertLoc = 0
                                                while True:
                                                        insertLoc += 1
                                                        if hasattr(tag,'class') and tag['class'] == 'articleHeadline':
                                                                        headline_found = True
                                                                        break
                                                        tag = tag.nextSibling
                                                        if not tag:
                                                                headline_found = False
                                                                break
                                                if headline_found:
                                                        cgFirst.insert(insertLoc,firstImg)
                                        else:
                                                self.log(">>> No class:'columnGroup first' found <<<")
        except:
                self.log("ERROR: One picture per article in postprocess_html")
-		try:
+        try:
-			# Change <nyt_headline> to <h2>
+                # Change captions to italic
-			h1 = soup.find('h1')
+                for caption in soup.findAll(True, {'class':'caption'}) :
-			if h1:
+                        if caption and len(caption) > 0:
-				headline = h1.find("nyt_headline")
+                                cTag = Tag(soup, "p", [("class", "caption")])
-				if headline:
+                                c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
-					tag = Tag(soup, "h2")
+                                mp_off = c.find("More Photos")
-					tag['class'] = "headline"
+                                if mp_off >= 0:
-					tag.insert(0, self.fixChars(headline.contents[0]))
+                                        c = c[:mp_off]
-					h1.replaceWith(tag)
+                                cTag.insert(0, c)
-			else:
+                                caption.replaceWith(cTag)
-				# Blog entry - replace headline, remove <hr> tags
+        except:
-				headline = soup.find('title')
+                self.log("ERROR:  Problem in change captions to italic")
 				if headline:
 					tag = Tag(soup, "h2")
 					tag['class'] = "headline"
 					tag.insert(0, self.fixChars(headline.contents[0]))
 					soup.insert(0, tag)
 					hrs = soup.findAll('hr')
 					for hr in hrs:
 						hr.extract()
 		except:
 			self.log("ERROR:  Problem in Change <nyt_headline> to <h2>")
-		try:
+        try:
-			# Change <h1> to <h3> - used in editorial blogs
+                # Change <nyt_headline> to <h2>
-			masthead = soup.find("h1")
+                h1 = soup.find('h1')
-			if masthead:
+                blogheadline = str(h1) #added for dealbook
-				# Nuke the href
+                if h1:
-				if masthead.a:
+                        headline = h1.find("nyt_headline")
-					del(masthead.a['href'])
+                        if headline:
-				tag = Tag(soup, "h3")
+                                tag = Tag(soup, "h2")
-				tag.insert(0, self.fixChars(masthead.contents[0]))
+                                tag['class'] = "headline"
-				masthead.replaceWith(tag)
+                                tag.insert(0, self.fixChars(headline.contents[0]))
-		except:
+                                h1.replaceWith(tag)
-			self.log("ERROR:  Problem in Change <h1> to <h3> - used in editorial blogs")
+                        elif blogheadline.find('entry-title'):#added for dealbook
                                tag = Tag(soup, "h2")#added for dealbook
                                tag['class'] = "headline"#added for dealbook
                                tag.insert(0, self.fixChars(h1.contents[0]))#added for dealbook
                                h1.replaceWith(tag)#added for dealbook
-		try:
+                else:
-			# Change <span class="bold"> to <b>
+                        # Blog entry - replace headline, remove <hr> tags  - BCC I think this is no longer functional 1-18-2011
-			for subhead in soup.findAll(True, {'class':'bold'}) :
+                        headline = soup.find('title')
-				if subhead.contents:
+                        if headline:
-					bTag = Tag(soup, "b")
+                                tag = Tag(soup, "h2")
-					bTag.insert(0, subhead.contents[0])
+                                tag['class'] = "headline"
-					subhead.replaceWith(bTag)
+                                tag.insert(0, self.fixChars(headline.renderContents()))
-		except:
+                                soup.insert(0, tag)
-			self.log("ERROR:  Problem in Change <h1> to <h3> - used in editorial blogs")
+                                hrs = soup.findAll('hr')
                                for hr in hrs:
                                        hr.extract()
        except:
                self.log("ERROR:  Problem in Change <nyt_headline> to <h2>")
-		try:
+        try:
-			divTag = soup.find('div',attrs={'id':'articleBody'})
+                #if this is from a blog (dealbook), fix the byline format
-			if divTag:
+                bylineauthor = soup.find('address',attrs={'class':'byline author vcard'})
-				divTag['class'] = divTag['id']
+                if bylineauthor:
-		except:
+                    tag = Tag(soup, "h6")
-			self.log("ERROR:  Problem in soup.find(div,attrs={id:articleBody})")
+                    tag['class'] = "byline"
                    tag.insert(0, self.fixChars(bylineauthor.renderContents()))
                    bylineauthor.replaceWith(tag)
        except:
            self.log("ERROR:  fixing byline author format")
-		try:
+        try:
-			# Add class="authorId" to <div> so we can format with CSS
+                #if this is a blog (dealbook) fix the credit style for the pictures
-			divTag = soup.find('div',attrs={'id':'authorId'})
+                blogcredit = soup.find('div',attrs={'class':'credit'})
-			if divTag and divTag.contents[0]:
+                if blogcredit:
-				tag = Tag(soup, "p")
+                    tag = Tag(soup, "h6")
-				tag['class'] = "authorId"
+                    tag['class'] = "credit"
-				tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
+                    tag.insert(0, self.fixChars(blogcredit.renderContents()))
-								 use_alt=False)))
+                    blogcredit.replaceWith(tag)
-				divTag.replaceWith(tag)
+        except:
-		except:
+            self.log("ERROR:  fixing credit format")
 			self.log("ERROR:  Problem in Add class=authorId to <div> so we can format with CSS")
-		return soup
+
        try:
                # Change <h1> to <h3> - used in editorial blogs
                masthead = soup.find("h1")
                if masthead:
                        # Nuke the href
                        if masthead.a:
                                del(masthead.a['href'])
                        tag = Tag(soup, "h3")
                        tag.insert(0, self.fixChars(masthead.contents[0]))
                        masthead.replaceWith(tag)
        except:
                self.log("ERROR:  Problem in Change <h1> to <h3> - used in editorial blogs")
        try:
                # Change <span class="bold"> to <b>
                for subhead in soup.findAll(True, {'class':'bold'}) :
                        if subhead.contents:
                                bTag = Tag(soup, "b")
                                bTag.insert(0, subhead.contents[0])
                                subhead.replaceWith(bTag)
        except:
                self.log("ERROR:  Problem in Change <h1> to <h3> - used in editorial blogs")
        try:
                #remove the <strong> update tag
                blogupdated = soup.find('span', {'class':'update'})
                if blogupdated:
                    blogupdated.replaceWith("")
        except:
                self.log("ERROR:  Removing strong tag")
        try:
                divTag = soup.find('div',attrs={'id':'articleBody'})
                if divTag:
                        divTag['class'] = divTag['id']
        except:
                self.log("ERROR:  Problem in soup.find(div,attrs={id:articleBody})")
        try:
                # Add class="authorId" to <div> so we can format with CSS
                divTag = soup.find('div',attrs={'id':'authorId'})
                if divTag and divTag.contents[0]:
                        tag = Tag(soup, "p")
                        tag['class'] = "authorId"
                        tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
                                                         use_alt=False)))
                        divTag.replaceWith(tag)
        except:
                self.log("ERROR:  Problem in Add class=authorId to <div> so we can format with CSS")
        return soup
    def populate_article_metadata(self, article, soup, first):
        shortparagraph = ""
        try: