Updated NY Times

2026-03-31 14:22:30 -04:00 · 2011-01-18 13:49:12 -07:00 · 2011-01-18 13:49:12 -07:00 · ebda738c81
commit ebda738c81
parent 54fb874621
1 changed files with 161 additions and 114 deletions
--- a/resources/recipes/nytimes_sub.recipe
+++ b/resources/recipes/nytimes_sub.recipe
@ -159,6 +159,11 @@ class NYTimes(BasicNewsRecipe):
                            'relatedSearchesModule',
                            'side_tool',
                            'singleAd',
+                            'entry entry-utility', #added for DealBook
+                            'entry-tags', #added for DealBook
+                            'footer promos clearfix', #added for DealBook
+                            'footer links clearfix', #added for DealBook
+                            'inlineImage module', #added for DealBook
                            re.compile('^subNavigation'),
                            re.compile('^leaderboard'),
                            re.compile('^module'),
@ -192,6 +197,9 @@ class NYTimes(BasicNewsRecipe):
                            'side_index',
                            'side_tool',
                            'toolsRight',
+                            'skybox', #added for DealBook
+                            'TopAd', #added for DealBook
+                            'related-content', #added for DealBook
                            ]),
                   dict(name=['script', 'noscript', 'style','form','hr'])]
    no_stylesheets = True
@ -246,7 +254,7 @@ class NYTimes(BasicNewsRecipe):
    def exclude_url(self,url):
        if not url.startswith("http"):
            return True
-        if not url.endswith(".html"):
+        if not url.endswith(".html") and 'dealbook.nytimes.com' not in url: #added for DealBook
            return True
        if 'nytimes.com' not in url:
            return True
@ -569,7 +577,6 @@ class NYTimes(BasicNewsRecipe):


    def preprocess_html(self, soup):
-
        if self.webEdition & (self.oldest_article>0):
            date_tag = soup.find(True,attrs={'class': ['dateline','date']})
            if date_tag:
@ -592,128 +599,168 @@ class NYTimes(BasicNewsRecipe):
                img_div = soup.find('div','inlineImage module')
                if img_div:
                    img_div.extract()
+
+
        return self.strip_anchors(soup)

    def postprocess_html(self,soup, True):
-		try:
-			if self.one_picture_per_article:
-				# Remove all images after first
-				largeImg = soup.find(True, {'class':'articleSpanImage'})
-				inlineImgs = soup.findAll(True, {'class':'inlineImage module'})
-				if largeImg:
-					for inlineImg in inlineImgs:
-						inlineImg.extract()
-				else:
-					if inlineImgs:
-						firstImg = inlineImgs[0]
-						for inlineImg in inlineImgs[1:]:
-							inlineImg.extract()
-						# Move firstImg before article body
-						cgFirst = soup.find(True, {'class':re.compile('columnGroup  *first')})
-						if cgFirst:
-							# Strip all sibling NavigableStrings: noise
-							navstrings = cgFirst.findAll(text=True, recursive=False)
-							[ns.extract() for ns in navstrings]
-							headline_found = False
-							tag = cgFirst.find(True)
-							insertLoc = 0
-							while True:
-								insertLoc += 1
-								if hasattr(tag,'class') and tag['class'] == 'articleHeadline':
-										headline_found = True
-										break
-								tag = tag.nextSibling
-								if not tag:
-									headline_found = False
-									break
-							if headline_found:
-								cgFirst.insert(insertLoc,firstImg)
-						else:
-							self.log(">>> No class:'columnGroup first' found <<<")
-		except:
-			self.log("ERROR: One picture per article in postprocess_html")

-		try:
-			# Change captions to italic
-			for caption in soup.findAll(True, {'class':'caption'}) :
-				if caption and len(caption) > 0:
-					cTag = Tag(soup, "p", [("class", "caption")])
-					c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
-					mp_off = c.find("More Photos")
-					if mp_off >= 0:
-						c = c[:mp_off]
-					cTag.insert(0, c)
-					caption.replaceWith(cTag)
-		except:
-			self.log("ERROR:  Problem in change captions to italic")
+        try:
+                if self.one_picture_per_article:
+                        # Remove all images after first
+                        largeImg = soup.find(True, {'class':'articleSpanImage'})
+                        inlineImgs = soup.findAll(True, {'class':'inlineImage module'})
+                        if largeImg:
+                                for inlineImg in inlineImgs:
+                                        inlineImg.extract()
+                        else:
+                                if inlineImgs:
+                                        firstImg = inlineImgs[0]
+                                        for inlineImg in inlineImgs[1:]:
+                                                inlineImg.extract()
+                                        # Move firstImg before article body
+                                        cgFirst = soup.find(True, {'class':re.compile('columnGroup  *first')})
+                                        if cgFirst:
+                                                # Strip all sibling NavigableStrings: noise
+                                                navstrings = cgFirst.findAll(text=True, recursive=False)
+                                                [ns.extract() for ns in navstrings]
+                                                headline_found = False
+                                                tag = cgFirst.find(True)
+                                                insertLoc = 0
+                                                while True:
+                                                        insertLoc += 1
+                                                        if hasattr(tag,'class') and tag['class'] == 'articleHeadline':
+                                                                        headline_found = True
+                                                                        break
+                                                        tag = tag.nextSibling
+                                                        if not tag:
+                                                                headline_found = False
+                                                                break
+                                                if headline_found:
+                                                        cgFirst.insert(insertLoc,firstImg)
+                                        else:
+                                                self.log(">>> No class:'columnGroup first' found <<<")
+        except:
+                self.log("ERROR: One picture per article in postprocess_html")

-		try:
-			# Change <nyt_headline> to <h2>
-			h1 = soup.find('h1')
-			if h1:
-				headline = h1.find("nyt_headline")
-				if headline:
-					tag = Tag(soup, "h2")
-					tag['class'] = "headline"
-					tag.insert(0, self.fixChars(headline.contents[0]))
-					h1.replaceWith(tag)
-			else:
-				# Blog entry - replace headline, remove <hr> tags
-				headline = soup.find('title')
-				if headline:
-					tag = Tag(soup, "h2")
-					tag['class'] = "headline"
-					tag.insert(0, self.fixChars(headline.contents[0]))
-					soup.insert(0, tag)
-					hrs = soup.findAll('hr')
-					for hr in hrs:
-						hr.extract()
-		except:
-			self.log("ERROR:  Problem in Change <nyt_headline> to <h2>")
+        try:
+                # Change captions to italic
+                for caption in soup.findAll(True, {'class':'caption'}) :
+                        if caption and len(caption) > 0:
+                                cTag = Tag(soup, "p", [("class", "caption")])
+                                c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
+                                mp_off = c.find("More Photos")
+                                if mp_off >= 0:
+                                        c = c[:mp_off]
+                                cTag.insert(0, c)
+                                caption.replaceWith(cTag)
+        except:
+                self.log("ERROR:  Problem in change captions to italic")

-		try:
-			# Change <h1> to <h3> - used in editorial blogs
-			masthead = soup.find("h1")
-			if masthead:
-				# Nuke the href
-				if masthead.a:
-					del(masthead.a['href'])
-				tag = Tag(soup, "h3")
-				tag.insert(0, self.fixChars(masthead.contents[0]))
-				masthead.replaceWith(tag)
-		except:
-			self.log("ERROR:  Problem in Change <h1> to <h3> - used in editorial blogs")
+        try:
+                # Change <nyt_headline> to <h2>
+                h1 = soup.find('h1')
+                blogheadline = str(h1) #added for dealbook
+                if h1:
+                        headline = h1.find("nyt_headline")
+                        if headline:
+                                tag = Tag(soup, "h2")
+                                tag['class'] = "headline"
+                                tag.insert(0, self.fixChars(headline.contents[0]))
+                                h1.replaceWith(tag)
+                        elif blogheadline.find('entry-title'):#added for dealbook
+                                tag = Tag(soup, "h2")#added for dealbook
+                                tag['class'] = "headline"#added for dealbook
+                                tag.insert(0, self.fixChars(h1.contents[0]))#added for dealbook
+                                h1.replaceWith(tag)#added for dealbook

-		try:
-			# Change <span class="bold"> to <b>
-			for subhead in soup.findAll(True, {'class':'bold'}) :
-				if subhead.contents:
-					bTag = Tag(soup, "b")
-					bTag.insert(0, subhead.contents[0])
-					subhead.replaceWith(bTag)
-		except:
-			self.log("ERROR:  Problem in Change <h1> to <h3> - used in editorial blogs")
+                else:
+                        # Blog entry - replace headline, remove <hr> tags  - BCC I think this is no longer functional 1-18-2011
+                        headline = soup.find('title')
+                        if headline:
+                                tag = Tag(soup, "h2")
+                                tag['class'] = "headline"
+                                tag.insert(0, self.fixChars(headline.renderContents()))
+                                soup.insert(0, tag)
+                                hrs = soup.findAll('hr')
+                                for hr in hrs:
+                                        hr.extract()
+        except:
+                self.log("ERROR:  Problem in Change <nyt_headline> to <h2>")

-		try:
-			divTag = soup.find('div',attrs={'id':'articleBody'})
-			if divTag:
-				divTag['class'] = divTag['id']
-		except:
-			self.log("ERROR:  Problem in soup.find(div,attrs={id:articleBody})")
+        try:
+                # if this is from a blog (dealbook), fix the byline format
+                bylineauthor = soup.find('address',attrs={'class':'byline author vcard'})
+                if bylineauthor:
+                    tag = Tag(soup, "h6")
+                    tag['class'] = "byline"
+                    tag.insert(0, self.fixChars(bylineauthor.renderContents()))
+                    bylineauthor.replaceWith(tag)
+        except:
+            self.log("ERROR:  fixing byline author format")

-		try:
-			# Add class="authorId" to <div> so we can format with CSS
-			divTag = soup.find('div',attrs={'id':'authorId'})
-			if divTag and divTag.contents[0]:
-				tag = Tag(soup, "p")
-				tag['class'] = "authorId"
-				tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
-								 use_alt=False)))
-				divTag.replaceWith(tag)
-		except:
-			self.log("ERROR:  Problem in Add class=authorId to <div> so we can format with CSS")
+        try:
+                #if this is a blog (dealbook) fix the credit style for the pictures
+                blogcredit = soup.find('div',attrs={'class':'credit'})
+                if blogcredit:
+                    tag = Tag(soup, "h6")
+                    tag['class'] = "credit"
+                    tag.insert(0, self.fixChars(blogcredit.renderContents()))
+                    blogcredit.replaceWith(tag)
+        except:
+            self.log("ERROR:  fixing credit format")

-		return soup
+
+        try:
+                # Change <h1> to <h3> - used in editorial blogs
+                masthead = soup.find("h1")
+                if masthead:
+                        # Nuke the href
+                        if masthead.a:
+                                del(masthead.a['href'])
+                        tag = Tag(soup, "h3")
+                        tag.insert(0, self.fixChars(masthead.contents[0]))
+                        masthead.replaceWith(tag)
+        except:
+                self.log("ERROR:  Problem in Change <h1> to <h3> - used in editorial blogs")
+
+        try:
+                # Change <span class="bold"> to <b>
+                for subhead in soup.findAll(True, {'class':'bold'}) :
+                        if subhead.contents:
+                                bTag = Tag(soup, "b")
+                                bTag.insert(0, subhead.contents[0])
+                                subhead.replaceWith(bTag)
+        except:
+                self.log("ERROR:  Problem in Change <h1> to <h3> - used in editorial blogs")
+        try:
+                # remove the "update" marker (<span class="update">)
+                blogupdated = soup.find('span', {'class':'update'})
+                if blogupdated:
+                    blogupdated.replaceWith("")
+        except:
+                self.log("ERROR:  Removing strong tag")
+
+        try:
+                divTag = soup.find('div',attrs={'id':'articleBody'})
+                if divTag:
+                        divTag['class'] = divTag['id']
+        except:
+                self.log("ERROR:  Problem in soup.find(div,attrs={id:articleBody})")
+
+        try:
+                # Add class="authorId" to <div> so we can format with CSS
+                divTag = soup.find('div',attrs={'id':'authorId'})
+                if divTag and divTag.contents[0]:
+                        tag = Tag(soup, "p")
+                        tag['class'] = "authorId"
+                        tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
+                                                         use_alt=False)))
+                        divTag.replaceWith(tag)
+        except:
+                self.log("ERROR:  Problem in Add class=authorId to <div> so we can format with CSS")
+
+        return soup
    def populate_article_metadata(self, article, soup, first):
        shortparagraph = ""
        try: