Updated NY Times

2025-07-09 03:04:10 -04:00 · 2011-01-18 13:49:12 -07:00 · 2011-01-18 13:49:12 -07:00 · ebda738c81
commit ebda738c81
parent 54fb874621
1 changed file with 161 additions and 114 deletions
--- a/resources/recipes/nytimes_sub.recipe
+++ b/resources/recipes/nytimes_sub.recipe
@@ -159,6 +159,11 @@ class NYTimes(BasicNewsRecipe):
                            'relatedSearchesModule',
                            'side_tool',
                            'singleAd',
+                            'entry entry-utility', #added for DealBook
+                            'entry-tags', #added for DealBook
+                            'footer promos clearfix', #added for DealBook
+                            'footer links clearfix', #added for DealBook
+                            'inlineImage module', #added for DealBook
                            re.compile('^subNavigation'),
                            re.compile('^leaderboard'),
                            re.compile('^module'),
@@ -192,6 +197,9 @@ class NYTimes(BasicNewsRecipe):
                            'side_index',
                            'side_tool',
                            'toolsRight',
+                            'skybox', #added for DealBook
+                            'TopAd', #added for DealBook
+                            'related-content', #added for DealBook
                            ]),
                   dict(name=['script', 'noscript', 'style','form','hr'])]
    no_stylesheets = True
@@ -246,7 +254,7 @@ class NYTimes(BasicNewsRecipe):
    def exclude_url(self,url):
        if not url.startswith("http"):
            return True
-        if not url.endswith(".html"):
+        if not url.endswith(".html") and 'dealbook.nytimes.com' not in url: #added for DealBook
            return True
        if 'nytimes.com' not in url:
            return True
@@ -569,7 +577,6 @@ class NYTimes(BasicNewsRecipe):


    def preprocess_html(self, soup):
-
        if self.webEdition & (self.oldest_article>0):
            date_tag = soup.find(True,attrs={'class': ['dateline','date']})
            if date_tag:
@@ -592,9 +599,12 @@ class NYTimes(BasicNewsRecipe):
                img_div = soup.find('div','inlineImage module')
                if img_div:
                    img_div.extract()
+
+
        return self.strip_anchors(soup)

    def postprocess_html(self,soup, True):
+
        try:
                if self.one_picture_per_article:
                        # Remove all images after first
@@ -650,6 +660,7 @@ class NYTimes(BasicNewsRecipe):
        try:
                # Change <nyt_headline> to <h2>
                h1 = soup.find('h1')
+                blogheadline = str(h1) #added for dealbook
                if h1:
                        headline = h1.find("nyt_headline")
                        if headline:
@@ -657,13 +668,19 @@ class NYTimes(BasicNewsRecipe):
                                tag['class'] = "headline"
                                tag.insert(0, self.fixChars(headline.contents[0]))
                                h1.replaceWith(tag)
+                        elif blogheadline.find('entry-title'):#added for dealbook
+                                tag = Tag(soup, "h2")#added for dealbook
+                                tag['class'] = "headline"#added for dealbook
+                                tag.insert(0, self.fixChars(h1.contents[0]))#added for dealbook
+                                h1.replaceWith(tag)#added for dealbook
+
                else:
-				# Blog entry - replace headline, remove <hr> tags
+                        # Blog entry - replace headline, remove <hr> tags  - BCC I think this is no longer functional 1-18-2011
                        headline = soup.find('title')
                        if headline:
                                tag = Tag(soup, "h2")
                                tag['class'] = "headline"
-					tag.insert(0, self.fixChars(headline.contents[0]))
+                                tag.insert(0, self.fixChars(headline.renderContents()))
                                soup.insert(0, tag)
                                hrs = soup.findAll('hr')
                                for hr in hrs:
@@ -671,6 +688,29 @@ class NYTimes(BasicNewsRecipe):
        except:
                self.log("ERROR:  Problem in Change <nyt_headline> to <h2>")

+        try:
+                #if this is from a blog (dealbook, fix the byline format
+                bylineauthor = soup.find('address',attrs={'class':'byline author vcard'})
+                if bylineauthor:
+                    tag = Tag(soup, "h6")
+                    tag['class'] = "byline"
+                    tag.insert(0, self.fixChars(bylineauthor.renderContents()))
+                    bylineauthor.replaceWith(tag)
+        except:
+            self.log("ERROR:  fixing byline author format")
+
+        try:
+                #if this is a blog (dealbook) fix the credit style for the pictures
+                blogcredit = soup.find('div',attrs={'class':'credit'})
+                if blogcredit:
+                    tag = Tag(soup, "h6")
+                    tag['class'] = "credit"
+                    tag.insert(0, self.fixChars(blogcredit.renderContents()))
+                    blogcredit.replaceWith(tag)
+        except:
+            self.log("ERROR:  fixing credit format")
+
+
        try:
                # Change <h1> to <h3> - used in editorial blogs
                masthead = soup.find("h1")
@@ -693,6 +733,13 @@ class NYTimes(BasicNewsRecipe):
                                subhead.replaceWith(bTag)
        except:
                self.log("ERROR:  Problem in Change <h1> to <h3> - used in editorial blogs")
+        try:
+                #remove the <strong> update tag
+                blogupdated = soup.find('span', {'class':'update'})
+                if blogupdated:
+                    blogupdated.replaceWith("")
+        except:
+                self.log("ERROR:  Removing strong tag")

        try:
                divTag = soup.find('div',attrs={'id':'articleBody'})