From 3fd23ceadd806df64930c4799f571865ebd8359f Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sun, 17 Mar 2013 09:39:22 +0530
Subject: [PATCH] Update NYTimes recipe

---
 recipes/nytimes.recipe     | 144 +++++++++++++++++++------------------
 recipes/nytimes_sub.recipe | 142 ++++++++++++++++++------------------
 2 files changed, 149 insertions(+), 137 deletions(-)

diff --git a/recipes/nytimes.recipe b/recipes/nytimes.recipe
index d0f311818e..c4a4b3cee5 100644
--- a/recipes/nytimes.recipe
+++ b/recipes/nytimes.recipe
@@ -41,7 +41,7 @@ class NYTimes(BasicNewsRecipe):
     # number of days old an article can be for inclusion. If oldest_web_article = None all articles
     # will be included. Note: oldest_web_article is ignored if webEdition = False
     webEdition = False
-    oldest_web_article = 7
+    oldest_web_article = None
 
     # download higher resolution images than the small thumbnails typically included in the article
     # the down side of having large beautiful images is the file size is much larger, on the order of 7MB per paper
@@ -188,6 +188,8 @@ class NYTimes(BasicNewsRecipe):
                     'relatedSearchesModule',
                     'side_tool',
                     'singleAd',
+                    'postCategory column',
+                    'refer tagRefer', # added for bits blog post
                     'entry entry-utility', #added for DealBook
                     'entry-tags', #added for DealBook
                     'footer promos clearfix', #added for DealBook
@@ -324,6 +326,8 @@ class NYTimes(BasicNewsRecipe):
             return True
         if '/video/' in url:
             return True
+        if '/multimedia/' in url:
+            return True
         if '/slideshow/' in url:
             return True
         if '/magazine/index' in url:
@@ -334,6 +338,15 @@ class NYTimes(BasicNewsRecipe):
             return True
         if '/premium/' in url:
             return True
+        if '#comment' in url:
+            return True
+        if '#postComment' in url:
+            return True
+        if '#postcomment' in url:
+            return True
+        if re.search('/\d\d\d\d/\d\d/\d\d/',url) is None:
+            print("NO DATE IN "+url)
+            return True
         return False
 
     def fixChars(self,string):
@@ -363,6 +376,7 @@ class NYTimes(BasicNewsRecipe):
     cover_tag = 'NY_NYT'
 
     def get_cover_url(self):
+        from datetime import timedelta, date
         cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.cover_tag+'.jpg'
         br = BasicNewsRecipe.get_browser(self)
         daysback=1
@@ -385,7 +399,6 @@ class NYTimes(BasicNewsRecipe):
 
     masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
 
-
     def short_title(self):
         return self.title
 
@@ -647,75 +660,53 @@ class NYTimes(BasicNewsRecipe):
 
         soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')
 
-        # Fetch the content table
-        content_table = soup.find('table',{'id':'content'})
-        if content_table is None:
-            self.log("FATAL ERROR: CANNOT FIND CONTENT TABLE")
-            return None
-
-        # Within this table are entries, each containing one or more h6 tags which represent sections
-
-        for td_col in content_table.findAll('td', {'id' : re.compile('Column')}):
-            for div_sec in td_col.findAll('div',recursive=False):
-                for h6_sec_name in div_sec.findAll('h6',{'style' : re.compile('text-transform: *uppercase')}):
-
-                    section_name = self.tag_to_string(h6_sec_name,use_alt=False)
-                    section_name = re.sub(r'^ *$','',section_name)
-
-                    if section_name == '':
+        section_name='Unknown Section'
+        pubdate = strftime('%a, %d %b')
+        for td_col in soup.findAll('td'):
+            h6_sec_name = td_col.find('h6')
+            if h6_sec_name is not None:
+                new_section_name = self.tag_to_string(h6_sec_name,use_alt=False)
+                new_section_name = re.sub(r'^ *$','',new_section_name)
+                if new_section_name == '':
+                    continue
+                section_name = new_section_name
+                continue
+            atag = td_col.find('a')
+            if atag is not None:
+                h4tag = None
+                for h4tag in atag.findNextSiblings('h4'):
+                    break
+                if h4tag is None:
+                    continue
+                author = self.tag_to_string(h4tag,use_alt=False)
+                try:
+                    url = re.sub(r'\?.*', '', atag['href'])
+                except:
+                    continue
+                if self.exclude_url(url):
+                    continue
+                if '?' in url:
+                    url += '&pagewanted=all'
+                else:
+                    url += '?pagewanted=all'
+                if self.filterDuplicates:
+                    if url in self.url_list:
                         continue
-                    if self.includeSections != []:
-                        if section_name not in self.includeSections:
-                            print "SECTION NOT INCLUDED: ",section_name
-                            continue
-                    if section_name in self.excludeSections:
-                        print "SECTION EXCLUDED: ",section_name
-                        continue
-
-                    section_name=string.capwords(section_name)
-                    section_name = section_name.replace('Op-ed','Op-Ed')
-                    section_name = section_name.replace('U.s.','U.S.')
-                    section_name = section_name.replace('N.y.','N.Y.')
-                    pubdate = strftime('%a, %d %b')
-
-                    search_div = div_sec
-                    for next_tag in h6_sec_name.findNextSiblings(True):
-                        if next_tag.__class__.__name__ == 'Tag':
-                            if next_tag.name == 'div':
-                                search_div = next_tag
-                            break
-
-                    # Get the articles
-                    for h3_item in search_div.findAll('h3'):
-                        byline = h3_item.h6
-                        if byline is not None:
-                            author = self.tag_to_string(byline,use_alt=False)
-                        else:
-                            author = ''
-                        a = h3_item.find('a', href=True)
-                        if not a:
-                            continue
-                        url = re.sub(r'\?.*', '', a['href'])
-                        if self.exclude_url(url):
-                            continue
-                        url += '?pagewanted=all'
-                        if self.filterDuplicates:
-                            if url in self.url_list:
-                                continue
-                        self.url_list.append(url)
-                        title = self.tag_to_string(a, use_alt=True).strip()
-                        desc = h3_item.find('p')
-                        if desc is not None:
-                            description = self.tag_to_string(desc,use_alt=False)
-                        else:
-                            description = ''
-                        if not self.articles.has_key(section_name):
-                            self.ans.append(section_name)
-                            self.articles[section_name] = []
-                        self.articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content=''))
+                self.url_list.append(url)
+                title = self.tag_to_string(atag, use_alt=False).strip()
+                desc = atag.parent.find('p')
+                if desc is not None:
+                    description = self.tag_to_string(desc,use_alt=False)
+                else:
+                    description = ''
+                if not self.articles.has_key(section_name):
+                    self.ans.append(section_name)
+                    self.articles[section_name] = []
+                print('Title '+title+' author '+author)
+                self.articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content=''))
 
         self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
-        return self.filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans)))
+        return self.filter_ans(self.ans)
 
     def parse_index(self):
         if self.headlinesOnly:
@@ -825,8 +816,9 @@ class NYTimes(BasicNewsRecipe):
         for divr in soup.findAll('div',attrs={'class':re.compile('w190 right')}):
             if divr.find(text=re.compile('Sign up')):
                 divr.extract()
-        divr = soup.find('div',attrs={'id':re.compile('related-content')})
+        divr = soup.find('div',attrs={'class':re.compile('^relatedArticlesModule')})
         if divr is not None:
+            print("PROCESSING RELATED: "+self.tag_to_string(soup.title,False))
             # handle related articles
             rlist = []
             ul = divr.find('ul')
@@ -856,6 +848,8 @@ class NYTimes(BasicNewsRecipe):
                 asidediv.append(Tag(soup,'hr'))
                 smain = soup.find('body')
                 smain.append(asidediv)
+        else:
+            print("CANNOT FIND RELATED: "+self.tag_to_string(soup.title,False))
         for atag in soup.findAll('a'):
             img = atag.find('img')
             if img is not None:
@@ -898,6 +892,18 @@ class NYTimes(BasicNewsRecipe):
                             first_outer = outerdiv
                     else:
                         litag.extract()
+            for h6tag in rdiv.findAll('h6'):
+                if h6tag.find('a') is not None:
+                    if h6tag.find('a')['href'].startswith('http://www.nytimes.com'):
+                        url = re.sub(r'\?.*', '', h6tag.find('a')['href'])
+                        h6tag.find('a')['href'] = url+'?pagewanted=all'
+                        h6tag.extract()
+                        related.append(h6tag)
+                        if first_related is None:
+                            first_related = rdiv
+                            first_outer = outerdiv
+                    else:
+                        h6tag.extract()
         if related != []:
             for r in related:
                 if r.h6: # don't want the anchor inside a h6 tag
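The change shared by both recipes that most affects article selection is the new date check in exclude_url(): any link whose path lacks a /YYYY/MM/DD/ segment, along with comment anchors, is skipped. A minimal standalone sketch of that logic; the helper name is hypothetical (the recipes inline these checks in exclude_url):

    import re

    # NYT article URLs carry a /YYYY/MM/DD/ path segment; section fronts
    # and comment anchors do not.
    DATE_IN_PATH = re.compile(r'/\d\d\d\d/\d\d/\d\d/')

    def looks_like_article(url):
        # Hypothetical helper mirroring the patch's new exclude_url checks:
        # reject comment anchors first, then anything without a dated path.
        for marker in ('#comment', '#postComment', '#postcomment'):
            if marker in url:
                return False
        return DATE_IN_PATH.search(url) is not None

    print(looks_like_article('http://www.nytimes.com/2013/03/17/world/example.html'))  # True
    print(looks_like_article('http://www.nytimes.com/pages/todaysheadlines/'))         # False

The same checks appear verbatim in the second file below, since both recipes share the exclude_url implementation.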
diff --git a/recipes/nytimes_sub.recipe b/recipes/nytimes_sub.recipe
index 06c476ef19..2dba2d505d 100644
--- a/recipes/nytimes_sub.recipe
+++ b/recipes/nytimes_sub.recipe
@@ -188,6 +188,8 @@ class NYTimes(BasicNewsRecipe):
                     'relatedSearchesModule',
                     'side_tool',
                     'singleAd',
+                    'postCategory column',
+                    'refer tagRefer', # added for bits blog post
                     'entry entry-utility', #added for DealBook
                     'entry-tags', #added for DealBook
                     'footer promos clearfix', #added for DealBook
@@ -324,6 +326,8 @@ class NYTimes(BasicNewsRecipe):
             return True
         if '/video/' in url:
             return True
+        if '/multimedia/' in url:
+            return True
         if '/slideshow/' in url:
             return True
         if '/magazine/index' in url:
@@ -334,6 +338,15 @@ class NYTimes(BasicNewsRecipe):
             return True
         if '/premium/' in url:
             return True
+        if '#comment' in url:
+            return True
+        if '#postComment' in url:
+            return True
+        if '#postcomment' in url:
+            return True
+        if re.search('/\d\d\d\d/\d\d/\d\d/',url) is None:
+            print("NO DATE IN "+url)
+            return True
         return False
 
     def fixChars(self,string):
@@ -371,6 +384,7 @@ class NYTimes(BasicNewsRecipe):
     cover_tag = 'NY_NYT'
 
     def get_cover_url(self):
+        from datetime import timedelta, date
         cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.cover_tag+'.jpg'
         br = BasicNewsRecipe.get_browser(self)
         daysback=1
@@ -393,7 +407,6 @@ class NYTimes(BasicNewsRecipe):
 
     masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
 
-
    def short_title(self):
         return self.title
 
@@ -655,75 +668,53 @@ class NYTimes(BasicNewsRecipe):
 
         soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')
 
-        # Fetch the content table
-        content_table = soup.find('table',{'id':'content'})
-        if content_table is None:
-            self.log("FATAL ERROR: CANNOT FIND CONTENT TABLE")
-            return None
-
-        # Within this table are entries, each containing one or more h6 tags which represent sections
-
-        for td_col in content_table.findAll('td', {'id' : re.compile('Column')}):
-            for div_sec in td_col.findAll('div',recursive=False):
-                for h6_sec_name in div_sec.findAll('h6',{'style' : re.compile('text-transform: *uppercase')}):
-
-                    section_name = self.tag_to_string(h6_sec_name,use_alt=False)
-                    section_name = re.sub(r'^ *$','',section_name)
-
-                    if section_name == '':
+        section_name='Unknown Section'
+        pubdate = strftime('%a, %d %b')
+        for td_col in soup.findAll('td'):
+            h6_sec_name = td_col.find('h6')
+            if h6_sec_name is not None:
+                new_section_name = self.tag_to_string(h6_sec_name,use_alt=False)
+                new_section_name = re.sub(r'^ *$','',new_section_name)
+                if new_section_name == '':
+                    continue
+                section_name = new_section_name
+                continue
+            atag = td_col.find('a')
+            if atag is not None:
+                h4tag = None
+                for h4tag in atag.findNextSiblings('h4'):
+                    break
+                if h4tag is None:
+                    continue
+                author = self.tag_to_string(h4tag,use_alt=False)
+                try:
+                    url = re.sub(r'\?.*', '', atag['href'])
+                except:
+                    continue
+                if self.exclude_url(url):
+                    continue
+                if '?' in url:
+                    url += '&pagewanted=all'
+                else:
+                    url += '?pagewanted=all'
+                if self.filterDuplicates:
+                    if url in self.url_list:
                         continue
-                    if self.includeSections != []:
-                        if section_name not in self.includeSections:
-                            print "SECTION NOT INCLUDED: ",section_name
-                            continue
-                    if section_name in self.excludeSections:
-                        print "SECTION EXCLUDED: ",section_name
-                        continue
-
-                    section_name=string.capwords(section_name)
-                    section_name = section_name.replace('Op-ed','Op-Ed')
-                    section_name = section_name.replace('U.s.','U.S.')
-                    section_name = section_name.replace('N.y.','N.Y.')
-                    pubdate = strftime('%a, %d %b')
-
-                    search_div = div_sec
-                    for next_tag in h6_sec_name.findNextSiblings(True):
-                        if next_tag.__class__.__name__ == 'Tag':
-                            if next_tag.name == 'div':
-                                search_div = next_tag
-                            break
-
-                    # Get the articles
-                    for h3_item in search_div.findAll('h3'):
-                        byline = h3_item.h6
-                        if byline is not None:
-                            author = self.tag_to_string(byline,use_alt=False)
-                        else:
-                            author = ''
-                        a = h3_item.find('a', href=True)
-                        if not a:
-                            continue
-                        url = re.sub(r'\?.*', '', a['href'])
-                        if self.exclude_url(url):
-                            continue
-                        url += '?pagewanted=all'
-                        if self.filterDuplicates:
-                            if url in self.url_list:
-                                continue
-                        self.url_list.append(url)
-                        title = self.tag_to_string(a, use_alt=True).strip()
-                        desc = h3_item.find('p')
-                        if desc is not None:
-                            description = self.tag_to_string(desc,use_alt=False)
-                        else:
-                            description = ''
-                        if not self.articles.has_key(section_name):
-                            self.ans.append(section_name)
-                            self.articles[section_name] = []
-                        self.articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content=''))
+                self.url_list.append(url)
+                title = self.tag_to_string(atag, use_alt=False).strip()
+                desc = atag.parent.find('p')
+                if desc is not None:
+                    description = self.tag_to_string(desc,use_alt=False)
+                else:
+                    description = ''
+                if not self.articles.has_key(section_name):
+                    self.ans.append(section_name)
+                    self.articles[section_name] = []
+                print('Title '+title+' author '+author)
+                self.articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content=''))
 
         self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
-        return self.filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans)))
+        return self.filter_ans(self.ans)
 
     def parse_index(self):
         if self.headlinesOnly:
@@ -833,8 +824,9 @@ class NYTimes(BasicNewsRecipe):
         for divr in soup.findAll('div',attrs={'class':re.compile('w190 right')}):
             if divr.find(text=re.compile('Sign up')):
                 divr.extract()
-        divr = soup.find('div',attrs={'id':re.compile('related-content')})
+        divr = soup.find('div',attrs={'class':re.compile('^relatedArticlesModule')})
         if divr is not None:
+            print("PROCESSING RELATED: "+self.tag_to_string(soup.title,False))
             # handle related articles
             rlist = []
             ul = divr.find('ul')
@@ -864,6 +856,8 @@ class NYTimes(BasicNewsRecipe):
                 asidediv.append(Tag(soup,'hr'))
                 smain = soup.find('body')
                 smain.append(asidediv)
+        else:
+            print("CANNOT FIND RELATED: "+self.tag_to_string(soup.title,False))
         for atag in soup.findAll('a'):
             img = atag.find('img')
             if img is not None:
@@ -906,6 +900,18 @@ class NYTimes(BasicNewsRecipe):
                             first_outer = outerdiv
                     else:
                         litag.extract()
+            for h6tag in rdiv.findAll('h6'):
+                if h6tag.find('a') is not None:
+                    if h6tag.find('a')['href'].startswith('http://www.nytimes.com'):
+                        url = re.sub(r'\?.*', '', h6tag.find('a')['href'])
+                        h6tag.find('a')['href'] = url+'?pagewanted=all'
+                        h6tag.extract()
+                        related.append(h6tag)
+                        if first_related is None:
+                            first_related = rdiv
+                            first_outer = outerdiv
+                    else:
+                        h6tag.extract()
         if related != []:
             for r in related:
                 if r.h6: # don't want the anchor inside a h6 tag
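The other change shared by both recipes is the rewritten parse_todays_headlines loop: instead of walking a table with id="content" (and failing hard when it is absent), the recipes now scan every <td> on the headlines page, treating a cell that contains an <h6> as a section heading and a cell whose <a> is followed by an <h4> byline as an article. A sketch of that scan against simplified, assumed markup, using BeautifulSoup 3 as calibre recipes did at the time; the real page is messier, and the recipes additionally dedupe URLs and append ?pagewanted=all:

    from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3

    html = '''<table>
    <tr><td><h6>World</h6></td></tr>
    <tr><td><a href="http://www.nytimes.com/2013/03/17/world/example.html">Example headline</a>
    <h4>By A REPORTER</h4></td></tr>
    </table>'''

    soup = BeautifulSoup(html)
    section_name = 'Unknown Section'
    for td in soup.findAll('td'):
        h6 = td.find('h6')
        if h6 is not None:
            # a <td> holding an <h6> introduces a new section
            section_name = h6.string.strip()
            continue
        atag = td.find('a')
        if atag is not None:
            # the byline, when present, is an <h4> sibling following the link
            h4tag = atag.findNextSibling('h4')
            if h4tag is not None:
                print(section_name + ': ' + atag.string.strip() + ' (' + h4tag.string + ')')

This also shows why the loop carries a running section_name across iterations: section headings and article links arrive in separate <td> cells, so the state has to persist from one cell to the next.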