Mirror of https://github.com/kovidgoyal/calibre.git
Update NYTimes recipe

commit 3fd23ceadd
parent a3ee07a2da
@@ -41,7 +41,7 @@ class NYTimes(BasicNewsRecipe):
    # number of days old an article can be for inclusion. If oldest_web_article = None all articles
    # will be included. Note: oldest_web_article is ignored if webEdition = False
    webEdition = False
    oldest_web_article = 7
    oldest_web_article = None

    # download higher resolution images than the small thumbnails typically included in the article
    # the down side of having large beautiful images is the file size is much larger, on the order of 7MB per paper
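For context, oldest_web_article is the age cutoff applied when webEdition is used; setting it to None disables the cutoff entirely. A minimal sketch of how such a gate behaves — the helper below is illustrative, not the recipe's code:

    from datetime import datetime, timedelta

    def article_is_fresh(pub_date, oldest_web_article):
        # oldest_web_article = None means no age cutoff at all.
        if oldest_web_article is None:
            return True
        cutoff = datetime.now() - timedelta(days=oldest_web_article)
        return pub_date >= cutoff

    print(article_is_fresh(datetime(2010, 1, 1), None))  # True: no cutoff
    print(article_is_fresh(datetime(2010, 1, 1), 7))     # False: older than a week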
@@ -188,6 +188,8 @@ class NYTimes(BasicNewsRecipe):
    'relatedSearchesModule',
    'side_tool',
    'singleAd',
    'postCategory column',
    'refer tagRefer', # added for bits blog post
    'entry entry-utility', #added for DealBook
    'entry-tags', #added for DealBook
    'footer promos clearfix', #added for DealBook
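These strings are entries in the recipe's CSS-class blacklist, so matching page furniture is stripped from every fetched article. A sketch of how such a list is typically wired into a calibre recipe; the attribute layout below is an assumption, since the wiring lies outside this hunk:

    # Assumed wiring, not shown in this diff: calibre recipes commonly pass
    # class-name lists to remove_tags as BeautifulSoup attribute matchers.
    remove_classes = [
        'relatedSearchesModule',
        'side_tool',
        'singleAd',
        'postCategory column',
        'refer tagRefer',
    ]
    remove_tags = [dict(attrs={'class': remove_classes})]
    print(remove_tags)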
@@ -324,6 +326,8 @@ class NYTimes(BasicNewsRecipe):
            return True
        if '/video/' in url:
            return True
        if '/multimedia/' in url:
            return True
        if '/slideshow/' in url:
            return True
        if '/magazine/index' in url:
@@ -334,6 +338,15 @@ class NYTimes(BasicNewsRecipe):
            return True
        if '/premium/' in url:
            return True
        if '#comment' in url:
            return True
        if '#postComment' in url:
            return True
        if '#postcomment' in url:
            return True
        if re.search('/\d\d\d\d/\d\d/\d\d/',url) is None:
            print("NO DATE IN "+url)
            return True
        return False

    def fixChars(self,string):
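exclude_url rejects multimedia pages, comment anchors, and anything whose path lacks a /YYYY/MM/DD/ date component. The same logic as a standalone predicate — a simplified stand-in using the hunk's regex, not the recipe's full check list:

    import re

    def looks_like_article(url):
        # NYT article URLs embed the publication date as /YYYY/MM/DD/.
        if any(p in url for p in ('/video/', '/multimedia/', '/slideshow/', '/premium/')):
            return False
        if re.search(r'/\d\d\d\d/\d\d/\d\d/', url) is None:
            return False
        return True

    print(looks_like_article('http://www.nytimes.com/2013/01/15/world/example.html'))  # True
    print(looks_like_article('http://www.nytimes.com/pages/world/'))                   # False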
@@ -363,6 +376,7 @@ class NYTimes(BasicNewsRecipe):

    cover_tag = 'NY_NYT'
    def get_cover_url(self):
        from datetime import timedelta, date
        cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.cover_tag+'.jpg'
        br = BasicNewsRecipe.get_browser(self)
        daysback=1
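get_cover_url asks the Newseum front-page archive for today's NYT cover, keyed by day-of-month and cover_tag, and the daysback counter that follows retries earlier days when today's image is not up yet. A sketch of just the URL construction; only the URL pattern is taken from the hunk, the helper itself is invented:

    from datetime import date, timedelta

    cover_tag = 'NY_NYT'

    def cover_url_for(day):
        # The day-of-month selects the Newseum directory, as in the hunk.
        return ('http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'
                + str(day.day) + '/lg/' + cover_tag + '.jpg')

    today = date.today()
    print(cover_url_for(today))                      # today's cover
    print(cover_url_for(today - timedelta(days=1)))  # fallback, one day back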
@@ -385,7 +399,6 @@ class NYTimes(BasicNewsRecipe):

    masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'


    def short_title(self):
        return self.title

@@ -647,64 +660,41 @@ class NYTimes(BasicNewsRecipe):

        soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')

        # Fetch the content table
        content_table = soup.find('table',{'id':'content'})
        if content_table is None:
            self.log("FATAL ERROR: CANNOT FIND CONTENT TABLE")
            return None

        # Within this table are <td id=".*Column.*"> entries, each containing one or more h6 tags which represent sections

        for td_col in content_table.findAll('td', {'id' : re.compile('Column')}):
            for div_sec in td_col.findAll('div',recursive=False):
                for h6_sec_name in div_sec.findAll('h6',{'style' : re.compile('text-transform: *uppercase')}):

                    section_name = self.tag_to_string(h6_sec_name,use_alt=False)
                    section_name = re.sub(r'^ *$','',section_name)

                    if section_name == '':
                        continue
                    if self.includeSections != []:
                        if section_name not in self.includeSections:
                            print "SECTION NOT INCLUDED: ",section_name
                            continue
                    if section_name in self.excludeSections:
                        print "SECTION EXCLUDED: ",section_name
                        continue

                    section_name=string.capwords(section_name)
                    section_name = section_name.replace('Op-ed','Op-Ed')
                    section_name = section_name.replace('U.s.','U.S.')
                    section_name = section_name.replace('N.y.','N.Y.')
        section_name='Unknown Section'
        pubdate = strftime('%a, %d %b')

        search_div = div_sec
        for next_tag in h6_sec_name.findNextSiblings(True):
            if next_tag.__class__.__name__ == 'Tag':
                if next_tag.name == 'div':
                    search_div = next_tag
                break

        # Get the articles
        for h3_item in search_div.findAll('h3'):
            byline = h3_item.h6
            if byline is not None:
                author = self.tag_to_string(byline,use_alt=False)
            else:
                author = ''
            a = h3_item.find('a', href=True)
            if not a:
        for td_col in soup.findAll('td'):
            h6_sec_name = td_col.find('h6')
            if h6_sec_name is not None:
                new_section_name = self.tag_to_string(h6_sec_name,use_alt=False)
                new_section_name = re.sub(r'^ *$','',new_section_name)
                if new_section_name == '':
                    continue
                section_name = new_section_name
                continue
            atag = td_col.find('a')
            if atag is not None:
                h4tag = None
                for h4tag in atag.findNextSiblings('h4'):
                    break
                if h4tag is None:
                    continue
                author = self.tag_to_string(h4tag,use_alt=False)
                try:
                    url = re.sub(r'\?.*', '', atag['href'])
                except:
                    continue
                url = re.sub(r'\?.*', '', a['href'])
                if self.exclude_url(url):
                    continue
                if '?' in url:
                    url += '&pagewanted=all'
                else:
                    url += '?pagewanted=all'
                if self.filterDuplicates:
                    if url in self.url_list:
                        continue
                self.url_list.append(url)
                title = self.tag_to_string(a, use_alt=True).strip()
                desc = h3_item.find('p')
                title = self.tag_to_string(atag, use_alt=False).strip()
                desc = atag.parent.find('p')
                if desc is not None:
                    description = self.tag_to_string(desc,use_alt=False)
                else:
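The rewritten index walk scans every td cell on the headlines page in document order: an h6 updates the current section name, while an anchor followed by an h4 sibling yields an article link plus byline. A toy reconstruction of that traversal over invented HTML; bs4 stands in for calibre's bundled parser, and the markup is made up for the demo:

    import re
    from bs4 import BeautifulSoup  # assumes the bs4 package is available

    html = '''<table>
    <tr><td><h6>WORLD</h6></td></tr>
    <tr><td><a href="http://www.nytimes.com/2013/01/15/world/example.html?ref=rss">Example headline</a>
    <h4>By A REPORTER</h4></td></tr>
    </table>'''

    soup = BeautifulSoup(html, 'html.parser')
    section = 'Unknown Section'
    for td in soup.find_all('td'):
        h6 = td.find('h6')
        if h6 is not None:
            # An h6 cell names the section for the links that follow.
            name = h6.get_text(strip=True)
            if name:
                section = name
            continue
        a = td.find('a')
        if a is None:
            continue
        # Strip the tracking query and request the single-page view, as the hunk does.
        url = re.sub(r'\?.*', '', a['href']) + '?pagewanted=all'
        h4 = a.find_next_sibling('h4')
        author = h4.get_text(strip=True) if h4 is not None else ''
        print(section, '|', a.get_text(strip=True), '|', author, '|', url)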
@@ -712,10 +702,11 @@ class NYTimes(BasicNewsRecipe):
                if not self.articles.has_key(section_name):
                    self.ans.append(section_name)
                    self.articles[section_name] = []
                print('Title '+title+' author '+author)
                self.articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content=''))

        self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
        return self.filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans)))
        return self.filter_ans(self.ans)

    def parse_index(self):
        if self.headlinesOnly:
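Articles accumulate in self.articles keyed by section while self.ans preserves the order sections were first seen; the index is then the ordered list of (section, articles) pairs. A plain-Python sketch of that assembly — filter_ans and the recipe attributes are deliberately not modeled:

    articles = {}
    ans = []

    def add_article(section_name, article):
        # First sighting of a section records its position; the recipe itself
        # uses Python 2's dict.has_key for this membership test.
        if section_name not in articles:
            ans.append(section_name)
            articles[section_name] = []
        articles[section_name].append(article)

    add_article('World', dict(title='Example', url='http://example.com', author=''))
    feeds = [(k, articles[k]) for k in ans if k in articles]
    print(feeds)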
@@ -825,8 +816,9 @@ class NYTimes(BasicNewsRecipe):
        for divr in soup.findAll('div',attrs={'class':re.compile('w190 right')}):
            if divr.find(text=re.compile('Sign up')):
                divr.extract()
        divr = soup.find('div',attrs={'id':re.compile('related-content')})
        divr = soup.find('div',attrs={'class':re.compile('^relatedArticlesModule')})
        if divr is not None:
            print("PROCESSING RELATED: "+self.tag_to_string(soup.title,False))
            # handle related articles
            rlist = []
            ul = divr.find('ul')
@@ -856,6 +848,8 @@ class NYTimes(BasicNewsRecipe):
                asidediv.append(Tag(soup,'hr'))
                smain = soup.find('body')
                smain.append(asidediv)
        else:
            print("CANNOT FIND RELATED: "+self.tag_to_string(soup.title,False))
        for atag in soup.findAll('a'):
            img = atag.find('img')
            if img is not None:
@@ -898,6 +892,18 @@ class NYTimes(BasicNewsRecipe):
                        first_outer = outerdiv
                    else:
                        litag.extract()
            for h6tag in rdiv.findAll('h6'):
                if h6tag.find('a') is not None:
                    if h6tag.find('a')['href'].startswith('http://www.nytimes.com'):
                        url = re.sub(r'\?.*', '', h6tag.find('a')['href'])
                        h6tag.find('a')['href'] = url+'?pagewanted=all'
                        h6tag.extract()
                        related.append(h6tag)
                        if first_related is None:
                            first_related = rdiv
                            first_outer = outerdiv
                    else:
                        h6tag.extract()
        if related != []:
            for r in related:
                if r.h6: # don't want the anchor inside a h6 tag
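Each related-article h6 link is rewritten so the follow-up fetch grabs the whole article: any query string is dropped and ?pagewanted=all appended, but only for nytimes.com links. The same rewrite as a small helper; the function name is invented:

    import re

    def full_page_url(href):
        # Same rewrite as the hunk: drop the query string, then request all pages.
        if not href.startswith('http://www.nytimes.com'):
            return href
        return re.sub(r'\?.*', '', href) + '?pagewanted=all'

    print(full_page_url('http://www.nytimes.com/2013/01/15/world/example.html?hp&_r=0'))
    # -> http://www.nytimes.com/2013/01/15/world/example.html?pagewanted=all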
@@ -188,6 +188,8 @@ class NYTimes(BasicNewsRecipe):
    'relatedSearchesModule',
    'side_tool',
    'singleAd',
    'postCategory column',
    'refer tagRefer', # added for bits blog post
    'entry entry-utility', #added for DealBook
    'entry-tags', #added for DealBook
    'footer promos clearfix', #added for DealBook
@@ -324,6 +326,8 @@ class NYTimes(BasicNewsRecipe):
            return True
        if '/video/' in url:
            return True
        if '/multimedia/' in url:
            return True
        if '/slideshow/' in url:
            return True
        if '/magazine/index' in url:
@@ -334,6 +338,15 @@ class NYTimes(BasicNewsRecipe):
            return True
        if '/premium/' in url:
            return True
        if '#comment' in url:
            return True
        if '#postComment' in url:
            return True
        if '#postcomment' in url:
            return True
        if re.search('/\d\d\d\d/\d\d/\d\d/',url) is None:
            print("NO DATE IN "+url)
            return True
        return False

    def fixChars(self,string):
@@ -371,6 +384,7 @@ class NYTimes(BasicNewsRecipe):

    cover_tag = 'NY_NYT'
    def get_cover_url(self):
        from datetime import timedelta, date
        cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.cover_tag+'.jpg'
        br = BasicNewsRecipe.get_browser(self)
        daysback=1
@@ -393,7 +407,6 @@ class NYTimes(BasicNewsRecipe):

    masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'


    def short_title(self):
        return self.title

@@ -655,64 +668,41 @@ class NYTimes(BasicNewsRecipe):

        soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')

        # Fetch the content table
        content_table = soup.find('table',{'id':'content'})
        if content_table is None:
            self.log("FATAL ERROR: CANNOT FIND CONTENT TABLE")
            return None

        # Within this table are <td id=".*Column.*"> entries, each containing one or more h6 tags which represent sections

        for td_col in content_table.findAll('td', {'id' : re.compile('Column')}):
            for div_sec in td_col.findAll('div',recursive=False):
                for h6_sec_name in div_sec.findAll('h6',{'style' : re.compile('text-transform: *uppercase')}):

                    section_name = self.tag_to_string(h6_sec_name,use_alt=False)
                    section_name = re.sub(r'^ *$','',section_name)

                    if section_name == '':
                        continue
                    if self.includeSections != []:
                        if section_name not in self.includeSections:
                            print "SECTION NOT INCLUDED: ",section_name
                            continue
                    if section_name in self.excludeSections:
                        print "SECTION EXCLUDED: ",section_name
                        continue

                    section_name=string.capwords(section_name)
                    section_name = section_name.replace('Op-ed','Op-Ed')
                    section_name = section_name.replace('U.s.','U.S.')
                    section_name = section_name.replace('N.y.','N.Y.')
        section_name='Unknown Section'
        pubdate = strftime('%a, %d %b')

        search_div = div_sec
        for next_tag in h6_sec_name.findNextSiblings(True):
            if next_tag.__class__.__name__ == 'Tag':
                if next_tag.name == 'div':
                    search_div = next_tag
                break

        # Get the articles
        for h3_item in search_div.findAll('h3'):
            byline = h3_item.h6
            if byline is not None:
                author = self.tag_to_string(byline,use_alt=False)
            else:
                author = ''
            a = h3_item.find('a', href=True)
            if not a:
        for td_col in soup.findAll('td'):
            h6_sec_name = td_col.find('h6')
            if h6_sec_name is not None:
                new_section_name = self.tag_to_string(h6_sec_name,use_alt=False)
                new_section_name = re.sub(r'^ *$','',new_section_name)
                if new_section_name == '':
                    continue
                section_name = new_section_name
                continue
            atag = td_col.find('a')
            if atag is not None:
                h4tag = None
                for h4tag in atag.findNextSiblings('h4'):
                    break
                if h4tag is None:
                    continue
                author = self.tag_to_string(h4tag,use_alt=False)
                try:
                    url = re.sub(r'\?.*', '', atag['href'])
                except:
                    continue
                url = re.sub(r'\?.*', '', a['href'])
                if self.exclude_url(url):
                    continue
                if '?' in url:
                    url += '&pagewanted=all'
                else:
                    url += '?pagewanted=all'
                if self.filterDuplicates:
                    if url in self.url_list:
                        continue
                self.url_list.append(url)
                title = self.tag_to_string(a, use_alt=True).strip()
                desc = h3_item.find('p')
                title = self.tag_to_string(atag, use_alt=False).strip()
                desc = atag.parent.find('p')
                if desc is not None:
                    description = self.tag_to_string(desc,use_alt=False)
                else:
@@ -720,10 +710,11 @@ class NYTimes(BasicNewsRecipe):
                if not self.articles.has_key(section_name):
                    self.ans.append(section_name)
                    self.articles[section_name] = []
                print('Title '+title+' author '+author)
                self.articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content=''))

        self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
        return self.filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans)))
        return self.filter_ans(self.ans)

    def parse_index(self):
        if self.headlinesOnly:
@@ -833,8 +824,9 @@ class NYTimes(BasicNewsRecipe):
        for divr in soup.findAll('div',attrs={'class':re.compile('w190 right')}):
            if divr.find(text=re.compile('Sign up')):
                divr.extract()
        divr = soup.find('div',attrs={'id':re.compile('related-content')})
        divr = soup.find('div',attrs={'class':re.compile('^relatedArticlesModule')})
        if divr is not None:
            print("PROCESSING RELATED: "+self.tag_to_string(soup.title,False))
            # handle related articles
            rlist = []
            ul = divr.find('ul')
@@ -864,6 +856,8 @@ class NYTimes(BasicNewsRecipe):
                asidediv.append(Tag(soup,'hr'))
                smain = soup.find('body')
                smain.append(asidediv)
        else:
            print("CANNOT FIND RELATED: "+self.tag_to_string(soup.title,False))
        for atag in soup.findAll('a'):
            img = atag.find('img')
            if img is not None:
@@ -906,6 +900,18 @@ class NYTimes(BasicNewsRecipe):
                        first_outer = outerdiv
                    else:
                        litag.extract()
            for h6tag in rdiv.findAll('h6'):
                if h6tag.find('a') is not None:
                    if h6tag.find('a')['href'].startswith('http://www.nytimes.com'):
                        url = re.sub(r'\?.*', '', h6tag.find('a')['href'])
                        h6tag.find('a')['href'] = url+'?pagewanted=all'
                        h6tag.extract()
                        related.append(h6tag)
                        if first_related is None:
                            first_related = rdiv
                            first_outer = outerdiv
                    else:
                        h6tag.extract()
        if related != []:
            for r in related:
                if r.h6: # don't want the anchor inside a h6 tag