From f77765ff3c458819ac8c0ae696a46012b5b70b3c Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 1 Jan 2013 09:52:44 +0530
Subject: [PATCH] Update NY Times

---
 recipes/nytimes.recipe     | 83 +++++++++++++++++++++++++++++++++++---
 recipes/nytimes_sub.recipe | 83 +++++++++++++++++++++++++++++++++++---
 2 files changed, 156 insertions(+), 10 deletions(-)

diff --git a/recipes/nytimes.recipe b/recipes/nytimes.recipe
index ba97a2c0be..f5b994275e 100644
--- a/recipes/nytimes.recipe
+++ b/recipes/nytimes.recipe
@@ -15,6 +15,7 @@ from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup
 class NYTimes(BasicNewsRecipe):
 
     recursions=1 # set this to zero to omit Related articles lists
+    match_regexps=[r'/[12][0-9][0-9][0-9]/[0-9]+/'] # speeds up processing by preventing index page links from being followed
 
     # set getTechBlogs to True to include the technology blogs
     # set tech_oldest_article to control article age
@@ -24,6 +25,14 @@ class NYTimes(BasicNewsRecipe):
     tech_oldest_article = 14
     tech_max_articles_per_feed = 25
 
+    # set getPopularArticles to False if you don't want the Most E-mailed and Most Viewed articles
+    # otherwise you will get up to 20 of the most popular e-mailed and viewed articles (in each category)
+    getPopularArticles = True
+    popularPeriod = '1' # set this to the number of days to include in the measurement
+                        # e.g. 7 will get the most popular measured over the last 7 days
+                        # and 30 will get the most popular measured over 30 days.
+                        # you still only get up to 20 articles in each category
+
     # set headlinesOnly to True for the headlines-only version. If True, webEdition is ignored.
     headlinesOnly = True
 
@@ -376,6 +385,7 @@ class NYTimes(BasicNewsRecipe):
 
     masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
 
+
     def short_title(self):
         return self.title
 
@@ -384,6 +394,7 @@ class NYTimes(BasicNewsRecipe):
         from contextlib import closing
         import copy
         from calibre.ebooks.chardet import xml_to_unicode
+        print("ARTICLE_TO_SOUP "+url_or_raw)
         if re.match(r'\w+://', url_or_raw):
             br = self.clone_browser(self.browser)
             open_func = getattr(br, 'open_novisit', br.open)
@@ -475,6 +486,67 @@ class NYTimes(BasicNewsRecipe):
                         description=description, author=author,
                         content=''))
 
+    def get_popular_articles(self,ans):
+        if self.getPopularArticles:
+            popular_articles = {}
+            key_list = []
+
+            def handleh3(h3tag):
+                try:
+                    url = h3tag.a['href']
+                except:
+                    return ('','','','')
+                url = re.sub(r'\?.*', '', url)
+                if self.exclude_url(url):
+                    return ('','','','')
+                url += '?pagewanted=all'
+                title = self.tag_to_string(h3tag.a,False)
+                h6tag = h3tag.findNextSibling('h6')
+                if h6tag is not None:
+                    author = self.tag_to_string(h6tag,False)
+                else:
+                    author = ''
+                ptag = h3tag.findNextSibling('p')
+                if ptag is not None:
+                    desc = self.tag_to_string(ptag,False)
+                else:
+                    desc = ''
+                return(title,url,author,desc)
+
+
+            have_emailed = False
+            emailed_soup = self.index_to_soup('http://www.nytimes.com/most-popular-emailed?period='+self.popularPeriod)
+            for h3tag in emailed_soup.findAll('h3'):
+                (title,url,author,desc) = handleh3(h3tag)
+                if url=='':
+                    continue
+                if not have_emailed:
+                    key_list.append('Most E-Mailed')
+                    popular_articles['Most E-Mailed'] = []
+                    have_emailed = True
+                popular_articles['Most E-Mailed'].append(
+                    dict(title=title, url=url, date=strftime('%a, %d %b'),
+                        description=desc, author=author,
+                        content=''))
+            have_viewed = False
+            viewed_soup = self.index_to_soup('http://www.nytimes.com/most-popular-viewed?period='+self.popularPeriod)
+            for h3tag in viewed_soup.findAll('h3'):
+                (title,url,author,desc) = handleh3(h3tag)
+                if url=='':
+                    continue
+                if not have_viewed:
+                    key_list.append('Most Viewed')
+                    popular_articles['Most Viewed'] = []
+                    have_viewed = True
+                popular_articles['Most Viewed'].append(
+                    dict(title=title, url=url, date=strftime('%a, %d %b'),
+                        description=desc, author=author,
+                        content=''))
+            viewed_ans = [(k, popular_articles[k]) for k in key_list if popular_articles.has_key(k)]
+            for x in viewed_ans:
+                ans.append(x)
+        return ans
+
     def get_tech_feeds(self,ans):
         if self.getTechBlogs:
             tech_articles = {}
@@ -536,7 +608,7 @@ class NYTimes(BasicNewsRecipe):
                     self.handle_article(lidiv)
 
         self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
-        return self.filter_ans(self.get_tech_feeds(self.ans))
+        return self.filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans)))
 
     def parse_todays_index(self):
 
@@ -569,7 +641,7 @@ class NYTimes(BasicNewsRecipe):
                     self.handle_article(lidiv)
 
         self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
-        return self.filter_ans(self.get_tech_feeds(self.ans))
+        return self.filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans)))
 
     def parse_headline_index(self):
 
@@ -643,7 +715,7 @@ class NYTimes(BasicNewsRecipe):
                     self.articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content=''))
 
         self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
-        return self.filter_ans(self.get_tech_feeds(self.ans))
+        return self.filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans)))
 
     def parse_index(self):
         if self.headlinesOnly:
@@ -731,7 +803,7 @@ class NYTimes(BasicNewsRecipe):
 
 
     def preprocess_html(self, soup):
-        #print("PREPROCESS TITLE="+self.tag_to_string(soup.title))
+        #print(strftime("%H:%M:%S")+" -- PREPROCESS TITLE="+self.tag_to_string(soup.title))
         skip_tag = soup.find(True, {'name':'skip'})
         if skip_tag is not None:
             #url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href'])
@@ -907,6 +979,7 @@ class NYTimes(BasicNewsRecipe):
         for aside in soup.findAll('div','aside'):
             aside.extract()
         soup = self.strip_anchors(soup,True)
+        #print("RECURSIVE: "+self.tag_to_string(soup.title))
 
         if soup.find('div',attrs={'id':'blogcontent'}) is None:
             if first_fetch:
@@ -1071,7 +1144,7 @@ class NYTimes(BasicNewsRecipe):
                     divTag.replaceWith(tag)
         except:
             self.log("ERROR: Problem in Add class=authorId to <div> so we can format with CSS")
-
+        #print(strftime("%H:%M:%S")+" -- POSTPROCESS TITLE="+self.tag_to_string(soup.title))
         return soup
 
     def populate_article_metadata(self, article, soup, first):
diff --git a/recipes/nytimes_sub.recipe b/recipes/nytimes_sub.recipe
index d550a5158f..df44856293 100644
--- a/recipes/nytimes_sub.recipe
+++ b/recipes/nytimes_sub.recipe
@@ -15,6 +15,7 @@ from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup
 class NYTimes(BasicNewsRecipe):
 
     recursions=1 # set this to zero to omit Related articles lists
+    match_regexps=[r'/[12][0-9][0-9][0-9]/[0-9]+/'] # speeds up processing by preventing index page links from being followed
 
     # set getTechBlogs to True to include the technology blogs
     # set tech_oldest_article to control article age
@@ -24,6 +25,14 @@ class NYTimes(BasicNewsRecipe):
     tech_oldest_article = 14
     tech_max_articles_per_feed = 25
 
+    # set getPopularArticles to False if you don't want the Most E-mailed and Most Viewed articles
+    # otherwise you will get up to 20 of the most popular e-mailed and viewed articles (in each category)
+    getPopularArticles = True
+    popularPeriod = '1' # set this to the number of days to include in the measurement
+                        # e.g. 7 will get the most popular measured over the last 7 days
+                        # and 30 will get the most popular measured over 30 days.
+                        # you still only get up to 20 articles in each category
+
     # set headlinesOnly to True for the headlines-only version. If True, webEdition is ignored.
     headlinesOnly = False
 
@@ -376,6 +385,7 @@ class NYTimes(BasicNewsRecipe):
 
     masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
 
+
    def short_title(self):
         return self.title
 
@@ -384,6 +394,7 @@ class NYTimes(BasicNewsRecipe):
         from contextlib import closing
         import copy
         from calibre.ebooks.chardet import xml_to_unicode
+        print("ARTICLE_TO_SOUP "+url_or_raw)
         if re.match(r'\w+://', url_or_raw):
             br = self.clone_browser(self.browser)
             open_func = getattr(br, 'open_novisit', br.open)
@@ -475,6 +486,67 @@ class NYTimes(BasicNewsRecipe):
                         description=description, author=author,
                         content=''))
 
+    def get_popular_articles(self,ans):
+        if self.getPopularArticles:
+            popular_articles = {}
+            key_list = []
+
+            def handleh3(h3tag):
+                try:
+                    url = h3tag.a['href']
+                except:
+                    return ('','','','')
+                url = re.sub(r'\?.*', '', url)
+                if self.exclude_url(url):
+                    return ('','','','')
+                url += '?pagewanted=all'
+                title = self.tag_to_string(h3tag.a,False)
+                h6tag = h3tag.findNextSibling('h6')
+                if h6tag is not None:
+                    author = self.tag_to_string(h6tag,False)
+                else:
+                    author = ''
+                ptag = h3tag.findNextSibling('p')
+                if ptag is not None:
+                    desc = self.tag_to_string(ptag,False)
+                else:
+                    desc = ''
+                return(title,url,author,desc)
+
+
+            have_emailed = False
+            emailed_soup = self.index_to_soup('http://www.nytimes.com/most-popular-emailed?period='+self.popularPeriod)
+            for h3tag in emailed_soup.findAll('h3'):
+                (title,url,author,desc) = handleh3(h3tag)
+                if url=='':
+                    continue
+                if not have_emailed:
+                    key_list.append('Most E-Mailed')
+                    popular_articles['Most E-Mailed'] = []
+                    have_emailed = True
+                popular_articles['Most E-Mailed'].append(
+                    dict(title=title, url=url, date=strftime('%a, %d %b'),
+                        description=desc, author=author,
+                        content=''))
+            have_viewed = False
+            viewed_soup = self.index_to_soup('http://www.nytimes.com/most-popular-viewed?period='+self.popularPeriod)
+            for h3tag in viewed_soup.findAll('h3'):
+                (title,url,author,desc) = handleh3(h3tag)
+                if url=='':
+                    continue
+                if not have_viewed:
+                    key_list.append('Most Viewed')
+                    popular_articles['Most Viewed'] = []
+                    have_viewed = True
+                popular_articles['Most Viewed'].append(
+                    dict(title=title, url=url, date=strftime('%a, %d %b'),
+                        description=desc, author=author,
+                        content=''))
+            viewed_ans = [(k, popular_articles[k]) for k in key_list if popular_articles.has_key(k)]
+            for x in viewed_ans:
+                ans.append(x)
+        return ans
+
     def get_tech_feeds(self,ans):
         if self.getTechBlogs:
             tech_articles = {}
@@ -536,7 +608,7 @@ class NYTimes(BasicNewsRecipe):
                     self.handle_article(lidiv)
 
         self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
-        return self.filter_ans(self.get_tech_feeds(self.ans))
+        return self.filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans)))
 
     def parse_todays_index(self):
 
@@ -569,7 +641,7 @@ class NYTimes(BasicNewsRecipe):
                     self.handle_article(lidiv)
 
         self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
-        return self.filter_ans(self.get_tech_feeds(self.ans))
+        return self.filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans)))
 
     def parse_headline_index(self):
 
@@ -643,7 +715,7 @@ class NYTimes(BasicNewsRecipe):
                     self.articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content=''))
 
         self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
-        return self.filter_ans(self.get_tech_feeds(self.ans))
+        return self.filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans)))
 
     def parse_index(self):
         if self.headlinesOnly:
@@ -731,7 +803,7 @@ class NYTimes(BasicNewsRecipe):
 
 
     def preprocess_html(self, soup):
-        #print("PREPROCESS TITLE="+self.tag_to_string(soup.title))
+        #print(strftime("%H:%M:%S")+" -- PREPROCESS TITLE="+self.tag_to_string(soup.title))
         skip_tag = soup.find(True, {'name':'skip'})
         if skip_tag is not None:
             #url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href'])
@@ -907,6 +979,7 @@ class NYTimes(BasicNewsRecipe):
         for aside in soup.findAll('div','aside'):
             aside.extract()
         soup = self.strip_anchors(soup,True)
+        #print("RECURSIVE: "+self.tag_to_string(soup.title))
 
         if soup.find('div',attrs={'id':'blogcontent'}) is None:
             if first_fetch:
@@ -1071,7 +1144,7 @@ class NYTimes(BasicNewsRecipe):
                     divTag.replaceWith(tag)
         except:
             self.log("ERROR: Problem in Add class=authorId to <div> so we can format with CSS")
-
+        #print(strftime("%H:%M:%S")+" -- POSTPROCESS TITLE="+self.tag_to_string(soup.title))
         return soup
 
     def populate_article_metadata(self, article, soup, first):
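
Note for anyone adapting this patch: get_popular_articles() works by scraping the two "most popular" index pages, which at the time were flat listings where each entry was an <h3> headline link, optionally followed by an <h6> byline and a <p> summary; handleh3() simply walks those siblings. The standalone sketch below reproduces that parsing pattern outside calibre for quick experimentation. It is an approximation, not the recipe API: it assumes modern Python plus the bs4 package instead of calibre's bundled BeautifulSoup, and fetch_popular is an illustrative name, not a recipe method.

    # Standalone sketch of the scraping pattern behind get_popular_articles().
    # Assumes bs4 is installed and that the most-popular pages still serve the
    # <h3>/<h6>/<p> structure that the recipe's handleh3() expects.
    import re
    import urllib.request

    from bs4 import BeautifulSoup

    def fetch_popular(kind='emailed', period='1'):
        # kind is 'emailed' or 'viewed'; period mirrors popularPeriod above
        url = 'http://www.nytimes.com/most-popular-%s?period=%s' % (kind, period)
        with urllib.request.urlopen(url) as f:
            soup = BeautifulSoup(f.read(), 'html.parser')
        articles = []
        for h3 in soup.find_all('h3'):
            a = h3.find('a', href=True)
            if a is None:
                continue                      # not a headline entry
            link = re.sub(r'\?.*', '', a['href']) + '?pagewanted=all'
            h6 = h3.find_next_sibling('h6')   # byline, when present
            p = h3.find_next_sibling('p')     # summary, when present
            articles.append((a.get_text(strip=True), link,
                             h6.get_text(strip=True) if h6 else '',
                             p.get_text(strip=True) if p else ''))
        return articles[:20]                  # the pages list at most 20 entries

    if __name__ == '__main__':
        for title, link, author, desc in fetch_popular('viewed', period='7'):
            print(title, '->', link)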
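
For context on where those sections land: the parse_*_index() methods build ans as a list of (section_title, [article_dict, ...]) pairs, and the patch threads it through get_popular_articles() before get_tech_feeds() and filter_ans(), so 'Most E-Mailed' and 'Most Viewed' are appended after the regular sections. A toy illustration of that shape (every value is an invented placeholder):

    # Toy illustration of the ans structure the recipe methods pass around;
    # all titles, URLs and dates here are invented placeholders.
    ans = [
        ('World', [dict(title='Example headline',
                        url='http://www.nytimes.com/2013/01/01/example.html?pagewanted=all',
                        date='Tue, 01 Jan', description='', author='', content='')]),
    ]
    # get_popular_articles(ans) appends ('Most E-Mailed', [...]) and
    # ('Most Viewed', [...]) pairs in the same shape, get_tech_feeds(ans)
    # appends the technology blog feeds, and filter_ans(ans) does the
    # final filtering.

Note that the placeholder URL matches the new match_regexps pattern (/2013/01/), which is what lets recursions=1 follow article links without chasing index-page links.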
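
Finally, since the new knobs (getPopularArticles, popularPeriod, getTechBlogs, headlinesOnly) are plain class attributes, a user copy of the recipe only needs to override them rather than edit any method bodies. A minimal sketch; the subclass name is hypothetical:

    # Hypothetical user tweak: identical recipe, but the popular lists are
    # measured over the last 7 days and the tech blogs are enabled too.
    class NYTimesCustom(NYTimes):
        getPopularArticles = True
        popularPeriod = '7'    # days of data behind Most E-Mailed / Most Viewed
        getTechBlogs = True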