From 54f93e96b7daa979f45b50e696a7cea369323dfd Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 20 Nov 2010 08:43:27 -0700
Subject: [PATCH] Improved NY times

---
 resources/recipes/nytimes.recipe     | 304 +++++++++++++++++++--------
 resources/recipes/nytimes_sub.recipe | 304 +++++++++++++++++++--------
 2 files changed, 440 insertions(+), 168 deletions(-)

diff --git a/resources/recipes/nytimes.recipe b/resources/recipes/nytimes.recipe
index 16ddea9f8c..fbb4641580 100644
--- a/resources/recipes/nytimes.recipe
+++ b/resources/recipes/nytimes.recipe
@@ -7,14 +7,22 @@ nytimes.com
 '''
 import re, string, time
 from calibre import entity_to_unicode, strftime
+from datetime import timedelta, date
 from calibre.web.feeds.recipes import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup
 
+
 class NYTimes(BasicNewsRecipe):
 
-    # set headlinesOnly to True for the headlines-only version
+    # set headlinesOnly to True for the headlines-only version. If True, webEdition is ignored.
     headlinesOnly = True
 
+    # set webEdition to True for the Web edition of the newspaper. Set oldest_article to the
+    # number of days old an article can be for inclusion. If oldest_article = 0 all articles
+    # will be included. Note: oldest_article is ignored if webEdition = False
+    webEdition = False
+    oldest_article = 7
+
     # includeSections: List of sections to include. If empty, all sections found will be included.
     # Otherwise, only the sections named will be included. For example,
     #
@@ -39,20 +47,76 @@ class NYTimes(BasicNewsRecipe):
     # from an article (if one exists).  If one_picture_per_article = True, the image
     # will be moved to a location between the headline and the byline.
     # If one_picture_per_article = False, all images from the article will be included
-
     # and shown in their original location.
-    one_picture_per_article = True
+    one_picture_per_article = False
 
     # The maximum number of articles that will be downloaded
     max_articles_per_feed = 100
 
+    # Whether to omit duplicates of articles (typically arising when articles are indexed in
+    # more than one section). If True, only the first occurrence will be downloaded.
+    filterDuplicates = True
+
+    # Sections to collect for the Web edition.
+    # Delete any you don't want, or use includeSections or excludeSections
+    web_sections = [(u'World',u'world'),
+                    (u'U.S.',u'national'),
+                    (u'Politics',u'politics'),
+                    (u'New York',u'nyregion'),
+                    (u'Business','business'),
+                    (u'Technology',u'technology'),
+                    (u'Sports',u'sports'),
+                    (u'Science',u'science'),
+                    (u'Health',u'health'),
+                    (u'Opinion',u'opinion'),
+                    (u'Arts',u'arts'),
+                    (u'Books',u'books'),
+                    (u'Movies',u'movies'),
+                    (u'Music',u'arts/music'),
+                    (u'Television',u'arts/television'),
+                    (u'Style',u'style'),
+                    (u'Dining & Wine',u'dining'),
+                    (u'Fashion & Style',u'fashion'),
+                    (u'Home & Garden',u'garden'),
+                    (u'Travel',u'travel'),
+                    ('Education',u'education'),
+                    ('Multimedia',u'multimedia'),
+                    (u'Obituaries',u'obituaries'),
+                    (u'Sunday Magazine',u'magazine'),
+                    (u'Week in Review',u'weekinreview')]
+
 
     if headlinesOnly:
         title='New York Times Headlines'
         description = 'Headlines from the New York Times'
+        needs_subscription = False
+    elif webEdition:
+        title='New York Times (Web)'
+        description = 'New York Times on the Web'
+        needs_subscription = True
     else:
         title='New York Times'
         description = 'Today\'s New York Times'
+        needs_subscription = True
+
+
+    month_list = ['january','february','march','april','may','june','july','august','september','october','november','december']
+
+    def decode_us_date(self,datestr):
+        '''
+        Convert a US-style date string such as "November 20 2010" into a
+        datetime.date. Today's date is returned when the string can not
+        be parsed or does not name a valid calendar date.
+        '''
+        udate = datestr.strip().lower().split()
+        try:
+            m = self.month_list.index(udate[0])+1
+            return date(int(udate[2]),m,int(udate[1]))
+        except (ValueError, IndexError, OverflowError):
+            # today must be called; returning date.today itself would hand back a method, not a date
+            return date.today()
+
+    earliest_date = date.today() - timedelta(days=oldest_article)
 
     __author__  = 'GRiker/Kovid Goyal/Nick Redding'
     language = 'en'
@@ -136,6 +200,12 @@ class NYTimes(BasicNewsRecipe):
                 .image {text-align: center;}
                 .source {text-align: left; }'''
 
+
+    articles = {}
+    key = None
+    ans = []
+    url_list = []
+
     def filter_ans(self, ans) :
         total_article_count = 0
         idx = 0
@@ -164,6 +234,29 @@ class NYTimes(BasicNewsRecipe):
         self.log( "Queued %d articles" % total_article_count )
         return ans
 
+    def exclude_url(self,url):
+        '''
+        Tell whether a link should be left out of the download queue.
+
+        Returns False (keep) only for a link that starts with http, ends
+        in .html, mentions nytimes.com and contains none of the markers
+        in unwanted; returns True (exclude) in every other case.
+        '''
+        # Substrings whose presence causes a link to be rejected
+        unwanted = ('podcast', '/video/', '/slideshow/', '/magazine/index',
+                    '/interactive/', '/reference/', '/premium/')
+        # The link must start with http ...
+        if not url.startswith("http"):
+            return True
+        # ... must end in .html ...
+        if not url.endswith(".html"):
+            return True
+        # ... and must mention nytimes.com
+        if 'nytimes.com' not in url:
+            return True
+        # Anything carrying an unwanted marker is dropped
+        return any(marker in url for marker in unwanted)
+
     def fixChars(self,string):
         # Replace lsquo (\x91)
         fixed = re.sub("\x91","‘",string)
@@ -249,7 +342,6 @@ class NYTimes(BasicNewsRecipe):
             return BeautifulSoup(_raw, markupMassage=massage)
 
         # Entry point
-        print "index_to_soup()"
         soup = get_the_soup( self.encoding, url_or_raw )
         contentType = soup.find(True,attrs={'http-equiv':'Content-Type'})
         docEncoding =  str(contentType)[str(contentType).find('charset=') + len('charset='):str(contentType).rfind('"')]
@@ -273,83 +365,110 @@ class NYTimes(BasicNewsRecipe):
         else:
             return description
 
-    def parse_todays_index(self):
+    def feed_title(self,div):
+        return ''.join(div.findAll(text=True, recursive=True)).strip()
 
-        def feed_title(div):
-            return ''.join(div.findAll(text=True, recursive=True)).strip()
-
-        articles = {}
-        key = None
-        ans = []
-        url_list = []
-
-        def handle_article(div):
-            a = div.find('a', href=True)
-            if not a:
+    def handle_article(self,div):
+        thumbnail = div.find('div','thumbnail')
+        if thumbnail:
+            thumbnail.extract()
+        a = div.find('a', href=True)
+        if not a:
+            return
+        url = re.sub(r'\?.*', '', a['href'])
+        if self.exclude_url(url):
+            return
+        url += '?pagewanted=all'
+        if self.filterDuplicates:
+            if url in self.url_list:
                 return
-            url = re.sub(r'\?.*', '', a['href'])
-            if not url.startswith("http"):
-                return
-            if not url.endswith(".html"):
-                return
-            if 'podcast' in url:
-                return
-            if '/video/' in url:
-                return
-            url += '?pagewanted=all'
-            if url in url_list:
-                return
-            url_list.append(url)
-            title = self.tag_to_string(a, use_alt=True).strip()
-            description = ''
-            pubdate = strftime('%a, %d %b')
-            summary = div.find(True, attrs={'class':'summary'})
-            if summary:
-                description = self.tag_to_string(summary, use_alt=False)
-            author = ''
+        self.url_list.append(url)
+        title = self.tag_to_string(a, use_alt=True).strip()
+        description = ''
+        pubdate = strftime('%a, %d %b')
+        summary = div.find(True, attrs={'class':'summary'})
+        if summary:
+            description = self.tag_to_string(summary, use_alt=False)
+        author = ''
+        authorAttribution = div.find(True, attrs={'class':'byline'})
+        if authorAttribution:
+            author = self.tag_to_string(authorAttribution, use_alt=False)
+        else:
             authorAttribution = div.find(True, attrs={'class':'byline'})
             if authorAttribution:
                 author = self.tag_to_string(authorAttribution, use_alt=False)
-            else:
-                authorAttribution = div.find(True, attrs={'class':'byline'})
-                if authorAttribution:
-                    author = self.tag_to_string(authorAttribution, use_alt=False)
-            feed = key if key is not None else 'Uncategorized'
-            if not articles.has_key(feed):
-                ans.append(feed)
-                articles[feed] = []
-            articles[feed].append(
-                            dict(title=title, url=url, date=pubdate,
-                                description=description, author=author,
-                                content=''))
+        feed = self.key if self.key is not None else 'Uncategorized'
+        if not self.articles.has_key(feed):
+            self.ans.append(feed)
+            self.articles[feed] = []
+        self.articles[feed].append(
+                        dict(title=title, url=url, date=pubdate,
+                            description=description, author=author,
+                            content=''))
 
 
+    def parse_web_edition(self):
+        '''Walk the index page of every entry in web_sections, queue its articles and return filter_ans(self.ans).'''
+        for (sec_title,index_url) in self.web_sections:
+            if self.includeSections != []:
+                if sec_title not in self.includeSections:
+                    print "SECTION NOT INCLUDED: ",sec_title
+                    continue
+            if sec_title in self.excludeSections:
+                print "SECTION EXCLUDED: ",sec_title
+                continue
+            print 'Index URL: '+'http://www.nytimes.com/pages/'+index_url+'/index.html'
+            soup = self.index_to_soup('http://www.nytimes.com/pages/'+index_url+'/index.html')
+            self.key = sec_title
+            # Queue each story under self.key; section header blocks are matched but not acted on here
+            for div in soup.findAll(True,
+                attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}):
+                if div['class'] in ['story', 'story headline'] :
+                    self.handle_article(div)
+                elif div['class'] == 'headlinesOnly multiline flush':
+                    for lidiv in div.findAll('li'):
+                        self.handle_article(lidiv)
+        # Pair each section name in self.ans with the article list stored for it in self.articles
+        self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
+        return self.filter_ans(self.ans)
+
+
+    def parse_todays_index(self):
+
         soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')
 
-
+        skipping = False
         # Find each article
         for div in soup.findAll(True,
             attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}):
 
             if div['class'] in ['section-headline','sectionHeader']:
-                key = string.capwords(feed_title(div))
-                key = key.replace('Op-ed','Op-Ed')
-                key = key.replace('U.s.','U.S.')
+                self.key = string.capwords(self.feed_title(div))
+                self.key = self.key.replace('Op-ed','Op-Ed')
+                self.key = self.key.replace('U.s.','U.S.')
+                self.key = self.key.replace('N.y.','N.Y.')
+                skipping = False
+                if self.includeSections != []:
+                    if self.key not in self.includeSections:
+                        print "SECTION NOT INCLUDED: ",self.key
+                        skipping = True
+                if self.key in self.excludeSections:
+                    print "SECTION EXCLUDED: ",self.key
+                    skipping = True
+
             elif div['class'] in ['story', 'story headline'] :
-                handle_article(div)
+                if not skipping:
+                    self.handle_article(div)
             elif div['class'] == 'headlinesOnly multiline flush':
                 for lidiv in div.findAll('li'):
-                    handle_article(lidiv)
+                    if not skipping:
+                        self.handle_article(lidiv)
 
-        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
-        return self.filter_ans(ans)
+        self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
+        return self.filter_ans(self.ans)
 
     def parse_headline_index(self):
 
-        articles = {}
-        ans = []
-        url_list = []
-
         soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')
 
         # Fetch the content table
@@ -363,15 +482,24 @@ class NYTimes(BasicNewsRecipe):
         for td_col in content_table.findAll('td', {'id' : re.compile('Column')}):
             for div_sec in td_col.findAll('div',recursive=False):
                 for h6_sec_name in div_sec.findAll('h6',{'style' : re.compile('text-transform: *uppercase')}):
+
                     section_name = self.tag_to_string(h6_sec_name,use_alt=False)
                     section_name = re.sub(r'^ *$','',section_name)
+
                     if section_name == '':
                         continue
+                    if self.includeSections != []:
+                        if section_name not in self.includeSections:
+                            print "SECTION NOT INCLUDED: ",section_name
+                            continue
+                    if section_name in self.excludeSections:
+                        print "SECTION EXCLUDED: ",section_name
+                        continue
+
                     section_name=string.capwords(section_name)
-                    if section_name == 'U.s.':
-                       section_name = 'U.S.'
-                    elif section_name == 'Op-ed':
-                       section_name = 'Op-Ed'
+                    section_name = section_name.replace('Op-ed','Op-Ed')
+                    section_name = section_name.replace('U.s.','U.S.')
+                    section_name = section_name.replace('N.y.','N.Y.')
                     pubdate = strftime('%a, %d %b')
 
                     search_div = div_sec
@@ -392,37 +520,32 @@ class NYTimes(BasicNewsRecipe):
                         if not a:
                             continue
                         url = re.sub(r'\?.*', '', a['href'])
-                        if not url.startswith("http"):
-                            continue
-                        if not url.endswith(".html"):
-                            continue
-                        if 'podcast' in url:
-                            continue
-                        if 'video' in url:
+                        if self.exclude_url(url):
                             continue
                         url += '?pagewanted=all'
-                        if url in url_list:
-                            continue
-                        url_list.append(url)
-                        self.log("URL %s" % url)
+                        if self.filterDuplicates:
+                            if url in self.url_list:
+                                continue
+                        self.url_list.append(url)
                         title = self.tag_to_string(a, use_alt=True).strip()
                         desc = h3_item.find('p')
                         if desc is not None:
                             description = self.tag_to_string(desc,use_alt=False)
                         else:
                             description = ''
-                        if not articles.has_key(section_name):
-                            ans.append(section_name)
-                            articles[section_name] = []
-                        articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content=''))
+                        if not self.articles.has_key(section_name):
+                            self.ans.append(section_name)
+                            self.articles[section_name] = []
+                        self.articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content=''))
 
-
-        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
-        return self.filter_ans(ans)
+        self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
+        return self.filter_ans(self.ans)
 
     def parse_index(self):
         if self.headlinesOnly:
             return self.parse_headline_index()
+        elif self.webEdition:
+            return self.parse_web_edition()
         else:
             return self.parse_todays_index()
 
@@ -438,6 +561,21 @@ class NYTimes(BasicNewsRecipe):
 
     def preprocess_html(self, soup):
 
+        if self.webEdition & (self.oldest_article>0):
+            date_tag = soup.find(True,attrs={'class': ['dateline','date']})
+            if date_tag:
+                date_str = self.tag_to_string(date_tag,use_alt=False)
+                date_str = date_str.replace('Published:','')
+                date_items = date_str.split(',')
+                try:
+                    datestring = date_items[0]+' '+date_items[1]
+                    article_date = self.decode_us_date(datestring)
+                except:
+                    article_date = date.today()
+                if article_date < self.earliest_date:
+                    self.log("Skipping article dated %s" % date_str)
+                    return None
+
         kicker_tag = soup.find(attrs={'class':'kicker'})
         if kicker_tag: # remove Op_Ed author head shots
             tagline = self.tag_to_string(kicker_tag)
@@ -462,7 +600,6 @@ class NYTimes(BasicNewsRecipe):
                     for inlineImg in inlineImgs[1:]:
                         inlineImg.extract()
                     # Move firstImg before article body
-                    #article_body = soup.find(True, {'id':'articleBody'})
                     cgFirst = soup.find(True, {'class':re.compile('columnGroup  *first')})
                     if cgFirst:
                         # Strip all sibling NavigableStrings: noise
@@ -548,4 +685,3 @@ class NYTimes(BasicNewsRecipe):
             divTag.replaceWith(tag)
 
         return soup
-
diff --git a/resources/recipes/nytimes_sub.recipe b/resources/recipes/nytimes_sub.recipe
index ed1ba75f0f..ad98b466e1 100644
--- a/resources/recipes/nytimes_sub.recipe
+++ b/resources/recipes/nytimes_sub.recipe
@@ -7,14 +7,22 @@ nytimes.com
 '''
 import re, string, time
 from calibre import entity_to_unicode, strftime
+from datetime import timedelta, date
 from calibre.web.feeds.recipes import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup
 
+
 class NYTimes(BasicNewsRecipe):
 
-    # set headlinesOnly to True for the headlines-only version
+    # set headlinesOnly to True for the headlines-only version. If True, webEdition is ignored.
     headlinesOnly = False
 
+    # set webEdition to True for the Web edition of the newspaper. Set oldest_article to the
+    # number of days old an article can be for inclusion. If oldest_article = 0 all articles
+    # will be included. Note: oldest_article is ignored if webEdition = False
+    webEdition = False
+    oldest_article = 7
+
     # includeSections: List of sections to include. If empty, all sections found will be included.
     # Otherwise, only the sections named will be included. For example,
     #
@@ -39,20 +47,76 @@ class NYTimes(BasicNewsRecipe):
     # from an article (if one exists).  If one_picture_per_article = True, the image
     # will be moved to a location between the headline and the byline.
     # If one_picture_per_article = False, all images from the article will be included
-
     # and shown in their original location.
-    one_picture_per_article = True
+    one_picture_per_article = False
 
     # The maximum number of articles that will be downloaded
     max_articles_per_feed = 100
 
+    # Whether to omit duplicates of articles (typically arising when articles are indexed in
+    # more than one section). If True, only the first occurrence will be downloaded.
+    filterDuplicates = True
+
+    # Sections to collect for the Web edition.
+    # Delete any you don't want, or use includeSections or excludeSections
+    web_sections = [(u'World',u'world'),
+                    (u'U.S.',u'national'),
+                    (u'Politics',u'politics'),
+                    (u'New York',u'nyregion'),
+                    (u'Business','business'),
+                    (u'Technology',u'technology'),
+                    (u'Sports',u'sports'),
+                    (u'Science',u'science'),
+                    (u'Health',u'health'),
+                    (u'Opinion',u'opinion'),
+                    (u'Arts',u'arts'),
+                    (u'Books',u'books'),
+                    (u'Movies',u'movies'),
+                    (u'Music',u'arts/music'),
+                    (u'Television',u'arts/television'),
+                    (u'Style',u'style'),
+                    (u'Dining & Wine',u'dining'),
+                    (u'Fashion & Style',u'fashion'),
+                    (u'Home & Garden',u'garden'),
+                    (u'Travel',u'travel'),
+                    ('Education',u'education'),
+                    ('Multimedia',u'multimedia'),
+                    (u'Obituaries',u'obituaries'),
+                    (u'Sunday Magazine',u'magazine'),
+                    (u'Week in Review',u'weekinreview')]
+
 
     if headlinesOnly:
         title='New York Times Headlines'
         description = 'Headlines from the New York Times'
+        needs_subscription = False
+    elif webEdition:
+        title='New York Times (Web)'
+        description = 'New York Times on the Web'
+        needs_subscription = True
     else:
         title='New York Times'
         description = 'Today\'s New York Times'
+        needs_subscription = True
+
+
+    month_list = ['january','february','march','april','may','june','july','august','september','october','november','december']
+
+    def decode_us_date(self,datestr):
+        '''
+        Convert a US-style date string such as "November 20 2010" into a
+        datetime.date. Today's date is returned when the string can not
+        be parsed or does not name a valid calendar date.
+        '''
+        udate = datestr.strip().lower().split()
+        try:
+            m = self.month_list.index(udate[0])+1
+            return date(int(udate[2]),m,int(udate[1]))
+        except (ValueError, IndexError, OverflowError):
+            # today must be called; returning date.today itself would hand back a method, not a date
+            return date.today()
+
+    earliest_date = date.today() - timedelta(days=oldest_article)
 
     __author__  = 'GRiker/Kovid Goyal/Nick Redding'
     language = 'en'
@@ -60,7 +124,6 @@ class NYTimes(BasicNewsRecipe):
 
 
     timefmt = ''
-    needs_subscription = True
     masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
     cover_margins = (18,18,'grey99')
 
@@ -137,6 +200,12 @@ class NYTimes(BasicNewsRecipe):
                 .image {text-align: center;}
                 .source {text-align: left; }'''
 
+
+    articles = {}
+    key = None
+    ans = []
+    url_list = []
+
     def filter_ans(self, ans) :
         total_article_count = 0
         idx = 0
@@ -165,6 +234,29 @@ class NYTimes(BasicNewsRecipe):
         self.log( "Queued %d articles" % total_article_count )
         return ans
 
+    def exclude_url(self,url):
+        '''
+        Tell whether a link should be left out of the download queue.
+
+        Returns False (keep) only for a link that starts with http, ends
+        in .html, mentions nytimes.com and contains none of the markers
+        in unwanted; returns True (exclude) in every other case.
+        '''
+        # Substrings whose presence causes a link to be rejected
+        unwanted = ('podcast', '/video/', '/slideshow/', '/magazine/index',
+                    '/interactive/', '/reference/', '/premium/')
+        # The link must start with http ...
+        if not url.startswith("http"):
+            return True
+        # ... must end in .html ...
+        if not url.endswith(".html"):
+            return True
+        # ... and must mention nytimes.com
+        if 'nytimes.com' not in url:
+            return True
+        # Anything carrying an unwanted marker is dropped
+        return any(marker in url for marker in unwanted)
+
     def fixChars(self,string):
         # Replace lsquo (\x91)
         fixed = re.sub("\x91","‘",string)
@@ -250,7 +342,6 @@ class NYTimes(BasicNewsRecipe):
             return BeautifulSoup(_raw, markupMassage=massage)
 
         # Entry point
-        print "index_to_soup()"
         soup = get_the_soup( self.encoding, url_or_raw )
         contentType = soup.find(True,attrs={'http-equiv':'Content-Type'})
         docEncoding =  str(contentType)[str(contentType).find('charset=') + len('charset='):str(contentType).rfind('"')]
@@ -274,83 +365,110 @@ class NYTimes(BasicNewsRecipe):
         else:
             return description
 
-    def parse_todays_index(self):
+    def feed_title(self,div):
+        return ''.join(div.findAll(text=True, recursive=True)).strip()
 
-        def feed_title(div):
-            return ''.join(div.findAll(text=True, recursive=True)).strip()
-
-        articles = {}
-        key = None
-        ans = []
-        url_list = []
-
-        def handle_article(div):
-            a = div.find('a', href=True)
-            if not a:
+    def handle_article(self,div):
+        thumbnail = div.find('div','thumbnail')
+        if thumbnail:
+            thumbnail.extract()
+        a = div.find('a', href=True)
+        if not a:
+            return
+        url = re.sub(r'\?.*', '', a['href'])
+        if self.exclude_url(url):
+            return
+        url += '?pagewanted=all'
+        if self.filterDuplicates:
+            if url in self.url_list:
                 return
-            url = re.sub(r'\?.*', '', a['href'])
-            if not url.startswith("http"):
-                return
-            if not url.endswith(".html"):
-                return
-            if 'podcast' in url:
-                return
-            if '/video/' in url:
-                return
-            url += '?pagewanted=all'
-            if url in url_list:
-                return
-            url_list.append(url)
-            title = self.tag_to_string(a, use_alt=True).strip()
-            description = ''
-            pubdate = strftime('%a, %d %b')
-            summary = div.find(True, attrs={'class':'summary'})
-            if summary:
-                description = self.tag_to_string(summary, use_alt=False)
-            author = ''
+        self.url_list.append(url)
+        title = self.tag_to_string(a, use_alt=True).strip()
+        description = ''
+        pubdate = strftime('%a, %d %b')
+        summary = div.find(True, attrs={'class':'summary'})
+        if summary:
+            description = self.tag_to_string(summary, use_alt=False)
+        author = ''
+        authorAttribution = div.find(True, attrs={'class':'byline'})
+        if authorAttribution:
+            author = self.tag_to_string(authorAttribution, use_alt=False)
+        else:
             authorAttribution = div.find(True, attrs={'class':'byline'})
             if authorAttribution:
                 author = self.tag_to_string(authorAttribution, use_alt=False)
-            else:
-                authorAttribution = div.find(True, attrs={'class':'byline'})
-                if authorAttribution:
-                    author = self.tag_to_string(authorAttribution, use_alt=False)
-            feed = key if key is not None else 'Uncategorized'
-            if not articles.has_key(feed):
-                ans.append(feed)
-                articles[feed] = []
-            articles[feed].append(
-                            dict(title=title, url=url, date=pubdate,
-                                description=description, author=author,
-                                content=''))
+        feed = self.key if self.key is not None else 'Uncategorized'
+        if not self.articles.has_key(feed):
+            self.ans.append(feed)
+            self.articles[feed] = []
+        self.articles[feed].append(
+                        dict(title=title, url=url, date=pubdate,
+                            description=description, author=author,
+                            content=''))
 
 
+    def parse_web_edition(self):
+        '''Walk the index page of every entry in web_sections, queue its articles and return filter_ans(self.ans).'''
+        for (sec_title,index_url) in self.web_sections:
+            if self.includeSections != []:
+                if sec_title not in self.includeSections:
+                    print "SECTION NOT INCLUDED: ",sec_title
+                    continue
+            if sec_title in self.excludeSections:
+                print "SECTION EXCLUDED: ",sec_title
+                continue
+            print 'Index URL: '+'http://www.nytimes.com/pages/'+index_url+'/index.html'
+            soup = self.index_to_soup('http://www.nytimes.com/pages/'+index_url+'/index.html')
+            self.key = sec_title
+            # Queue each story under self.key; section header blocks are matched but not acted on here
+            for div in soup.findAll(True,
+                attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}):
+                if div['class'] in ['story', 'story headline'] :
+                    self.handle_article(div)
+                elif div['class'] == 'headlinesOnly multiline flush':
+                    for lidiv in div.findAll('li'):
+                        self.handle_article(lidiv)
+        # Pair each section name in self.ans with the article list stored for it in self.articles
+        self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
+        return self.filter_ans(self.ans)
+
+
+    def parse_todays_index(self):
+
         soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')
 
-
+        skipping = False
         # Find each article
         for div in soup.findAll(True,
             attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}):
 
             if div['class'] in ['section-headline','sectionHeader']:
-                key = string.capwords(feed_title(div))
-                key = key.replace('Op-ed','Op-Ed')
-                key = key.replace('U.s.','U.S.')
+                self.key = string.capwords(self.feed_title(div))
+                self.key = self.key.replace('Op-ed','Op-Ed')
+                self.key = self.key.replace('U.s.','U.S.')
+                self.key = self.key.replace('N.y.','N.Y.')
+                skipping = False
+                if self.includeSections != []:
+                    if self.key not in self.includeSections:
+                        print "SECTION NOT INCLUDED: ",self.key
+                        skipping = True
+                if self.key in self.excludeSections:
+                    print "SECTION EXCLUDED: ",self.key
+                    skipping = True
+
             elif div['class'] in ['story', 'story headline'] :
-                handle_article(div)
+                if not skipping:
+                    self.handle_article(div)
             elif div['class'] == 'headlinesOnly multiline flush':
                 for lidiv in div.findAll('li'):
-                    handle_article(lidiv)
+                    if not skipping:
+                        self.handle_article(lidiv)
 
-        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
-        return self.filter_ans(ans)
+        self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
+        return self.filter_ans(self.ans)
 
     def parse_headline_index(self):
 
-        articles = {}
-        ans = []
-        url_list = []
-
         soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')
 
         # Fetch the content table
@@ -364,15 +482,24 @@ class NYTimes(BasicNewsRecipe):
         for td_col in content_table.findAll('td', {'id' : re.compile('Column')}):
             for div_sec in td_col.findAll('div',recursive=False):
                 for h6_sec_name in div_sec.findAll('h6',{'style' : re.compile('text-transform: *uppercase')}):
+
                     section_name = self.tag_to_string(h6_sec_name,use_alt=False)
                     section_name = re.sub(r'^ *$','',section_name)
+
                     if section_name == '':
                         continue
+                    if self.includeSections != []:  # non-empty list acts as a whitelist; NOTE(review): matched on the raw name, before capwords below
+                        if section_name not in self.includeSections:
+                            print "SECTION NOT INCLUDED: ",section_name
+                            continue
+                    if section_name in self.excludeSections:  # checked even when includeSections is empty
+                        print "SECTION EXCLUDED: ",section_name
+                        continue
+
                     section_name=string.capwords(section_name)
-                    if section_name == 'U.s.':
-                       section_name = 'U.S.'
-                    elif section_name == 'Op-ed':
-                       section_name = 'Op-Ed'
+                    section_name = section_name.replace('Op-ed','Op-Ed')  # restore capitals that capwords lowercased
+                    section_name = section_name.replace('U.s.','U.S.')
+                    section_name = section_name.replace('N.y.','N.Y.')
                     pubdate = strftime('%a, %d %b')
 
                     search_div = div_sec
@@ -393,37 +520,32 @@ class NYTimes(BasicNewsRecipe):
                         if not a:
                             continue
                         url = re.sub(r'\?.*', '', a['href'])
-                        if not url.startswith("http"):
-                            continue
-                        if not url.endswith(".html"):
-                            continue
-                        if 'podcast' in url:
-                            continue
-                        if 'video' in url:
+                        if self.exclude_url(url):  # presumably covers the removed inline checks - confirm against exclude_url
                             continue
                         url += '?pagewanted=all'
-                        if url in url_list:
-                            continue
-                        url_list.append(url)
-                        self.log("URL %s" % url)
+                        if self.filterDuplicates:  # duplicate suppression is optional
+                            if url in self.url_list:
+                                continue
+                        self.url_list.append(url)  # recorded whether or not filterDuplicates is set
                         title = self.tag_to_string(a, use_alt=True).strip()
                         desc = h3_item.find('p')
                         if desc is not None:
                             description = self.tag_to_string(desc,use_alt=False)
                         else:
                             description = ''
-                        if not articles.has_key(section_name):
-                            ans.append(section_name)
-                            articles[section_name] = []
-                        articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content=''))
+                        if not self.articles.has_key(section_name):  # first article seen for this section
+                            self.ans.append(section_name)  # list order preserves first-seen section order
+                            self.articles[section_name] = []
+                        self.articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content=''))
 
-
-        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
-        return self.filter_ans(ans)
+        self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]  # rebinds self.ans from section names to (section, articles) pairs
+        return self.filter_ans(self.ans)
 
     def parse_index(self):
         if self.headlinesOnly:
             return self.parse_headline_index()
+        elif self.webEdition:  # consulted only when headlinesOnly is False
+            return self.parse_web_edition()
         else:
             return self.parse_todays_index()
 
@@ -439,6 +561,21 @@ class NYTimes(BasicNewsRecipe):
 
     def preprocess_html(self, soup):
 
+        if self.webEdition and self.oldest_article > 0:  # logical 'and' (was bitwise '&'); oldest_article == 0 disables the age filter
+            date_tag = soup.find(True,attrs={'class': ['dateline','date']})
+            if date_tag:
+                date_str = self.tag_to_string(date_tag,use_alt=False)
+                date_str = date_str.replace('Published:','')
+                date_items = date_str.split(',')
+                try:  # joins the first two comma-separated fields; presumably "<month day>, <year>"
+                    datestring = date_items[0]+' '+date_items[1]
+                    article_date = self.decode_us_date(datestring)
+                except Exception:  # was a bare except, which also swallowed KeyboardInterrupt/SystemExit
+                    article_date = date.today()  # unparsable date: fall back to today's date
+                if article_date < self.earliest_date:
+                    self.log("Skipping article dated %s" % date_str)
+                    return None  # returns None instead of the soup for articles older than earliest_date
+
         kicker_tag = soup.find(attrs={'class':'kicker'})
         if kicker_tag: # remove Op_Ed author head shots
             tagline = self.tag_to_string(kicker_tag)
@@ -463,7 +600,6 @@ class NYTimes(BasicNewsRecipe):
                     for inlineImg in inlineImgs[1:]:
                         inlineImg.extract()
                     # Move firstImg before article body
-                    #article_body = soup.find(True, {'id':'articleBody'})
                     cgFirst = soup.find(True, {'class':re.compile('columnGroup  *first')})
                     if cgFirst:
                         # Strip all sibling NavigableStrings: noise