From 22003b492cdbadf5dd223c34c9a9e82d72e94abd Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Wed, 5 Aug 2009 12:22:26 -0600
Subject: [PATCH] Improved recipes for The BBC, Slate, and NYT Headlines

---
 src/calibre/web/feeds/recipes/recipe_bbc.py   |  39 ++-
 .../web/feeds/recipes/recipe_nytimes.py       | 118 +++++---
 src/calibre/web/feeds/recipes/recipe_slate.py | 266 ++++++++++++------
 3 files changed, 286 insertions(+), 137 deletions(-)

diff --git a/src/calibre/web/feeds/recipes/recipe_bbc.py b/src/calibre/web/feeds/recipes/recipe_bbc.py
index 0c9c5f60c2..f82401f987 100644
--- a/src/calibre/web/feeds/recipes/recipe_bbc.py
+++ b/src/calibre/web/feeds/recipes/recipe_bbc.py
@@ -10,23 +10,34 @@ from calibre.web.feeds.news import BasicNewsRecipe
 class BBC(BasicNewsRecipe):
     title          = u'The BBC'
-    __author__     = 'Kovid Goyal ans Sujata Raman'
+    __author__     = 'Kovid Goyal and Sujata Raman'
     description    = 'Global news and current affairs from the British Broadcasting Corporation'
     language       = _('English')
+    no_stylesheets = True
+    remove_tags    = [dict(name='div', attrs={'class':'footer'}),
+                      {'id'    : ['popstory','blq-footer']},
+                      {'class' : ['arrup','links','relatedbbcsites','arr','promobottombg','bbccom_visibility_hidden', 'sharesb', 'sib606', 'mvtb', 'storyextra', 'sidebar1', 'bbccom_text','promotopbg', 'gppromo','promotopbg','bbccom_display_none']},
+                     ]
-    remove_tags    = [dict(name='div', attrs={'class':'footer'}),]
-
+    keep_only_tags = [dict(name='div', attrs={'class':'mainwrapper'})]
 
     extra_css = '''
-        body{font-family:Arial,Helvetica,sans-serif; font-size:small;}
+        body{font-family:Arial,Helvetica,sans-serif; font-size:small; text-align:left}
         h1{font-size:large;}
+        .sh{font-size:large; font-weight:bold}
+        .cap{font-size:xx-small; }
+        .lu{font-size:xx-small; }
+        .ds{font-size:xx-small; }
+        .mvb{font-size:xx-small;}
+        .by1{font-size:x-small; color:#666666}
+        .byd{font-size:x-small;}
     '''
 
     feeds = [
        ('News Front Page', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/front_page/rss.xml'),
        ('Science/Nature', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/science/nature/rss.xml'),
        ('Technology', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/technology/rss.xml'),
-       ('Enterntainment', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/entertainment/rss.xml'),
+       ('Entertainment', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/entertainment/rss.xml'),
        ('Magazine', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/uk_news/magazine/rss.xml'),
        ('Business', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/business/rss.xml'),
        ('Health', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/health/rss.xml'),
@@ -38,8 +49,22 @@ class BBC(BasicNewsRecipe):
        ('Africa', 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/africa/rss.xml'),
     ]
 
+    def postprocess_html(self, soup, first):
 
-    def print_version(self, url):
-        return url.replace('http://', 'http://newsvote.bbc.co.uk/mpapps/pagetools/print/')
+        for tag in soup.findAll(name= 'img', alt=""):
+            tag.extract()
+
+        for item in soup.findAll(align = "right"):
+            del item['align']
+
+        for tag in soup.findAll(name=['table', 'tr', 'td']):
+            tag.name = 'div'
+
+        return soup
+
+
+
+    # def print_version(self, url):
+    #     return url.replace('http://', 'http://newsvote.bbc.co.uk/mpapps/pagetools/print/')
 
diff --git a/src/calibre/web/feeds/recipes/recipe_nytimes.py b/src/calibre/web/feeds/recipes/recipe_nytimes.py
index c73468b51c..d57bd6594c 100644
--- a/src/calibre/web/feeds/recipes/recipe_nytimes.py
+++ b/src/calibre/web/feeds/recipes/recipe_nytimes.py
@@ -8,7 +8,7 @@ nytimes.com
 import re
 from calibre import entity_to_unicode
 from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment
 
 class NYTimes(BasicNewsRecipe):
@@ -42,36 +42,39 @@ class NYTimes(BasicNewsRecipe):
     # By default, no sections are skipped.
     excludeSectionKeywords = []
 
-    # To skip sections containing the word 'Sports' or 'Dining', use:
+    # Add section keywords from the right column above to skip that section
+    # For example, to skip sections containing the word 'Sports' or 'Dining', use:
     # excludeSectionKeywords = ['Sports', 'Dining']
 
     # Fetch only Business and Technology
-    #excludeSectionKeywords = ['Arts','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Top Stories','Travel','U.S.','World']
-
+    # excludeSectionKeywords = ['Arts','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Top Stories','Travel','U.S.','World']
 
     # Fetch only Top Stories
-    #excludeSectionKeywords = ['Arts','Business','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Technology','Travel','U.S.','World']
+    # excludeSectionKeywords = ['Arts','Business','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Technology','Travel','U.S.','World']
 
     # The maximum number of articles that will be downloaded
-    max_articles_per_feed = 50
+    max_articles_per_feed = 40
 
     timefmt = ''
     needs_subscription = True
-    remove_tags_after  = dict(attrs={'id':['comments']})
-    remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink',
-                                        'clearfix', 'nextArticleLink clearfix','inlineSearchControl',
-                                        'columnGroup','entry-meta','entry-response module','jumpLink','nav',
-                                        'columnGroup advertisementColumnGroup', 'kicker entry-category']}),
-                   dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive',
-                            'side_search', 'blog_sidebar', 'side_tool', 'side_index', 'login',
-                            'blog-header','searchForm','NYTLogo','insideNYTimes','adxToolSponsor',
-                            'adxLeaderboard']),
-                   dict(name=['script', 'noscript', 'style','hr'])]
+    keep_only_tags = [ dict(attrs={ 'id':['article']})]
+    remove_tags = [ dict(attrs={'class':['nextArticleLink clearfix', 'clearfix',
+                                         'inlineVideo left brightcove']}),
+                    dict(attrs={ 'id':['toolsRight','inlineBox','sidebarArticles',
+                                       'portfolioInline','articleInline','readerscomment']}) ]
+
     encoding = 'cp1252'
     no_stylesheets = True
-    extra_css = '.headline {text-align:left;}\n\
-                 .byline {font:monospace; margin-bottom:0px;}\n\
-                 .source {align:left;}\n\
-                 .credit {text-align:right;font-size:smaller;}\n'
+    extra_css = '.headline     {text-align: left;}\n \
+                 .byline       {font-family: monospace; \
+                                text-align: left; \
+                                margin-bottom: 0px;}\n \
+                 .timestamp    {font-size: smaller;}\n \
+                 .source       {text-align: left;}\n \
+                 .image        {text-align: center;}\n \
+                 .credit       {text-align: right; \
+                                font-size: smaller;}\n \
+                 .articleBody  {text-align: left;}\n \
+                 .authorId     {text-align: left; \
+                                font-style: italic;}\n '
 
     def get_browser(self):
         br = BasicNewsRecipe.get_browser()
@@ -113,6 +116,8 @@ class NYTimes(BasicNewsRecipe):
         if docEncoding == '' :
             docEncoding = self.encoding
 
+        if self.verbose > 2:
+            self.log( "  document encoding: '%s'" % docEncoding)
         if docEncoding != self.encoding :
             soup = get_the_soup(docEncoding, url_or_raw)
 
@@ -189,7 +194,6 @@ class NYTimes(BasicNewsRecipe):
                 key = self.sections[section]
                 excluded = re.compile('|'.join(self.excludeSectionKeywords))
                 if excluded.search(key) or articles.has_key(key):
-                    if self.verbose : self.log("Skipping section %s" % key)
                     skipThisSection = True
                     break
 
@@ -200,8 +204,7 @@ class NYTimes(BasicNewsRecipe):
                 # Extract the bylines and descriptions
                 if (i.string is not None) and \
                    (i.string.strip() > "") and \
-                   not ('Comment' in str(i.__class__)) :
-
+                   not isinstance(i,Comment):
                     contentString = i.strip().encode('utf-8')
                     if contentString[0:3] == 'By ' :
                         bylines.append(contentString)
@@ -212,8 +215,6 @@ class NYTimes(BasicNewsRecipe):
             articleCount = len(sectionblock.findAll('span'))
             for (i,span) in enumerate(sectionblock.findAll('span')) :
                 a = span.find('a', href=True)
-                #if not a:
-                    #continue
                 url = re.sub(r'\?.*', '', a['href'])
                 url += '?pagewanted=all'
@@ -234,15 +235,13 @@ class NYTimes(BasicNewsRecipe):
                 # Check for duplicates
                 duplicateFound = False
                 if len(articles[feed]) > 1:
-                    #print articles[feed]
                     for article in articles[feed] :
-                        #print "comparing %s\n %s\n" % (url, article['url'])
                         if url == article['url'] :
                             duplicateFound = True
                             break
-                    #print
 
                     if duplicateFound:
+                        # Continue fetching, don't add this article
                         continue
 
                 if not articles.has_key(feed):
@@ -252,33 +251,42 @@ class NYTimes(BasicNewsRecipe):
                                 description=description, author=author, content=''))
 
         ans = self.sort_index_by(ans, {'Top Stories':-1})
-        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
-
+        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
         return ans
 
+    def strip_anchors(self,soup):
+        paras = soup.findAll(True)
+        for para in paras:
+            aTags = para.findAll('a')
+            for a in aTags:
+                if a.img is None:
+                    a.replaceWith(a.renderContents().decode('cp1252','replace'))
+        return soup
+
     def preprocess_html(self, soup):
         refresh = soup.find('meta', {'http-equiv':'refresh'})
         if refresh is None:
-            return soup
+            return self.strip_anchors(soup)
+
         content = refresh.get('content').partition('=')[2]
         raw = self.browser.open('http://www.nytimes.com'+content).read()
-        return BeautifulSoup(raw.decode('cp1252', 'replace'))
+        soup = BeautifulSoup(raw.decode('cp1252', 'replace'))
+        return self.strip_anchors(soup)
 
     def postprocess_html(self,soup, True):
+        # Change class="kicker" to <h3>
         kicker = soup.find(True, {'class':'kicker'})
         if kicker is not None :
             h3Tag = Tag(soup, "h3")
-            h3Tag.insert(0, self.tag_to_string(kicker))
+            h3Tag.insert(0, kicker.contents[0])
             kicker.replaceWith(h3Tag)
 
         # Change captions to italic -1
         for caption in soup.findAll(True, {'class':'caption'}) :
             if caption is not None:
                 emTag = Tag(soup, "em")
-                #emTag['class'] = "caption"
-                #emTag['font-size-adjust'] = "-1"
-                emTag.insert(0, self.tag_to_string(caption))
+                emTag.insert(0, caption.contents[0])
                 hrTag = Tag(soup, 'hr')
                 emTag.insert(1, hrTag)
                 caption.replaceWith(emTag)
@@ -286,10 +294,10 @@ class NYTimes(BasicNewsRecipe):
         # Change <nyt_headline> to <h2 class="headline">
         headline = soup.find("nyt_headline")
         if headline is not None :
-            h2tag = Tag(soup, "h2")
-            h2tag['class'] = "headline"
-            h2tag.insert(0, self.tag_to_string(headline))
-            headline.replaceWith(h2tag)
+            tag = Tag(soup, "h2")
+            tag['class'] = "headline"
+            tag.insert(0, headline.contents[0])
+            soup.h1.replaceWith(tag)
 
         # Change <h1> to <h3> - used in editorial blogs
         masthead = soup.find("h1")
@@ -297,14 +305,34 @@ class NYTimes(BasicNewsRecipe):
             # Nuke the href
             if masthead.a is not None :
                 del(masthead.a['href'])
-            h3tag = Tag(soup, "h3")
-            h3tag.insert(0, self.tag_to_string(masthead))
-            masthead.replaceWith(h3tag)
+            tag = Tag(soup, "h3")
+            tag.insert(0, masthead.contents[0])
+            soup.h1.replaceWith(tag)
 
         # Change <span class="bold"> to <b>
         for subhead in soup.findAll(True, {'class':'bold'}) :
             bTag = Tag(soup, "b")
-            bTag.insert(0, self.tag_to_string(subhead))
+            bTag.insert(0, subhead.contents[0])
             subhead.replaceWith(bTag)
+
+        # Synthesize a section header
+        dsk = soup.find('meta', attrs={'name':'dsk'})
+        if dsk is not None and dsk.has_key('content'):
+            hTag = Tag(soup,'h3')
+            hTag['class'] = 'section'
+            hTag.insert(0,NavigableString(dsk['content']))
+            articleTag = soup.find(True, attrs={'id':'article'})
+            articleTag.insert(0,hTag)
+
+        # Add class="articleBody" to <div> so we can format with CSS
+        divTag = soup.find('div',attrs={'id':'articleBody'})
+        if divTag is not None :
+            divTag['class'] = divTag['id']
+
+        # Add class="authorId" to <div> so we can format with CSS
+        divTag = soup.find('div',attrs={'id':'authorId'})
+        if divTag is not None :
+            divTag['class'] = divTag['id']
 
         return soup
+
diff --git a/src/calibre/web/feeds/recipes/recipe_slate.py b/src/calibre/web/feeds/recipes/recipe_slate.py
index dae94573b0..93c37affd4 100644
--- a/src/calibre/web/feeds/recipes/recipe_slate.py
+++ b/src/calibre/web/feeds/recipes/recipe_slate.py
@@ -3,19 +3,19 @@
 __license__   = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal <kovid@kovidgoyal.net>'
 '''
-Fetches the last 7 days of featured articles from slate.com
+calibre recipe for slate.com
 '''
-import re
+import string, re, sys
+from calibre import strftime
 from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Comment, Tag
 
 class Slate(BasicNewsRecipe):
     # Method variables for customizing downloads
     title = 'Slate'
-    description = 'A daily magazine on the Web, offering analysis and commentary about politics, news and culture.'
-    __author__ = 'GRiker@hotmail.com'
-    language = _('English')
+    description = 'A general-interest publication offering analysis and commentary about politics, news and culture.'
+    __author__ = 'GRiker'
     max_articles_per_feed = 40
     oldest_article = 7.0
     recursions = 0
@@ -26,33 +26,58 @@ class Slate(BasicNewsRecipe):
     feeds = None
     no_stylesheets = True
     encoding = None
+    language = _('English')
+
+    # Method variables for customizing feed parsing
     summary_length = 250
     use_embedded_content = None
 
     # Method variables for pre/post processing of HTML
-    remove_tags = [ dict(name=['link','style']),
-                    dict(id=['toolbox','site_navigation','article_bottom_tools_cntr',
-                             'article_bottom_tools','recommend_tab2','bottom_sponsored_links',
-                             'fray_article_discussion','bizbox_sponsored_links_bottom',
-                             'page_rightcol','top_banner','also_in_slate_bottom','articlefooter',
-                             'article_top_wedge','content-top','page-title',
-                             'block-today039s-business-press-archives','block-blog-roll',
-                             'block-also-in-tbm','block-most-popular-on-tbm','block-the-best-of-tbm',
-                             'service-links-bottom','comments','ft']),
-                    dict(attrs={'class':['fray_article_links','clearing','nav',
-                                         'service-links service-links-stack','yui-b last',
-                                         'read-more-comments']})]
-    extra_css = '.headline {text-align:left;}\n\
-                 .byline {font:monospace; text-align:left; margin-bottom:0pt;}\n\
-                 .dateline {text-align:left; height:0pt;}\n\
-                 .source {align:left;}\n\
-                 .credit {text-align:right;font-size:smaller;}\n'
+    preprocess_regexps = [ (re.compile(r'<p><em>Disclosure: Slate is owned by the Washington Post.*</em></p>',
+                             re.DOTALL|re.IGNORECASE),
+                            lambda match: ''),
+                           (re.compile(r'<p><em>Join the discussion about this story on.*</em></p>',
+                             re.DOTALL|re.IGNORECASE),
+                            lambda match: '') ]
+    match_regexps = []
+
+    # The second entry is for 'Big Money', which comes from a different site and uses different markup
+    keep_only_tags = [dict(attrs={ 'id':['article_top', 'article_body']}),
+                      dict(attrs={ 'id':['content']}) ]
+
+    # The second entry is for 'Big Money', which comes from a different site and uses different markup
+    remove_tags = [dict(attrs={ 'id':['toolbox','recommend_tab','insider_ad_wrapper',
+                                      'article_bottom_tools_cntr','fray_article_discussion',
+                                      'fray_article_links','bottom_sponsored_links','author_bio',
+                                      'bizbox_links_bottom','ris_links_wrapper','BOXXLE']}),
+                   dict(attrs={ 'id':['content-top','service-links-bottom','hed']}) ]
+
+    excludedDescriptionKeywords = ['Slate V','Twitter feed','podcast']
+    excludedTitleKeywords = ['Gabfest','Slate V','on Twitter']
+    excludedAuthorKeywords = []
+    excludedContentKeywords = ['http://twitter.com/Slate']
+
+    extra_css = '.headline      {text-align:left;}\n\
+                 .byline        {font-family: monospace; \
+                                 text-align: left;\
+                                 margin-bottom: 0px;}\n\
+                 .dateline      {text-align: left; \
+                                 font-size: smaller;\
+                                 height: 0pt;}\n\
+                 .imagewrapper  {text-align: center;}\n\
+                 .source        {text-align: left;}\n\
+                 .credit        {text-align: right;\
+                                 font-size: smaller;}\n\
+                 .article_body  {text-align: left;}\n'
+
+    # Local variables to extend class
     baseURL = 'http://slate.com'
     section_dates = []
-
+
+    # class extension methods
     def tag_to_strings(self, tag):
         if not tag:
             return ''
@@ -68,16 +93,16 @@ class Slate(BasicNewsRecipe):
                 strings.append(res)
         return strings
 
+
     def extract_sections(self):
         soup = self.index_to_soup( self.baseURL )
-        soup_top_stories = soup.find(True, attrs={'class':'tap2_topic entry-content'})
         soup = soup.find(True, attrs={'id':'toc_links_container'})
 
         todays_section = soup.find(True, attrs={'class':'todaydateline'})
-        self.section_dates.append(self.tag_to_string(todays_section,use_alt=False))
-        self.section_dates.append(self.tag_to_string(todays_section,use_alt=False))
-
+        self.section_dates.append(self.tag_to_string(todays_section,use_alt=False))
+        self.section_dates.append(self.tag_to_string(todays_section,use_alt=False))
+
         older_section_dates = soup.findAll(True, attrs={'class':'maindateline'})
         for older_section in older_section_dates :
             self.section_dates.append(self.tag_to_string(older_section,use_alt=False))
@@ -90,19 +115,20 @@ class Slate(BasicNewsRecipe):
         sections = []
         for section in section_lists :
             sections.append(section)
-
         return sections
-
+
 
     def extract_section_articles(self, sections_html) :
+        # Find the containers with section content
         soup = self.index_to_soup(str(sections_html))
         sections = soup.findAll('ul')
+
         articles = {}
         key = None
         ans = []
-
+
         for (i,section) in enumerate(sections) :
-
+
             # Get the section name
             if section.has_key('id') :
                 key = self.section_dates[i]
@@ -110,14 +136,10 @@ class Slate(BasicNewsRecipe):
                 ans.append(key)
             else :
                 continue
-
+
             # Get the section article_list
             article_list = section.findAll('li')
-
-            excludedDescriptionKeywords = ['Slate V','Twitter feed','podcast']
-            excludedTitleKeywords = ['Gabfest','Slate V']
-            excludedAuthorKeywords = ['Prudence']
-
+
             # Extract the article attributes
             for article in article_list :
                 bylines = self.tag_to_strings(article)
@@ -128,10 +150,10 @@ class Slate(BasicNewsRecipe):
                 author = None
                 description = None
                 pubdate = None
-
+
                 if len(bylines) == 2 and self.tag_to_string(article).find("Today's Papers") > 0 :
                     description = "A summary of what's in the major U.S. newspapers."
-
+
                 if len(bylines) == 3 :
                     author = bylines[2].strip()
                     author = re.sub('[\r][\n][\t][\t\t]','', author)
                     author = re.sub(',','', author)
                     if bylines[1] is not None :
                         description = bylines[1]
                         full_byline = self.tag_to_string(article)
                         if full_byline.find('major U.S. newspapers') > 0 :
                             description = "A summary of what's in the major U.S. newspapers."
-
                 if len(bylines) > 3 and author is not None:
                     author += " | "
                     for (i,substring) in enumerate(bylines[3:]) :
                         #substring = substring.replace('\n','')
                         author += substring.strip()
                         if i < len(bylines[3:]) :
                             author += " | "
 
                 # Skip articles whose descriptions contain excluded keywords
-                if description is not None :
-                    excluded = re.compile('|'.join(excludedDescriptionKeywords))
+                if description is not None and len(self.excludedDescriptionKeywords):
+                    excluded = re.compile('|'.join(self.excludedDescriptionKeywords))
                     found_excluded = excluded.search(description)
                     if found_excluded :
+                        if self.verbose : self.log("  >>> skipping %s (description keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0)))
                         continue
 
                 # Skip articles whose title contain excluded keywords
-                if full_title is not None :
-                    excluded = re.compile('|'.join(excludedTitleKeywords))
+                if full_title is not None and len(self.excludedTitleKeywords):
+                    excluded = re.compile('|'.join(self.excludedTitleKeywords))
                     #self.log("evaluating full_title: %s" % full_title)
                     found_excluded = excluded.search(full_title)
                     if found_excluded :
+                        if self.verbose : self.log("  >>> skipping %s (title keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0)))
                         continue
 
                 # Skip articles whose author contain excluded keywords
-                if author is not None :
-                    excluded = re.compile('|'.join(excludedAuthorKeywords))
+                if author is not None and len(self.excludedAuthorKeywords):
+                    excluded = re.compile('|'.join(self.excludedAuthorKeywords))
                     found_excluded = excluded.search(author)
                     if found_excluded :
+                        if self.verbose : self.log("  >>> skipping %s (author keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0)))
                         continue
 
-                skip_this_article = False
+                skip_this_article = False
                 # Check to make sure we're not adding a duplicate
                 for article in articles[key] :
                     if article['url'] == url :
                         skip_this_article = True
                         break
-
+
                 if skip_this_article :
                     continue
 
-                # Build the dictionary entry for this article
+                # Build the dictionary entry for this article
                 feed = key
                 if not articles.has_key(feed) :
                     articles[feed] = []
@@ -194,28 +218,34 @@ class Slate(BasicNewsRecipe):
                     if article['description'] is not None :
                         if article['description'].find('newspapers') > 0 :
                             articles[feed].insert(0,articles[feed].pop(i))
-
+
         ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
-        ans = self.remove_duplicates(ans)
+        ans = self.remove_duplicates(ans)
         return ans
-
+
     def flatten_document(self, ans):
         flat_articles = []
         for (i,section) in enumerate(ans) :
+            #self.log("flattening section %s: " % section[0])
             for article in section[1] :
+                #self.log("moving %s to flat_articles[]" % article['title'])
                 flat_articles.append(article)
         flat_section = ['All Articles', flat_articles]
-        flat_ans = [flat_section]
-
+        flat_ans = [flat_section]
         return flat_ans
-
+
     def remove_duplicates(self, ans):
+        # Return a stripped ans
        for (i,section) in enumerate(ans) :
+            #self.log("section %s: " % section[0])
            for article in section[1] :
+                #self.log("\t%s" % article['title'])
+                #self.log("\looking for %s" % article['url'])
                for (j,subsequent_section) in enumerate(ans[i+1:]) :
                    for (k,subsequent_article) in enumerate(subsequent_section[1]) :
                        if article['url'] == subsequent_article['url'] :
+                            #self.log( "removing %s (%s) from %s" % (subsequent_article['title'], subsequent_article['url'], subsequent_section[0]) )
                            del subsequent_section[1][k]
        return ans
 
@@ -226,21 +256,80 @@ class Slate(BasicNewsRecipe):
     def parse_index(self) :
         sections = self.extract_sections()
         section_list = self.extract_section_articles(sections)
-        section_list = self.flatten_document(section_list)
+        section_list = self.flatten_document(section_list)
         return section_list
+
+    def get_browser(self) :
+        return BasicNewsRecipe.get_browser()
+
+    def stripAnchors(self,soup):
+        body = soup.find('div',attrs={'id':['article_body','content']})
+        if body is not None:
+            paras = body.findAll('p')
+            if paras is not None:
+                for para in paras:
+                    aTags = para.findAll('a')
+                    if aTags is not None:
+                        for a in aTags:
+                            if a.img is None:
+                                #print repr(a.renderContents())
+                                a.replaceWith(a.renderContents().decode('utf-8','replace'))
+        return soup
+
+    def preprocess_html(self, soup) :
+
+        # Remove 'grayPlus4.png' images
+        imgs = soup.findAll('img')
+        if imgs is not None:
+            for img in imgs:
+                if re.search("grayPlus4.png",str(img)):
+                    img.extract()
+
+        # Delete article based upon content keywords
+        if len(self.excludedContentKeywords):
+            excluded = re.compile('|'.join(self.excludedContentKeywords))
+            found_excluded = excluded.search(str(soup))
+            if found_excluded :
+                return None
+
+        # Articles from www.thebigmoney.com use different tagging for byline, dateline and body
+        head = soup.find('head')
+        if head.link is not None and re.search('www\.thebigmoney\.com', str(head)):
+            byline = soup.find('div',attrs={'id':'byline'})
+            if byline is not None:
+                byline['class'] = byline['id']
+
+            dateline = soup.find('div',attrs={'id':'dateline'})
+            if dateline is not None:
+                dateline['class'] = dateline['id']
+
+            body = soup.find('div',attrs={'id':'content'})
+            if body is not None:
+                body['class'] = 'article_body'
+
+            # Synthesize a department kicker
+            h3Tag = Tag(soup,'h3')
+            emTag = Tag(soup,'em')
+            emTag.insert(0,NavigableString("the big money: Today's business press"))
+            h3Tag.insert(0,emTag)
+            soup.body.insert(0,h3Tag)
+
+        # Strip anchors from HTML
+        return self.stripAnchors(soup)
 
     def postprocess_html(self, soup, first_fetch) :
+        # Fix up dept_kicker as <h3><em>
-        dept_kicker = soup.find(True, attrs={'class':'department_kicker'})
+        dept_kicker = soup.find('div', attrs={'class':'department_kicker'})
         if dept_kicker is not None :
             kicker_strings = self.tag_to_strings(dept_kicker)
-            kicker = kicker_strings[2] + kicker_strings[3]
-            kicker = re.sub('.','',kicker)
+            #kicker = kicker_strings[2] + kicker_strings[3]
+            kicker = ''.join(kicker_strings[2:])
+            kicker = re.sub('\.','',kicker)
             h3Tag = Tag(soup, "h3")
             emTag = Tag(soup, "em")
+            emTag.insert(0,NavigableString(kicker))
             h3Tag.insert(0, emTag)
-            emTag.insert(0,kicker)
             dept_kicker.replaceWith(h3Tag)
 
         # Change <h1> to <h2 class="headline">
@@ -258,17 +347,19 @@ class Slate(BasicNewsRecipe):
             headline.replaceWith(h2tag)
 
         # Fix up the concatenated byline and dateline
-        byline = soup.find(True,attrs={'class':'byline'})
+        byline = soup.find(True,attrs={'class':'byline'})
         if byline is not None :
             bylineTag = Tag(soup,'div')
             bylineTag['class'] = 'byline'
+            #bylineTag['height'] = '0em'
             bylineTag.insert(0,self.tag_to_string(byline))
             byline.replaceWith(bylineTag)
-
+
         dateline = soup.find(True, attrs={'class':'dateline'})
         if dateline is not None :
             datelineTag = Tag(soup, 'div')
             datelineTag['class'] = 'dateline'
+            #datelineTag['margin-top'] = '0em'
             datelineTag.insert(0,self.tag_to_string(dateline))
             dateline.replaceWith(datelineTag)
 
@@ -280,51 +371,56 @@ class Slate(BasicNewsRecipe):
             hrTag = Tag(soup, 'hr')
             emTag.insert(1, hrTag)
             caption.replaceWith(emTag)
-
+
+        # Fix photos
+        for photo in soup.findAll('span',attrs={'class':'imagewrapper'}):
+            if photo.a is not None and photo.a.img is not None:
+                divTag = Tag(soup,'div')
+                divTag['class'] ='imagewrapper'
+                divTag.insert(0,photo.a.img)
+                photo.replaceWith(divTag)
+
         return soup
-
+
     def postprocess_book(self, oeb, opts, log) :
 
         def extract_byline(href) :
-            soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
-            byline = soup.find(True,attrs={'class':'byline'})
-            if byline is not None:
+            soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
+            byline = soup.find(True,attrs={'class':'byline'})
+            if byline is not None:
                 return self.tag_to_string(byline,use_alt=False)
             else :
-                return None
-
+                return None
+
         def extract_description(href) :
             soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
             paragraphs = soup.findAll('p')
             for p in paragraphs :
                 if self.tag_to_string(p,use_alt=False).startswith('By ') or \
                    self.tag_to_string(p,use_alt=False).startswith('Posted '):
+                    continue
+                comment = p.find(text=lambda text:isinstance(text, Comment))
+                if comment is not None:
                     continue
-
-                images = p.findAll(True, attrs={'class':'imagewrapper'})
-                for image in images :
-                    image.extract()
-                return self.tag_to_string(p,use_alt=False)[:200] + '...'
-
+                else:
+                    return self.tag_to_string(p,use_alt=False)[:self.summary_length] + '...'
+
             return None
-
+
+        # Method entry point here
+        # Single section toc looks different than multi-section tocs
         if oeb.toc.depth() == 2 :
             for article in oeb.toc :
                 if article.author is None :
                     article.author = extract_byline(article.href)
-
                 if article.description is None :
                     article.description = extract_description(article.href)
-
-
         elif oeb.toc.depth() == 3 :
             for section in oeb.toc :
                 for article in section :
                     if article.author is None :
                         article.author = extract_byline(article.href)
-
                     if article.description is None :
                         article.description = extract_description(article.href)
-
-
-
+
+
\ No newline at end of file
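
Note: the anchor-unwrapping pass this patch adds to both recipes (strip_anchors in the NYT recipe, stripAnchors in the Slate recipe) follows one pattern: any <a> that does not wrap an image is replaced by its rendered contents, while image-wrapping anchors are left alone. A minimal standalone sketch of that pattern, assuming the BeautifulSoup 3.x API (which is what calibre.ebooks.BeautifulSoup wraps) and a made-up HTML fragment:

    # Python 2 / BeautifulSoup 3.x sketch; the sample markup is hypothetical
    from BeautifulSoup import BeautifulSoup

    html = '<p>Read <a href="/x">this story</a> and <a href="/y"><img src="i.png" /></a>.</p>'
    soup = BeautifulSoup(html)
    for a in soup.findAll('a'):
        if a.img is None:  # unwrap text-only links, keep anchors that wrap images
            a.replaceWith(a.renderContents().decode('utf-8', 'replace'))
    print soup
    # -> <p>Read this story and <a href="/y"><img src="i.png" /></a>.</p>

The same loop works with either codec seen above ('utf-8' for Slate, 'cp1252' for the NYT pages); only the decode argument changes.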