From ec79457d453e914a85f715c85c6916db2af5a3dc Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Thu, 28 Feb 2008 04:21:20 +0000
Subject: [PATCH] Improved NY Times profile that corresponds to the daily paper

---
 .../ebooks/lrf/web/profiles/nytimes.py        | 54 ++++++++++++++-----
 1 file changed, 41 insertions(+), 13 deletions(-)

diff --git a/src/libprs500/ebooks/lrf/web/profiles/nytimes.py b/src/libprs500/ebooks/lrf/web/profiles/nytimes.py
index 70bc308a81..7b194d5595 100644
--- a/src/libprs500/ebooks/lrf/web/profiles/nytimes.py
+++ b/src/libprs500/ebooks/lrf/web/profiles/nytimes.py
@@ -15,7 +15,7 @@
 '''
 Profile to download the New York Times
 '''
-import re
+import re, time
 
 from libprs500.ebooks.lrf.web.profiles import DefaultProfile
 from libprs500.ebooks.BeautifulSoup import BeautifulSoup
@@ -26,6 +26,10 @@ class NYTimes(DefaultProfile):
     timefmt = ' [%a, %d %b, %Y]'
     needs_subscription = True
     max_recursions = 2
+    recommended_frequency = 1
+    # Same codec that parse_feeds uses to decode the index page.
+    encoding = 'cp1252'
+    html2lrf_options = ['--base-font-size=0']
     
     preprocess_regexps = \
             [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in 
@@ -49,19 +53,43 @@ class NYTimes(DefaultProfile):
             br.submit()
         return br
     
-    def get_feeds(self):
-        src = self.browser.open('http://www.nytimes.com/services/xml/rss/index.html').read()
-        soup = BeautifulSoup(src[src.index('<html'):])
-        feeds = []
-        for link in soup.findAll('link', attrs={'type':'application/rss+xml'}):
-            if link['title'] not in ['NYTimes.com Homepage', 'Obituaries', 'Pogue\'s Posts', 
-                                     'Dining & Wine', 'Home & Garden', 'Multimedia',
-                                     'Most E-mailed Articles', 
-                                     'Automobiles', 'Fashion & Style', 'Television News',
-                                     'Education']:
-                feeds.append((link['title'], link['href'].replace('graphics8', 'www')))            
+    def parse_feeds(self):
+        '''
+        Scrape the todayspaper index page and group its stories by section.
+
+        Returns a dict mapping each section title (or "Uncategorized") to a list
+        of article dicts with the keys title, url, date, description, content.
+        '''
+        src = self.browser.open('http://www.nytimes.com/pages/todayspaper/index.html').read().decode('cp1252')
+        soup = BeautifulSoup(src)
         
-        return feeds
+        def feed_title(div):
+            # Only the tag's own text nodes; text inside child tags is skipped.
+            return ''.join(div.findAll(text=True, recursive=False)).strip()
+
+        articles = {}
+        key = None
+        # Computed once so every article of this run carries the same date.
+        pubdate = time.strftime('%a, %d %b', time.localtime())
+        for div in soup.findAll(True,
+            attrs={'class':['section-headline', 'story', 'story headline']}):
+            if div['class'] == 'section-headline':
+                key = feed_title(div)
+                # A repeated title must not wipe the articles already filed.
+                articles.setdefault(key, [])
+            elif div['class'] in ['story', 'story headline']:
+                a = div.find('a', href=True)
+                if not a:
+                    continue
+                url = self.print_version(a['href'])
+                title = self.tag_to_string(a, use_alt=True).strip()
+                summary = div.find(True, attrs={'class':'summary'})
+                description = self.tag_to_string(summary, use_alt=False) if summary else ''
+                # Stories seen before any section headline go to a catch-all.
+                feed = key if key is not None else 'Uncategorized'
+                articles.setdefault(feed, []).append(dict(title=title, url=url,
+                    date=pubdate, description=description, content=''))
+        return articles
     
     def print_version(self, url):
         return url + '?&pagewanted=print'