Improved NY Times profile that corresponds to the daily paper

Kovid Goyal 2008-02-28 04:21:20 +00:00
parent 1ddcfb5844
commit ec79457d45


@@ -15,7 +15,7 @@
 '''
 Profile to download the New York Times
 '''
-import re
+import re, time
 
 from libprs500.ebooks.lrf.web.profiles import DefaultProfile
 from libprs500.ebooks.BeautifulSoup import BeautifulSoup
@@ -26,6 +26,10 @@ class NYTimes(DefaultProfile):
     timefmt = ' [%a, %d %b, %Y]'
     needs_subscription = True
     max_recursions = 2
+    recommended_frequency = 1
+    encoding = 'cp1252'
+    
+    html2lrf_options = ['--base-font-size=0']
     
     preprocess_regexps = \
             [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
@@ -49,19 +53,43 @@ class NYTimes(DefaultProfile):
             br.submit()
         return br
     
-    def get_feeds(self):
-        src = self.browser.open('http://www.nytimes.com/services/xml/rss/index.html').read()
-        soup = BeautifulSoup(src[src.index('<html'):])
-        
-        feeds = []
-        for link in soup.findAll('link', attrs={'type':'application/rss+xml'}):
-            if link['title'] not in ['NYTimes.com Homepage', 'Obituaries', 'Pogue\'s Posts',
-                                     'Dining & Wine', 'Home & Garden', 'Multimedia',
-                                     'Most E-mailed Articles',
-                                     'Automobiles', 'Fashion & Style', 'Television News',
-                                     'Education']:
-                feeds.append((link['title'], link['href'].replace('graphics8', 'www')))
-        
-        return feeds
+    def parse_feeds(self):
+        src = self.browser.open('http://www.nytimes.com/pages/todayspaper/index.html').read().decode('cp1252')
+        soup = BeautifulSoup(src)
+        
+        def feed_title(div):
+            return ''.join(div.findAll(text=True, recursive=False)).strip()
+        
+        articles = {}
+        key = None
+        for div in soup.findAll(True,
+                attrs={'class':['section-headline', 'story', 'story headline']}):
+            
+            if div['class'] == 'section-headline':
+                key = feed_title(div)
+                articles[key] = []
+            
+            elif div['class'] in ['story', 'story headline']:
+                a = div.find('a', href=True)
+                if not a:
+                    continue
+                url = self.print_version(a['href'])
+                title = self.tag_to_string(a, use_alt=True).strip()
+                description = ''
+                pubdate = time.strftime('%a, %d %b', time.localtime())
+                summary = div.find(True, attrs={'class':'summary'})
+                if summary:
+                    description = self.tag_to_string(summary, use_alt=False)
+                
+                feed = key if key is not None else 'Uncategorized'
+                if not articles.has_key(feed):
+                    articles[feed] = []
+                articles[feed].append(
+                    dict(title=title, url=url, date=pubdate, description=description,
+                         content=''))
+        
+        return articles
     
     def print_version(self, url):
         return url + '?&pagewanted=print'
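
For context, the sketch below (not part of the commit) illustrates the class-matching pattern the new parse_feeds() relies on: 'section-headline' divs become section keys and 'story'/'story headline' divs become articles under the current key, with the print-view query string appended as in print_version(). It assumes the BeautifulSoup 3 API the profile already uses; the standalone import, the inline HTML snippet and the variable names are illustrative assumptions, not code from the repository.

# Minimal sketch (Python 2, BeautifulSoup 3) of the scraping pattern in parse_feeds().
# The HTML is a made-up stand-in for the "Today's Paper" markup.
from BeautifulSoup import BeautifulSoup

html = '''<div class="section-headline">World</div>
<div class="story"><a href="/2008/02/28/world/example.html">Example headline</a>
<div class="summary">One-line summary of the story.</div></div>'''

soup = BeautifulSoup(html)
sections = {}
current = None
for div in soup.findAll(True, attrs={'class': ['section-headline', 'story', 'story headline']}):
    if div['class'] == 'section-headline':
        # Section headers carry their title as direct text nodes
        current = ''.join(div.findAll(text=True, recursive=False)).strip()
        sections[current] = []
    else:
        a = div.find('a', href=True)
        if a is not None:
            # Mirror print_version(): append the print-view query string
            sections[current].append(a['href'] + '?&pagewanted=print')

print sections   # prints the mapping {section title: [print-version URLs]}

On the real page a story div could in principle appear before any section headline, which is why the committed code falls back to an 'Uncategorized' bucket; the sketch omits that guard for brevity.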