Improved NY Times profile that corresponds to the daily paper

2025-06-23 15:30:45 -04:00 · 2008-02-28 04:21:20 +00:00 · 2008-02-28 04:21:20 +00:00 · ec79457d45
commit ec79457d45
parent 1ddcfb5844
1 changed file with 41 additions and 13 deletions
--- a/src/libprs500/ebooks/lrf/web/profiles/nytimes.py
+++ b/src/libprs500/ebooks/lrf/web/profiles/nytimes.py
@@ -15,7 +15,7 @@
 '''
 Profile to download the New York Times
 '''
-import re
+import re, time
 from libprs500.ebooks.lrf.web.profiles import DefaultProfile
 from libprs500.ebooks.BeautifulSoup import BeautifulSoup
@@ -26,6 +26,10 @@ class NYTimes(DefaultProfile):
    timefmt = ' [%a, %d %b, %Y]'
    needs_subscription = True
    max_recursions = 2
    recommended_frequency = 1
    encoding = 'cp1252'
    html2lrf_options = ['--base-font-size=0']
    preprocess_regexps = \
            [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in 
@@ -49,19 +53,43 @@ class NYTimes(DefaultProfile):
            br.submit()
        return br
-    def get_feeds(self):
+    def parse_feeds(self):
-        src = self.browser.open('http://www.nytimes.com/services/xml/rss/index.html').read()
+        src = self.browser.open('http://www.nytimes.com/pages/todayspaper/index.html').read().decode('cp1252')
-        soup = BeautifulSoup(src[src.index('<html'):])
+        soup = BeautifulSoup(src)
        feeds = []
        for link in soup.findAll('link', attrs={'type':'application/rss+xml'}):
            if link['title'] not in ['NYTimes.com Homepage', 'Obituaries', 'Pogue\'s Posts', 
                                     'Dining & Wine', 'Home & Garden', 'Multimedia',
                                     'Most E-mailed Articles', 
                                     'Automobiles', 'Fashion & Style', 'Television News',
                                     'Education']:
                feeds.append((link['title'], link['href'].replace('graphics8', 'www')))            
-        return feeds
+        def feed_title(div):
            return ''.join(div.findAll(text=True, recursive=False)).strip()
        articles = {}
        key = None
        for div in soup.findAll(True, 
            attrs={'class':['section-headline', 'story', 'story headline']}):
            if div['class'] == 'section-headline':
                key = feed_title(div)
                articles[key] = []
            elif div['class'] in ['story', 'story headline']:
                a = div.find('a', href=True)
                if not a:
                    continue
                url = self.print_version(a['href'])
                title = self.tag_to_string(a, use_alt=True).strip()
                description = ''
                pubdate = time.strftime('%a, %d %b', time.localtime())
                summary = div.find(True, attrs={'class':'summary'})
                if summary:
                    description = self.tag_to_string(summary, use_alt=False)
                feed = key if key is not None else 'Uncategorized'
                if not articles.has_key(feed):
                    articles[feed] = []
                articles[feed].append(
                    dict(title=title, url=url, date=pubdate, description=description,
                         content=''))
        return articles
    def print_version(self, url):
        return url + '?&pagewanted=print'