From ec79457d453e914a85f715c85c6916db2af5a3dc Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 28 Feb 2008 04:21:20 +0000 Subject: [PATCH] Improved NY Times profile that corresponds to the daily paper --- .../ebooks/lrf/web/profiles/nytimes.py | 54 ++++++++++++++----- 1 file changed, 41 insertions(+), 13 deletions(-) diff --git a/src/libprs500/ebooks/lrf/web/profiles/nytimes.py b/src/libprs500/ebooks/lrf/web/profiles/nytimes.py index 70bc308a81..7b194d5595 100644 --- a/src/libprs500/ebooks/lrf/web/profiles/nytimes.py +++ b/src/libprs500/ebooks/lrf/web/profiles/nytimes.py @@ -15,7 +15,7 @@ ''' Profile to download the New York Times ''' -import re +import re, time from libprs500.ebooks.lrf.web.profiles import DefaultProfile from libprs500.ebooks.BeautifulSoup import BeautifulSoup @@ -26,6 +26,10 @@ class NYTimes(DefaultProfile): timefmt = ' [%a, %d %b, %Y]' needs_subscription = True max_recursions = 2 + recommended_frequency = 1 + encoding = 'cp1252' + html2lrf_options = ['--base-font-size=0'] + preprocess_regexps = \ [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in @@ -49,19 +53,43 @@ class NYTimes(DefaultProfile): br.submit() return br - def get_feeds(self): - src = self.browser.open('http://www.nytimes.com/services/xml/rss/index.html').read() - soup = BeautifulSoup(src[src.index('