From 0c95bc3d6df80afff85ac8ca0d20bdbf6873f0e5 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Fri, 17 Aug 2007 02:41:37 +0000
Subject: [PATCH] Improved nytimes profile.

---
 src/libprs500/ebooks/lrf/web/convert_from.py |  14 +-
 src/libprs500/ebooks/lrf/web/nytimes.py      | 146 +++++++++++++++++++
 src/libprs500/ebooks/lrf/web/profiles.py     |  30 ++--
 3 files changed, 171 insertions(+), 19 deletions(-)
 create mode 100644 src/libprs500/ebooks/lrf/web/nytimes.py

diff --git a/src/libprs500/ebooks/lrf/web/convert_from.py b/src/libprs500/ebooks/lrf/web/convert_from.py
index 59a90f24e3..29622de94e 100644
--- a/src/libprs500/ebooks/lrf/web/convert_from.py
+++ b/src/libprs500/ebooks/lrf/web/convert_from.py
@@ -39,6 +39,10 @@ def option_parser():
     parser.add_option('-u', '--url', dest='url', default=None,
                       help='The URL to download. You only need to specify this if you are not specifying a website_profile.')
+    parser.add_option('--username', dest='username', default=None,
+                      help='Specify the username to be used while downloading. Only used if the profile supports it.')
+    parser.add_option('--password', dest='password', default=None,
+                      help='Specify the password to be used while downloading. Only used if the profile supports it.')
     parser.add_option('--timeout', help='Timeout in seconds to wait for a response from the server. Default: %default s',
                       default=None, type='int', dest='timeout')
     parser.add_option('-r', '--max-recursions', help='Maximum number of levels to recurse i.e. depth of links to follow. Default %default',
@@ -64,7 +68,7 @@ def fetch_website(options, logger):
     return fetcher.start_fetch(options.url), tdir

 def create_lrf(htmlfile, options, logger):
-    if not options.author:
+    if not options.author or options.author.lower() == 'unknown':
         options.author = __appname__
     options.header = True
     if options.output:
@@ -83,9 +87,12 @@ def process_profile(args, options, logger=None):
         if not profiles.has_key(args[1]):
             raise CommandLineError('Unknown profile: %s\nValid profiles: %s'%(args[1], profiles.keys()))
     profile = profiles[args[1]] if len(args) == 2 else profiles['default']
-
+    profile['username'] = options.username
+    profile['password'] = options.password
     if profile.has_key('initialize'):
         profile['initialize'](profile)
+    if profile.has_key('browser'):
+        options.browser = profile['browser']

     for opt in ('url', 'timeout', 'max_recursions', 'max_files', 'delay', 'no_stylesheets'):
         val = getattr(options, opt)
@@ -104,12 +111,15 @@ def process_profile(args, options, logger=None):
         options.match_regexps += profile['match_regexps']
     options.preprocess_regexps = profile['preprocess_regexps']
     options.filter_regexps += profile['filter_regexps']
+    if len(args) == 2 and args[1] != 'default':
+        options.anchor_ids = False

     htmlfile, tdir = fetch_website(options, logger)
     create_lrf(htmlfile, options, logger)
     if profile.has_key('finalize'):
         profile['finalize'](profile)
     shutil.rmtree(tdir)
+

 def main(args=sys.argv, logger=None):
     parser = option_parser()
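The new --username/--password switches only take effect for profiles that know what to do with them: process_profile() copies both values into the profile dict before invoking its 'initialize' hook, and afterwards copies a profile-supplied 'browser' into options.browser for the fetcher. The following is a minimal sketch of a third-party profile using that hook; the my_initialize name, the example.com URLs and the form field names are hypothetical and not part of this patch, while the mechanize-style calls mirror the ones nytimes.py makes below.

    # Illustrative only: a profile that consumes the new 'username'/'password'
    # keys and hands a logged-in browser back to the fetcher via 'browser'.
    from libprs500 import browser

    MY_LOGIN = 'http://example.com/login'        # hypothetical login page

    def my_initialize(profile):
        br = browser()
        if profile['username'] and profile['password']:
            br.open(MY_LOGIN)
            br.select_form(nr=0)                 # assume the login form is the first form on the page
            br['user'] = profile['username']     # form field names are site specific
            br['pass'] = profile['password']
            br.submit()
        profile['browser'] = br                  # copied to options.browser by process_profile()
        profile['url'] = 'http://example.com/'   # starting point for the recursive fetch

Handing the browser back through the profile means any cookies acquired during login survive into the actual download, which is what the nytimes profile below relies on.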
diff --git a/src/libprs500/ebooks/lrf/web/nytimes.py b/src/libprs500/ebooks/lrf/web/nytimes.py
new file mode 100644
index 0000000000..d7602405e2
--- /dev/null
+++ b/src/libprs500/ebooks/lrf/web/nytimes.py
@@ -0,0 +1,146 @@
+## Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2 of the License, or
+## (at your option) any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License along
+## with this program; if not, write to the Free Software Foundation, Inc.,
+## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+'''New York Times from RSS feeds.'''
+import time, tempfile, os, shutil, calendar, operator
+
+from libprs500 import __appname__, iswindows, browser
+from libprs500.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup
+
+RSS = 'http://www.nytimes.com/services/xml/rss/index.html'
+LOGIN = 'http://www.nytimes.com/auth/login'
+
+def get_feeds(browser):
+    src = browser.open(RSS).read()
+    soup = BeautifulSoup(src[src.index('<html'):])
+
+def build_index(articles, dir):
+
+    def build_sub_index(title, items):
+        ilist = ''
+        li = u'<li><a href="%(url)s">%(title)s</a> [%(date)s]<br/>\n'+\
+             u'%(description)s</li>\n'
+        for item in items:
+            ilist += li%item
+        return u'''\
+        <html>
+        <body>
+        <h2>%(title)s</h2>
+        <ul>
+        %(items)s
+        </ul>
+        </body>
+        </html>
+        '''%dict(title=title, items=ilist.rstrip())
+
+    cnum = 0
+    clist = ''
+    categories = articles.keys()
+    categories.sort()
+    for category in categories:
+        cnum += 1
+        cfile = os.path.join(dir, 'category'+str(cnum)+'.html')
+        prefix = 'file:' if iswindows else ''
+        clist += u'<li><a href="%s">%s</a></li>\n'%(prefix+cfile, category)
+        src = build_sub_index(category, articles[category])
+        open(cfile, 'wb').write(src.encode('utf-8'))
+
+    src = '''\
+    <html>
+    <body>
+    <h1>The New York Times</h1>
+    %(date)s
+    <ul>
+    %(categories)s
+    </ul>
+    </body>
+    </html>
+    '''%dict(date=time.strftime('%a, %d %B, %Y', time.localtime()), categories=clist)
+
+    index = os.path.join(dir, 'index.html')
+    open(index, 'wb').write(src.encode('utf-8'))
+    return index
+
+
+def initialize(profile):
+    profile['temp dir'] = tempfile.mkdtemp(prefix=__appname__+'_')
+    profile['browser'] = login(profile)
+    feeds = get_feeds(profile['browser'])
+    articles = parse_feeds(feeds, profile['browser'])
+    index = build_index(articles, profile['temp dir'])
+    profile['url'] = 'file:'+ ('' if iswindows else '//') + index
+
+
+def finalize(profile):
+    shutil.rmtree(profile['temp dir'])
+
+
+def login(profile):
+    br = browser()
+    if profile['username'] and profile['password']:
+        br.open(LOGIN)
+        br.select_form(name='login')
+        br['USERID'] = profile['username']
+        br['PASSWORD'] = profile['password']
+        br.submit()
+    return br
+
+
+if __name__ == '__main__':
+    feeds = get_feeds()
+    articles = parse_feeds(feeds)
+    print articles
+
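initialize() above wires the pieces together: login() returns a (possibly authenticated) browser, get_feeds() lists the section feeds from the RSS index page, parse_feeds() turns each feed into article dictionaries, and build_index() writes the local HTML pages that the fetcher is then pointed at via profile['url']. For reference, here is a rough sketch of the contract get_feeds() and parse_feeds() have to satisfy for build_index()/build_sub_index() to work: (title, url) pairs in, and a mapping from section title to dicts carrying the 'title', 'url', 'date' and 'description' keys out. The sketch_* helpers below are illustrative assumptions, not the code added by this patch; the real feed filtering, article cap and error handling may differ.

    # A sketch only: the data contract between the feed helpers and build_index().
    from libprs500.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup

    RSS = 'http://www.nytimes.com/services/xml/rss/index.html'

    def sketch_get_feeds(br):
        src = br.open(RSS).read()
        soup = BeautifulSoup(src[src.index('<html'):])
        # every <link type="application/rss+xml"> on the index page is a section feed
        return [(link['title'], link['href']) for link in
                soup.findAll('link', attrs={'type': 'application/rss+xml'})]

    def sketch_parse_feeds(feeds, br, max_articles_per_feed=10):
        articles = {}
        for title, url in feeds:
            articles[title] = []
            soup = BeautifulStoneSoup(br.open(url).read())
            for item in soup.findAll('item'):
                try:
                    desc = item.find('description')
                    articles[title].append({
                        'title'       : item.find('title').string,
                        'url'         : item.find('guid').string,
                        'date'        : item.find('pubdate').string,
                        'description' : desc.string if desc and desc.string else '',
                        })
                except AttributeError:           # malformed <item>, skip it
                    continue
            articles[title] = articles[title][:max_articles_per_feed]
        return articles

Driven with the browser that login() returns, the resulting mapping plugs straight into build_index(); capping each section keeps the generated sub-indices, and therefore the LRF, small.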

diff --git a/src/libprs500/ebooks/lrf/web/profiles.py b/src/libprs500/ebooks/lrf/web/profiles.py
index 8ca0a228d6..b076858059 100644
--- a/src/libprs500/ebooks/lrf/web/profiles.py
+++ b/src/libprs500/ebooks/lrf/web/profiles.py
@@ -17,6 +17,9 @@ import time, re
 from libprs500.ebooks.lrf.web.newsweek import initialize as newsweek_initialize
 from libprs500.ebooks.lrf.web.newsweek import finalize as newsweek_finalize
+from libprs500.ebooks.lrf.web.nytimes import initialize as nytimes_initialize
+from libprs500.ebooks.lrf.web.nytimes import finalize as nytimes_finalize
+

 profiles = {
     'default' : {
@@ -37,32 +40,25 @@ profiles = {
              },

     'nytimes' : {
-            'url' : 'http://nytimesriver.com',
+            'initialize' : nytimes_initialize,
+            'finalize' : nytimes_finalize,
+            'timefmt' : ' [%a, %d %b, %Y]',
+            'max_recursions' : 2,
             'title' : 'The New York Times',
-            'match_regexps' : 'nytimes.com/'+time.strftime('%Y', time.localtime()),
             'preprocess_regexps' :
             [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
               [
-                # Remove help link and replace by title
-                (r'',
-                 lambda match: 'The New York Times\n%s'%(time.strftime('%a %d %b %Y', time.localtime()),)),
-                # Blank line before categories
-                (r'\s*NYT', lambda match: 'NYT'),
-                # Blank line between articles
-                (r'\n'),
-                # Remove TimesSelect garbage
-                (r'.*?TimesSelect', lambda match : 'Downloading of TimesSelect stories is not supported.<!--'),
+                # Remove header bar
+                (r'(<body.*?>).*?<h1', lambda match: match.group(1)+'<h1'),
+                (r'<div class="articleTools">.*></ul>', lambda match : ''),
+                # Remove footer bar
+                (r'<\!-- end \#article -->.*', lambda match : '</body></html>'),
+                (r'<div id="footer">.*', lambda match : '</body></html>'),
              ]
            ],
             },

    'bbc' : {
-            'url' : 'http://bbcriver.com',
             'title' : 'The BBC',
             'no_stylesheets' : True,
             'preprocess_regexps' :