mirror of https://github.com/kovidgoyal/calibre.git
Improved nytimes profile.

parent 4ecab43cd0
commit 0c95bc3d6d
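
In outline, the new plumbing lets a profile carry optional credentials and lifecycle hooks. A minimal sketch of the flow that process_profile() implements after this commit (see the hunks below); the profiles dict is the one defined in the last hunks of this diff, and the credential strings are placeholders, not values from the commit:

    profile = profiles['nytimes']
    profile['username'] = 'someuser'       # placeholder; really options.username
    profile['password'] = 'somepass'       # placeholder; really options.password
    if profile.has_key('initialize'):
        profile['initialize'](profile)     # nytimes: log in, parse feeds, build index.html
    print profile['url']                   # file: URL that the fetcher then crawls
    if profile.has_key('finalize'):
        profile['finalize'](profile)       # nytimes: remove the temp dir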
					
@@ -39,6 +39,10 @@ def option_parser():
     parser.add_option('-u', '--url', dest='url', default=None,
                       help='The URL to download. You only need to specify this if you are not specifying a website_profile.')
 
+    parser.add_option('--username', dest='username', default=None,
+                      help='Specify the username to be used while downloading. Only used if the profile supports it.')
+    parser.add_option('--password', dest='password', default=None,
+                      help='Specify the password to be used while downloading. Only used if the profile supports it.')
     parser.add_option('--timeout', help='Timeout in seconds to wait for a response from the server. Default: %default s',
                       default=None, type='int', dest='timeout')
     parser.add_option('-r', '--max-recursions', help='Maximum number of levels to recurse i.e. depth of links to follow. Default %default',
@@ -64,7 +68,7 @@ def fetch_website(options, logger):
     return fetcher.start_fetch(options.url), tdir
 
 def create_lrf(htmlfile, options, logger):
-    if not options.author:
+    if not options.author or options.author.lower() == 'unknown':
         options.author = __appname__
     options.header = True
     if options.output:
@@ -83,9 +87,12 @@ def process_profile(args, options, logger=None):
         if not profiles.has_key(args[1]):
             raise CommandLineError('Unknown profile: %s\nValid profiles: %s'%(args[1], profiles.keys()))
     profile = profiles[args[1]] if len(args) == 2 else profiles['default']
+
+    profile['username'] = options.username
+    profile['password'] = options.password
     if profile.has_key('initialize'):
         profile['initialize'](profile)
     if profile.has_key('browser'):
         options.browser = profile['browser']
     for opt in ('url', 'timeout', 'max_recursions', 'max_files', 'delay', 'no_stylesheets'):
         val = getattr(options, opt)
@@ -104,6 +111,8 @@ def process_profile(args, options, logger=None):
     options.match_regexps += profile['match_regexps']
     options.preprocess_regexps = profile['preprocess_regexps']
     options.filter_regexps += profile['filter_regexps']
+    if len(args) == 2 and args[1] != 'default':
+        options.anchor_ids = False
 
     htmlfile, tdir = fetch_website(options, logger)
     create_lrf(htmlfile, options, logger)
@@ -111,6 +120,7 @@ def process_profile(args, options, logger=None):
         profile['finalize'](profile)
     shutil.rmtree(tdir)
 
+
 def main(args=sys.argv, logger=None):
     parser = option_parser()
     options, args = parser.parse_args(args)
src/libprs500/ebooks/lrf/web/nytimes.py  (new file, 146 lines)
@@ -0,0 +1,146 @@
##    Copyright (C) 2007 Kovid Goyal kovid@kovidgoyal.net
##    This program is free software; you can redistribute it and/or modify
##    it under the terms of the GNU General Public License as published by
##    the Free Software Foundation; either version 2 of the License, or
##    (at your option) any later version.
##
##    This program is distributed in the hope that it will be useful,
##    but WITHOUT ANY WARRANTY; without even the implied warranty of
##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
##    GNU General Public License for more details.
##
##    You should have received a copy of the GNU General Public License along
##    with this program; if not, write to the Free Software Foundation, Inc.,
##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''New York Times from RSS feeds.'''
import time, tempfile, os, shutil, calendar, operator

from libprs500 import __appname__, iswindows, browser
from libprs500.ebooks.BeautifulSoup import BeautifulSoup, BeautifulStoneSoup

RSS = 'http://www.nytimes.com/services/xml/rss/index.html'
LOGIN = 'http://www.nytimes.com/auth/login'

def get_feeds(browser):
    src = browser.open(RSS).read()
    soup = BeautifulSoup(src[src.index('<html'):])
    feeds = []
    for link in soup.findAll('link', attrs={'type':'application/rss+xml'}):
        if link['title'] not in ['NYTimes.com Homepage', 'Obituaries', 'Pogue\'s Posts',
                                 'Dining & Wine', 'Home & Garden', 'Magazine',
                                 'Most E-mailed Articles',
                                 'Automobiles', 'Fashion & Style', 'Television News',
                                 'Education']:
            feeds.append((link['title'], link['href']))
        #else: print link['title']

    return feeds

def parse_feeds(feeds, browser, max_articles_per_feed=10):
    articles = {}
    for title, url in feeds:
        src = browser.open(url).read()
        articles[title] = []
        soup = BeautifulStoneSoup(src)
        for item in soup.findAll('item'):
            try:
                pubdate = item.find('pubdate').string
                if not pubdate:
                    continue
                pubdate = pubdate.replace('+0000', 'GMT')
                d = {
                    'title'    : item.find('title').string,
                    'url'      : item.find('guid').string+'?&pagewanted=print',
                    'timestamp': calendar.timegm(time.strptime(pubdate,
                                                    '%a, %d %b %Y %H:%M:%S %Z')),
                    'date'     : pubdate
                    }
            except:
                continue
            try:
                d['description'] = item.find('description').string
            except:
                d['description'] = ''
            articles[title].append(d)
        articles[title].sort(key=operator.itemgetter('timestamp'), reverse=True)
        articles[title][max_articles_per_feed:] = []
        for item in articles[title]:
            item.pop('timestamp')
    return articles

def build_index(articles, dir):

        def build_sub_index(title, items):
            ilist = ''
            li = u'<li><a href="%(url)s">%(title)s</a> <span style="font-size: x-small">[%(date)s]</span><br/>\n'+\
                u'<div style="font-size:small; font-family:sans">%(description)s<br /></div></li>\n'
            for item in items:
                ilist += li%item
            return u'''\
            <html>
            <body>
            <h2>%(title)s</h2>
            <ul>
            %(items)s
            </ul>
            </body>
            </html>
            '''%dict(title=title, items=ilist.rstrip())

        cnum = 0
        clist = ''
        categories = articles.keys()
        categories.sort()
        for category in categories:
            cnum  += 1
            cfile = os.path.join(dir, 'category'+str(cnum)+'.html')
            prefix = 'file:' if iswindows else ''
            clist += u'<li><a href="%s">%s</a></li>\n'%(prefix+cfile, category)
            src = build_sub_index(category, articles[category])
            open(cfile, 'wb').write(src.encode('utf-8'))

        src = '''\
        <html>
        <body>
        <h1>The New York Times</h1>
        <div style='text-align: right; font-weight: bold'>%(date)s</div>
        <ul>
        %(categories)s
        </ul>
        </body>
        </html>
        '''%dict(date=time.strftime('%a, %d %B, %Y', time.localtime()), categories=clist)
        index = os.path.join(dir, 'index.html')
        open(index, 'wb').write(src.encode('utf-8'))
        return index


def initialize(profile):
    profile['temp dir'] = tempfile.mkdtemp(prefix=__appname__+'_')
    profile['browser'] = login(profile)
    feeds = get_feeds(profile['browser'])
    articles = parse_feeds(feeds, profile['browser'])
    index = build_index(articles, profile['temp dir'])
    profile['url'] = 'file:'+ ('' if iswindows else '//') + index


def finalize(profile):
    shutil.rmtree(profile['temp dir'])


def login(profile):
    br = browser()
    if profile['username'] and profile['password']:
        br.open(LOGIN)
        br.select_form(name='login')
        br['USERID']   = profile['username']
        br['PASSWORD'] = profile['password']
        br.submit()
    return br

if __name__ == '__main__':
    # Both functions need an explicit browser instance.
    br = browser()
    feeds = get_feeds(br)
    articles = parse_feeds(feeds, br)
    print articles
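
In the module above, login() returns a browser that is authenticated only when both credentials are present, get_feeds() scrapes the RSS directory page, parse_feeds() trims each feed to at most max_articles_per_feed article dicts sorted newest-first, and build_index() writes per-category pages plus the index.html that initialize() hands to the fetcher. A standalone sketch of just the feed-parsing path (assumes network access; the public feeds need no login):

    from libprs500 import browser
    from libprs500.ebooks.lrf.web.nytimes import get_feeds, parse_feeds

    br = browser()
    feeds = get_feeds(br)                          # [(title, rss_url), ...]
    articles = parse_feeds(feeds, br, max_articles_per_feed=3)
    for category in sorted(articles.keys()):
        print category
        for item in articles[category]:            # keys: title, url, date, description
            print '  %s [%s]'%(item['title'], item['date'])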
@@ -17,6 +17,9 @@ import time, re
 
 from libprs500.ebooks.lrf.web.newsweek import initialize as newsweek_initialize
 from libprs500.ebooks.lrf.web.newsweek import finalize as newsweek_finalize
+from libprs500.ebooks.lrf.web.nytimes import initialize as nytimes_initialize
+from libprs500.ebooks.lrf.web.nytimes import finalize as nytimes_finalize
+
 
 profiles = {
             'default' : {
@@ -37,32 +40,25 @@ profiles = {
                       },
                       
            'nytimes' : {
                         'url'                : 'http://nytimesriver.com',
                         'initialize'         : nytimes_initialize,
                         'finalize'           : nytimes_finalize,
                         'timefmt'            : ' [%a, %d %b, %Y]',
                         'max_recursions'     : 2,
                         'title'              : 'The New York Times',
                         'match_regexps'      : 'nytimes.com/'+time.strftime('%Y', time.localtime()),
                         'preprocess_regexps' :
                         [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
                          [
                           # Remove help link and replace by title
                           (r'<a .*?alt=.Click here for information about this service.*?</a>',
                            lambda match: '<h1>The New York Times</h1>\n<p align="right"><b>%s</b></p>'%(time.strftime('%a %d %b %Y', time.localtime()),)),
                           # Blank line before categories
                           (r'<b>\s*NYT', lambda match: '<p></p><b>NYT'),
                           # Blank line between articles
                           (r'<p><a href', lambda match : '<br /><p><a href'),
                           # Remove header on individual articles
                           (r'<body class=.printerversion..*?<h1><nyt_headline',
                            lambda match : '<body class="printerversion">\n<h1><nyt_headline'),
                           # Remove footer from individual articles
                           (r'<nyt_update_bottom.*', lambda match : '</body></html>'),
                           # Remove TimesSelect garbage
                           (r'<title>.*?TimesSelect', lambda match : 'Downloading of TimesSelect stories is not supported.<!--'),
                           # Remove header bar
                           (r'(<body.*?>).*?<h1', lambda match: match.group(1)+'<h1'),
                           (r'<div class="articleTools">.*></ul>', lambda match : ''),
                           # Remove footer bar
                           (r'<\!--  end \#article -->.*', lambda match : '</body></html>'),
                           (r'<div id="footer">.*', lambda match : '</body></html>'),
                           ]
                          ],
                        },
                        
            'bbc'     : {
                         'url'               : 'http://bbcriver.com',
                         'title'             : 'The BBC',
                         'no_stylesheets'    : True,
                         'preprocess_regexps' :
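
Each pair in the preprocess_regexps lists above is compiled with re.IGNORECASE | re.DOTALL and mapped to a replacement function. The fetcher presumably applies the compiled pairs to every downloaded page before conversion, roughly as in this sketch (the rule and input here are illustrative, not taken from the diff):

    import re

    rules = [(re.compile(r'<div id="footer">.*', re.IGNORECASE | re.DOTALL),
              lambda match: '</body></html>')]
    html = '<html><body><h1>Story</h1><div id="footer">nav junk</div>'
    for pat, func in rules:
        html = pat.sub(func, html)      # func receives the match object
    print html    # -> <html><body><h1>Story</h1></body></html>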