mirror of
				https://github.com/kovidgoyal/calibre.git
				synced 2025-11-04 03:27:00 -05:00 
			
		
		
		
	Add --delay option to web2disk
This commit is contained in:
		
							parent
							
								
									00a50740fa
								
							
						
					
					
						commit
						5189cc25c7
					
				@ -15,7 +15,7 @@
 | 
				
			|||||||
'''
 | 
					'''
 | 
				
			||||||
Fetch a webpage and its links recursively.
 | 
					Fetch a webpage and its links recursively.
 | 
				
			||||||
'''
 | 
					'''
 | 
				
			||||||
import sys, socket, urllib2, os, urlparse, codecs, logging, re
 | 
					import sys, socket, urllib2, os, urlparse, codecs, logging, re, time
 | 
				
			||||||
from urllib import url2pathname
 | 
					from urllib import url2pathname
 | 
				
			||||||
from httplib import responses
 | 
					from httplib import responses
 | 
				
			||||||
from optparse import OptionParser
 | 
					from optparse import OptionParser
 | 
				
			||||||
@ -28,17 +28,6 @@ logger = logging.getLogger('libprs500.web.fetch.simple')
 | 
				
			|||||||
class FetchError(Exception):
 | 
					class FetchError(Exception):
 | 
				
			||||||
    pass
 | 
					    pass
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def fetch_url(url):
 | 
					 | 
				
			||||||
    f = None
 | 
					 | 
				
			||||||
    logger.info('Fetching %s', url)
 | 
					 | 
				
			||||||
    try:
 | 
					 | 
				
			||||||
        f = urllib2.urlopen(url)
 | 
					 | 
				
			||||||
    except urllib2.URLError, err:
 | 
					 | 
				
			||||||
        if hasattr(err, 'code') and responses.has_key(err.code):
 | 
					 | 
				
			||||||
            raise FetchError, responses[err.code]
 | 
					 | 
				
			||||||
        raise err
 | 
					 | 
				
			||||||
    return f
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
def basename(url):
 | 
					def basename(url):
 | 
				
			||||||
    parts = urlparse.urlsplit(url)
 | 
					    parts = urlparse.urlsplit(url)
 | 
				
			||||||
    path = url2pathname(parts.path)
 | 
					    path = url2pathname(parts.path)
 | 
				
			||||||
@ -68,11 +57,30 @@ class RecursiveFetcher(object):
 | 
				
			|||||||
        self.match_regexps  = [re.compile(i, re.IGNORECASE) for i in options.match_regexps]
 | 
					        self.match_regexps  = [re.compile(i, re.IGNORECASE) for i in options.match_regexps]
 | 
				
			||||||
        self.filter_regexps = [re.compile(i, re.IGNORECASE) for i in options.filter_regexps]
 | 
					        self.filter_regexps = [re.compile(i, re.IGNORECASE) for i in options.filter_regexps]
 | 
				
			||||||
        self.max_files = options.max_files
 | 
					        self.max_files = options.max_files
 | 
				
			||||||
 | 
					        self.delay = options.delay
 | 
				
			||||||
 | 
					        self.last_fetch_at = 0.
 | 
				
			||||||
        self.filemap = {}
 | 
					        self.filemap = {}
 | 
				
			||||||
        self.imagemap = {}
 | 
					        self.imagemap = {}
 | 
				
			||||||
        self.stylemap = {}
 | 
					        self.stylemap = {}
 | 
				
			||||||
        self.current_dir = self.base_dir
 | 
					        self.current_dir = self.base_dir
 | 
				
			||||||
        self.files = 0
 | 
					        self.files = 0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def fetch_url(self, url):
 | 
				
			||||||
 | 
					        f = None
 | 
				
			||||||
 | 
					        logger.info('Fetching %s', url)
 | 
				
			||||||
 | 
					        delta = time.time() - self.last_fetch_at 
 | 
				
			||||||
 | 
					        if  delta < self.delay:
 | 
				
			||||||
 | 
					            time.sleep(delta)
 | 
				
			||||||
 | 
					        try:
 | 
				
			||||||
 | 
					            f = urllib2.urlopen(url)
 | 
				
			||||||
 | 
					        except urllib2.URLError, err:
 | 
				
			||||||
 | 
					            if hasattr(err, 'code') and responses.has_key(err.code):
 | 
				
			||||||
 | 
					                raise FetchError, responses[err.code]
 | 
				
			||||||
 | 
					            raise err
 | 
				
			||||||
 | 
					        finally:
 | 
				
			||||||
 | 
					            self.last_fetch_at = time.time()
 | 
				
			||||||
 | 
					        return f
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        
 | 
					        
 | 
				
			||||||
    def start_fetch(self, url):
 | 
					    def start_fetch(self, url):
 | 
				
			||||||
        soup = BeautifulSoup('<a href="'+url+'" />')
 | 
					        soup = BeautifulSoup('<a href="'+url+'" />')
 | 
				
			||||||
@ -114,7 +122,7 @@ class RecursiveFetcher(object):
 | 
				
			|||||||
                    tag['href'] = self.stylemap[iurl]
 | 
					                    tag['href'] = self.stylemap[iurl]
 | 
				
			||||||
                    continue
 | 
					                    continue
 | 
				
			||||||
                try:
 | 
					                try:
 | 
				
			||||||
                    f = fetch_url(iurl)
 | 
					                    f = self.fetch_url(iurl)
 | 
				
			||||||
                except Exception, err:
 | 
					                except Exception, err:
 | 
				
			||||||
                    logger.warning('Could not fetch stylesheet %s', iurl)
 | 
					                    logger.warning('Could not fetch stylesheet %s', iurl)
 | 
				
			||||||
                    logger.debug('Error: %s', str(err), exc_info=True)
 | 
					                    logger.debug('Error: %s', str(err), exc_info=True)
 | 
				
			||||||
@ -136,7 +144,7 @@ class RecursiveFetcher(object):
 | 
				
			|||||||
                            ns.replaceWith(src.replace(m.group(1), self.stylemap[iurl]))
 | 
					                            ns.replaceWith(src.replace(m.group(1), self.stylemap[iurl]))
 | 
				
			||||||
                            continue
 | 
					                            continue
 | 
				
			||||||
                        try:
 | 
					                        try:
 | 
				
			||||||
                            f = fetch_url(iurl)
 | 
					                            f = self.fetch_url(iurl)
 | 
				
			||||||
                        except Exception, err:
 | 
					                        except Exception, err:
 | 
				
			||||||
                            logger.warning('Could not fetch stylesheet %s', iurl)
 | 
					                            logger.warning('Could not fetch stylesheet %s', iurl)
 | 
				
			||||||
                            logger.debug('Error: %s', str(err), exc_info=True)
 | 
					                            logger.debug('Error: %s', str(err), exc_info=True)
 | 
				
			||||||
@ -165,7 +173,7 @@ class RecursiveFetcher(object):
 | 
				
			|||||||
                tag['src'] = self.imagemap[iurl]
 | 
					                tag['src'] = self.imagemap[iurl]
 | 
				
			||||||
                continue
 | 
					                continue
 | 
				
			||||||
            try:
 | 
					            try:
 | 
				
			||||||
                f = fetch_url(iurl)
 | 
					                f = self.fetch_url(iurl)
 | 
				
			||||||
            except Exception, err:
 | 
					            except Exception, err:
 | 
				
			||||||
                logger.warning('Could not fetch image %s', iurl)
 | 
					                logger.warning('Could not fetch image %s', iurl)
 | 
				
			||||||
                logger.debug('Error: %s', str(err), exc_info=True)
 | 
					                logger.debug('Error: %s', str(err), exc_info=True)
 | 
				
			||||||
@ -234,7 +242,7 @@ class RecursiveFetcher(object):
 | 
				
			|||||||
                    os.mkdir(linkdiskpath)
 | 
					                    os.mkdir(linkdiskpath)
 | 
				
			||||||
                try:
 | 
					                try:
 | 
				
			||||||
                    self.current_dir = linkdiskpath
 | 
					                    self.current_dir = linkdiskpath
 | 
				
			||||||
                    f = fetch_url(iurl)
 | 
					                    f = self.fetch_url(iurl)
 | 
				
			||||||
                    soup = BeautifulSoup(f.read())
 | 
					                    soup = BeautifulSoup(f.read())
 | 
				
			||||||
                    logger.info('Processing images...')
 | 
					                    logger.info('Processing images...')
 | 
				
			||||||
                    self.process_images(soup, f.geturl())
 | 
					                    self.process_images(soup, f.geturl())
 | 
				
			||||||
@ -280,6 +288,8 @@ def option_parser(usage='%prog URL\n\nWhere URL is for example http://google.com
 | 
				
			|||||||
                      help='Only links that match this regular expression will be followed. This option can be specified multiple times, in which case as long as a link matches any one regexp, it will be followed. By default all links are followed.')
 | 
					                      help='Only links that match this regular expression will be followed. This option can be specified multiple times, in which case as long as a link matches any one regexp, it will be followed. By default all links are followed.')
 | 
				
			||||||
    parser.add_option('--filter-regexp', default=[], action='append', dest='filter_regexps',
 | 
					    parser.add_option('--filter-regexp', default=[], action='append', dest='filter_regexps',
 | 
				
			||||||
                      help='Any link that matches this regular expression will be ignored. This option can be specified multiple times, in which case as long as any regexp matches a link, it will be ignored.By default, no links are ignored. If both --filter-regexp and --match-regexp are specified, then --match-regexp is ignored.')
 | 
					                      help='Any link that matches this regular expression will be ignored. This option can be specified multiple times, in which case as long as any regexp matches a link, it will be ignored.By default, no links are ignored. If both --filter-regexp and --match-regexp are specified, then --match-regexp is ignored.')
 | 
				
			||||||
 | 
					    parser.add_option('--delay', default=0, dest='delay', type='int',
 | 
				
			||||||
 | 
					                      help='Minimum interval in seconds between consecutive fetches. Default is %default s')
 | 
				
			||||||
    parser.add_option('--verbose', help='Show detailed output information. Useful for debugging',
 | 
					    parser.add_option('--verbose', help='Show detailed output information. Useful for debugging',
 | 
				
			||||||
                      default=False, action='store_true', dest='verbose')
 | 
					                      default=False, action='store_true', dest='verbose')
 | 
				
			||||||
    return parser
 | 
					    return parser
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user