Add --delay option to web2disk
commit 5189cc25c7
parent 00a50740fa
@@ -15,7 +15,7 @@
 '''
 Fetch a webpage and its links recursively.
 '''
-import sys, socket, urllib2, os, urlparse, codecs, logging, re
+import sys, socket, urllib2, os, urlparse, codecs, logging, re, time
 from urllib import url2pathname
 from httplib import responses
 from optparse import OptionParser
@@ -28,17 +28,6 @@ logger = logging.getLogger('libprs500.web.fetch.simple')
 class FetchError(Exception):
     pass
 
-def fetch_url(url):
-    f = None
-    logger.info('Fetching %s', url)
-    try:
-        f = urllib2.urlopen(url)
-    except urllib2.URLError, err:
-        if hasattr(err, 'code') and responses.has_key(err.code):
-            raise FetchError, responses[err.code]
-        raise err
-    return f
-
 def basename(url):
     parts = urlparse.urlsplit(url)
     path = url2pathname(parts.path)
@@ -68,11 +57,30 @@ class RecursiveFetcher(object):
         self.match_regexps = [re.compile(i, re.IGNORECASE) for i in options.match_regexps]
         self.filter_regexps = [re.compile(i, re.IGNORECASE) for i in options.filter_regexps]
         self.max_files = options.max_files
+        self.delay = options.delay
+        self.last_fetch_at = 0.
         self.filemap = {}
         self.imagemap = {}
         self.stylemap = {}
         self.current_dir = self.base_dir
         self.files = 0
 
+    def fetch_url(self, url):
+        f = None
+        logger.info('Fetching %s', url)
+        delta = time.time() - self.last_fetch_at
+        if delta < self.delay:
+            time.sleep(delta)
+        try:
+            f = urllib2.urlopen(url)
+        except urllib2.URLError, err:
+            if hasattr(err, 'code') and responses.has_key(err.code):
+                raise FetchError, responses[err.code]
+            raise err
+        finally:
+            self.last_fetch_at = time.time()
+        return f
+
+
     def start_fetch(self, url):
         soup = BeautifulSoup('<a href="'+url+'" />')
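The new RecursiveFetcher.fetch_url throttles downloads by checking how long ago the previous fetch happened and sleeping before opening the next URL, updating last_fetch_at in a finally block so that failed requests also count. A minimal, self-contained sketch of this rate-limiting pattern (the Throttler name is illustrative, not from the commit; this variant sleeps for the time still remaining in the interval, which is the usual way to guarantee a strict minimum gap between calls):

import time

class Throttler(object):
    '''Illustrative sketch of the minimum-interval pattern behind fetch_url.'''

    def __init__(self, delay):
        self.delay = delay        # minimum number of seconds between calls
        self.last_call_at = 0.    # epoch time of the previous call

    def wait(self):
        elapsed = time.time() - self.last_call_at
        if elapsed < self.delay:
            time.sleep(self.delay - elapsed)  # sleep only for the remainder
        self.last_call_at = time.time()

# With delay=2, calling throttler.wait() immediately before each
# urllib2.urlopen keeps consecutive fetches roughly two seconds apart,
# however quickly the previous download completed.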
@@ -114,7 +122,7 @@ class RecursiveFetcher(object):
                     tag['href'] = self.stylemap[iurl]
                     continue
                 try:
-                    f = fetch_url(iurl)
+                    f = self.fetch_url(iurl)
                 except Exception, err:
                     logger.warning('Could not fetch stylesheet %s', iurl)
                     logger.debug('Error: %s', str(err), exc_info=True)
@@ -136,7 +144,7 @@ class RecursiveFetcher(object):
                             ns.replaceWith(src.replace(m.group(1), self.stylemap[iurl]))
                             continue
                         try:
-                            f = fetch_url(iurl)
+                            f = self.fetch_url(iurl)
                         except Exception, err:
                             logger.warning('Could not fetch stylesheet %s', iurl)
                             logger.debug('Error: %s', str(err), exc_info=True)
@@ -165,7 +173,7 @@ class RecursiveFetcher(object):
                 tag['src'] = self.imagemap[iurl]
                 continue
             try:
-                f = fetch_url(iurl)
+                f = self.fetch_url(iurl)
             except Exception, err:
                 logger.warning('Could not fetch image %s', iurl)
                 logger.debug('Error: %s', str(err), exc_info=True)
@@ -234,7 +242,7 @@ class RecursiveFetcher(object):
                         os.mkdir(linkdiskpath)
                     try:
                         self.current_dir = linkdiskpath
-                        f = fetch_url(iurl)
+                        f = self.fetch_url(iurl)
                         soup = BeautifulSoup(f.read())
                         logger.info('Processing images...')
                         self.process_images(soup, f.geturl())
@@ -280,6 +288,8 @@ def option_parser(usage='%prog URL\n\nWhere URL is for example http://google.com
                       help='Only links that match this regular expression will be followed. This option can be specified multiple times, in which case as long as a link matches any one regexp, it will be followed. By default all links are followed.')
     parser.add_option('--filter-regexp', default=[], action='append', dest='filter_regexps',
                       help='Any link that matches this regular expression will be ignored. This option can be specified multiple times, in which case as long as any regexp matches a link, it will be ignored.By default, no links are ignored. If both --filter-regexp and --match-regexp are specified, then --match-regexp is ignored.')
+    parser.add_option('--delay', default=0, dest='delay', type='int',
+                      help='Minimum interval in seconds between consecutive fetches. Default is %default s')
     parser.add_option('--verbose', help='Show detailed output information. Useful for debugging',
                       default=False, action='store_true', dest='verbose')
     return parser
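A hedged usage sketch of the new flag, exercising only the option_parser() shown above; how web2disk's entry point hands the parsed options to RecursiveFetcher is not part of this diff, but the hunk at -68,11 shows __init__ reading options.delay:

parser = option_parser()
options, args = parser.parse_args(['--delay', '2', 'http://example.com'])

assert options.delay == 2              # parsed as an int because of type='int'
assert args == ['http://example.com']  # leftover positional argument
# RecursiveFetcher.__init__ copies options.delay into self.delay, and
# fetch_url consults it before every download.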