Fix #2255 (calibre does not handle images with comma in their name)
parent 74cf23ddc8
commit 596b52afac
@@ -28,10 +28,10 @@ class closing(object):

     def __init__(self, thing):
         self.thing = thing

     def __enter__(self):
         return self.thing

     def __exit__(self, *exc_info):
         try:
             self.thing.close()
@@ -55,43 +55,43 @@ def save_soup(soup, target):
     for meta in metas:
         if 'charset' in meta.get('content', '').lower():
             meta.replaceWith(nm)

     selfdir = os.path.dirname(target)

     for tag in soup.findAll(['img', 'link', 'a']):
         for key in ('src', 'href'):
             path = tag.get(key, None)
             if path and os.path.isfile(path) and os.path.exists(path) and os.path.isabs(path):
                 tag[key] = unicode_path(relpath(path, selfdir).replace(os.sep, '/'))

     html = unicode(soup)
     with open(target, 'wb') as f:
         f.write(html.encode('utf-8'))

 class response(str):

     def __new__(cls, *args):
         obj = super(response, cls).__new__(cls, *args)
         obj.newurl = None
         return obj

 class DummyLock(object):

     def __enter__(self, *args): return self
     def __exit__(self, *args): pass

 class RecursiveFetcher(object, LoggingInterface):
     LINK_FILTER = tuple(re.compile(i, re.IGNORECASE) for i in
                 ('.exe\s*$', '.mp3\s*$', '.ogg\s*$', '^\s*mailto:', '^\s*$'))
     #ADBLOCK_FILTER = tuple(re.compile(i, re.IGNORECASE) for it in
     #                        (
     #
     #                        )
     #                        )
     CSS_IMPORT_PATTERN = re.compile(r'\@import\s+url\((.*?)\)', re.IGNORECASE)
     default_timeout = socket.getdefaulttimeout() # Needed here as it is used in __del__
     DUMMY_LOCK = DummyLock()

     def __init__(self, options, logger, image_map={}, css_map={}, job_info=None):
         LoggingInterface.__init__(self, logger)
         self.base_dir = os.path.abspath(os.path.expanduser(options.dir))
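In the save_soup portion of the hunk above, absolute local paths referenced from img/link/a tags are rewritten into paths relative to the saved HTML file, with '/' as the separator. A standalone sketch of that rewrite, for illustration only, using os.path.relpath in place of the module's relpath helper; the paths are made up:

import os

def relativize(path, target):
    # Make an absolute local path relative to the directory of the saved file,
    # and use forward slashes so the result works as an href/src value.
    selfdir = os.path.dirname(target)
    return os.path.relpath(path, selfdir).replace(os.sep, '/')

print(relativize('/tmp/fetch/images/0.jpg', '/tmp/fetch/index.html'))
# On a POSIX path layout this prints: images/0.jpg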
@@ -123,19 +123,19 @@ class RecursiveFetcher(object, LoggingInterface):
         self.remove_tags_after = getattr(options, 'remove_tags_after', None)
         self.remove_tags_before = getattr(options, 'remove_tags_before', None)
         self.keep_only_tags = getattr(options, 'keep_only_tags', [])
         self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup)
         self.postprocess_html_ext= getattr(options, 'postprocess_html', None)
         self.download_stylesheets = not options.no_stylesheets
         self.show_progress = True
         self.failed_links = []
         self.job_info = job_info

     def get_soup(self, src):
         nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
         nmassage.extend(self.preprocess_regexps)
         nmassage += [(re.compile(r'<!DOCTYPE .+?>', re.DOTALL), lambda m: '')] # Some websites have buggy doctype declarations that mess up beautifulsoup
         soup = BeautifulSoup(xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0], markupMassage=nmassage)

         if self.keep_only_tags:
             body = Tag(soup, 'body')
             try:
@@ -147,7 +147,7 @@ class RecursiveFetcher(object, LoggingInterface):
                 soup.find('body').replaceWith(body)
             except AttributeError: # soup has no body element
                 pass

         def remove_beyond(tag, next):
             while tag is not None and tag.name != 'body':
                 after = getattr(tag, next)
@@ -156,31 +156,34 @@ class RecursiveFetcher(object, LoggingInterface):
                     after.extract()
                     after = ns
                 tag = tag.parent

         if self.remove_tags_after is not None:
             rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
             for spec in rt:
                 tag = soup.find(**spec)
                 remove_beyond(tag, 'nextSibling')

         if self.remove_tags_before is not None:
             tag = soup.find(**self.remove_tags_before)
             remove_beyond(tag, 'previousSibling')

         for kwds in self.remove_tags:
             for tag in soup.findAll(**kwds):
                 tag.extract()
         return self.preprocess_html_ext(soup)


     def fetch_url(self, url):
         data = None
         self.log_debug('Fetching %s', url)
         delta = time.time() - self.last_fetch_at
         if delta < self.delay:
             time.sleep(delta)
-        if re.search(r'\s+', url) is not None:
-            url = quote(url)
+        if re.search(r'\s+|,', url) is not None:
+            purl = list(urlparse.urlparse(url))
+            for i in range(2, 6):
+                purl[i] = quote(purl[i])
+            url = urlparse.urlunparse(purl)
         with self.browser_lock:
             try:
                 with closing(self.browser.open(url)) as f:
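The hunk above is the actual fix for #2255. The trigger regex now also matches commas, and instead of percent-quoting the whole URL (urllib's quote() leaves only '/' unescaped by default, so it would also mangle the ':' after the scheme), the URL is split with urlparse and only elements 2-5 of the result (path, params, query, fragment) are quoted before it is reassembled. A minimal sketch of the same idea, written against Python 3's urllib.parse purely for illustration (the patch itself uses the Python 2 urlparse/urllib equivalents); the example URL is made up:

from urllib.parse import quote, urlparse, urlunparse

def quote_url_components(url):
    # Illustration only (not calibre code): percent-escape the path, params,
    # query and fragment (indices 2-5 of the urlparse 6-tuple), leaving the
    # scheme and host untouched, so commas and spaces become %2C and %20.
    parts = list(urlparse(url))
    for i in range(2, 6):
        parts[i] = quote(parts[i])
    return urlunparse(parts)

print(quote_url_components('http://example.com/images/cover, page1.jpg'))
# -> http://example.com/images/cover%2C%20page1.jpg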
@@ -196,38 +199,38 @@ class RecursiveFetcher(object, LoggingInterface):
                     with closing(self.browser.open(url)) as f:
                         data = response(f.read()+f.read())
                         data.newurl = f.geturl()
                 else:
                     raise err
             finally:
                 self.last_fetch_at = time.time()
         return data


     def start_fetch(self, url):
         soup = BeautifulSoup(u'<a href="'+url+'" />')
         self.log_info('Downloading')
         res = self.process_links(soup, url, 0, into_dir='')
         self.log_info('%s saved to %s', url, res)
         return res

     def is_link_ok(self, url):
         for i in self.__class__.LINK_FILTER:
             if i.search(url):
                 return False
         return True

     def is_link_wanted(self, url):
         if self.filter_regexps:
             for f in self.filter_regexps:
                 if f.search(url):
                     return False
         if self.match_regexps:
             for m in self.match_regexps:
                 if m.search(url):
                     return True
             return False
         return True

     def process_stylesheets(self, soup, baseurl):
         diskpath = unicode_path(os.path.join(self.current_dir, 'stylesheets'))
         if not os.path.exists(diskpath):
@@ -254,7 +257,7 @@ class RecursiveFetcher(object, LoggingInterface):
                     x.write(data)
                 tag['href'] = stylepath
             else:
                 for ns in tag.findAll(text=True):
                     src = str(ns)
                     m = self.__class__.CSS_IMPORT_PATTERN.search(src)
                     if m:
@@ -278,9 +281,9 @@ class RecursiveFetcher(object, LoggingInterface):
                         with open(stylepath, 'wb') as x:
                             x.write(data)
                         ns.replaceWith(src.replace(m.group(1), stylepath))



     def process_images(self, soup, baseurl):
         diskpath = unicode_path(os.path.join(self.current_dir, 'images'))
         if not os.path.exists(diskpath):
@@ -323,7 +326,7 @@ class RecursiveFetcher(object, LoggingInterface):
                 traceback.print_exc()
                 continue

     def absurl(self, baseurl, tag, key, filter=True):
         iurl = tag[key]
         parts = urlparse.urlsplit(iurl)
         if not parts.netloc and not parts.path:
@@ -337,26 +340,26 @@ class RecursiveFetcher(object, LoggingInterface):
             self.log_debug('Filtered link: '+iurl)
             return None
         return iurl

     def normurl(self, url):
         parts = list(urlparse.urlsplit(url))
         parts[4] = ''
         return urlparse.urlunsplit(parts)

     def localize_link(self, tag, key, path):
         parts = urlparse.urlsplit(tag[key])
         suffix = '#'+parts.fragment if parts.fragment else ''
         tag[key] = path+suffix

     def process_return_links(self, soup, baseurl):
         for tag in soup.findAll(lambda tag: tag.name.lower()=='a' and tag.has_key('href')):
             iurl = self.absurl(baseurl, tag, 'href')
             if not iurl:
                 continue
             nurl = self.normurl(iurl)
             if self.filemap.has_key(nurl):
                 self.localize_link(tag, 'href', self.filemap[nurl])

     def process_links(self, soup, baseurl, recursion_level, into_dir='links'):
         res = ''
         diskpath = os.path.join(self.current_dir, into_dir)
@@ -366,7 +369,7 @@ class RecursiveFetcher(object, LoggingInterface):
         try:
             self.current_dir = diskpath
             tags = list(soup.findAll('a', href=True))

             for c, tag in enumerate(tags):
                 if self.show_progress:
                     print '.',
@@ -396,9 +399,9 @@ class RecursiveFetcher(object, LoggingInterface):
                         dsrc = dsrc.decode(self.encoding, 'ignore')
                     else:
                         dsrc = xml_to_unicode(dsrc, self.verbose)[0]

                     soup = self.get_soup(dsrc)

                     base = soup.find('base', href=True)
                     if base is not None:
                         newbaseurl = base['href']
@@ -406,7 +409,7 @@ class RecursiveFetcher(object, LoggingInterface):
                     self.process_images(soup, newbaseurl)
                     if self.download_stylesheets:
                         self.process_stylesheets(soup, newbaseurl)

                     _fname = basename(iurl)
                     if not isinstance(_fname, unicode):
                         _fname.decode('latin1', 'replace')
@@ -420,17 +423,17 @@ class RecursiveFetcher(object, LoggingInterface):
                         self.log_debug('Processing links...')
                         self.process_links(soup, newbaseurl, recursion_level+1)
                     else:
                         self.process_return_links(soup, newbaseurl)
                         self.log_debug('Recursion limit reached. Skipping links in %s', iurl)

                     if callable(self.postprocess_html_ext):
                         soup = self.postprocess_html_ext(soup,
                                 c==0 and recursion_level==0 and not getattr(self, 'called_first', False),
                                 self.job_info)

                         if c==0 and recursion_level == 0:
                             self.called_first = True

                     save_soup(soup, res)
                     self.localize_link(tag, 'href', res)
                 except Exception, err:
@@ -439,34 +442,34 @@ class RecursiveFetcher(object, LoggingInterface):
                     self.log_debug('Error: %s', str(err), exc_info=True)
                 finally:
                     self.current_dir = diskpath
                     self.files += 1
         finally:
             self.current_dir = prev_dir
         if self.show_progress:
             print
         return res

     def __del__(self):
         dt = getattr(self, 'default_timeout', None)
         if dt is not None:
             socket.setdefaulttimeout(dt)

 def option_parser(usage=_('%prog URL\n\nWhere URL is for example http://google.com')):
     parser = OptionParser(usage=usage)
     parser.add_option('-d', '--base-dir',
                       help=_('Base directory into which URL is saved. Default is %default'),
                       default='.', type='string', dest='dir')
     parser.add_option('-t', '--timeout',
                       help=_('Timeout in seconds to wait for a response from the server. Default: %default s'),
                       default=10.0, type='float', dest='timeout')
     parser.add_option('-r', '--max-recursions', default=1,
                       help=_('Maximum number of levels to recurse i.e. depth of links to follow. Default %default'),
                       type='int', dest='max_recursions')
     parser.add_option('-n', '--max-files', default=sys.maxint, type='int', dest='max_files',
                       help=_('The maximum number of files to download. This only applies to files from <a href> tags. Default is %default'))
     parser.add_option('--delay', default=0, dest='delay', type='int',
                       help=_('Minimum interval in seconds between consecutive fetches. Default is %default s'))
     parser.add_option('--encoding', default=None,
                       help=_('The character encoding for the websites you are trying to download. The default is to try and guess the encoding.'))
     parser.add_option('--match-regexp', default=[], action='append', dest='match_regexps',
                       help=_('Only links that match this regular expression will be followed. This option can be specified multiple times, in which case as long as a link matches any one regexp, it will be followed. By default all links are followed.'))
@@ -487,15 +490,15 @@ def create_fetcher(options, logger=None, image_map={}):
     return RecursiveFetcher(options, logger, image_map={})

 def main(args=sys.argv):
     parser = option_parser()
     options, args = parser.parse_args(args)
     if len(args) != 2:
         parser.print_help()
         return 1

     fetcher = create_fetcher(options)
     fetcher.start_fetch(args[1])


 if __name__ == '__main__':
     sys.exit(main())
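For reference, a hypothetical driver showing how the entry points in this diff fit together, mirroring main() above: option_parser() builds the CLI options, create_fetcher() wraps them in a RecursiveFetcher, and start_fetch() downloads the URL and returns the path of the saved HTML. The import path, URL and directory are assumptions and placeholders, not something this commit establishes:

# Hypothetical usage sketch; the module path is an assumption about where this
# fetcher lives in the calibre tree, and the URL/paths are placeholders.
from calibre.web.fetch.simple import create_fetcher, option_parser

parser = option_parser()
# main() passes sys.argv, so the first list element stands in for the program name.
options, args = parser.parse_args(['web-fetch', 'http://example.com/gallery, 2009.html',
                                   '--base-dir', '/tmp/fetch'])
fetcher = create_fetcher(options)      # RecursiveFetcher built from the parsed options
index = fetcher.start_fetch(args[1])   # fetch the URL; returns the saved file's path
print(index)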