From 596b52afac5365d3f7c48b017160fc09530e9c95 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Thu, 9 Apr 2009 12:47:26 -0700
Subject: [PATCH] Fix #2255 (calibre does not handle images with comma in their name)

---
 src/calibre/web/fetch/simple.py | 127 ++++++++++++++++----------------
 1 file changed, 65 insertions(+), 62 deletions(-)

diff --git a/src/calibre/web/fetch/simple.py b/src/calibre/web/fetch/simple.py
index 4da3f4019c..90262056bb 100644
--- a/src/calibre/web/fetch/simple.py
+++ b/src/calibre/web/fetch/simple.py
@@ -28,10 +28,10 @@ class closing(object):
 
     def __init__(self, thing):
         self.thing = thing
-        
+
     def __enter__(self):
         return self.thing
-        
+
     def __exit__(self, *exc_info):
         try:
             self.thing.close()
@@ -55,43 +55,43 @@ def save_soup(soup, target):
     for meta in metas:
         if 'charset' in meta.get('content', '').lower():
             meta.replaceWith(nm)
-    
+
     selfdir = os.path.dirname(target)
-    
+
     for tag in soup.findAll(['img', 'link', 'a']):
         for key in ('src', 'href'):
             path = tag.get(key, None)
             if path and os.path.isfile(path) and os.path.exists(path) and os.path.isabs(path):
                 tag[key] = unicode_path(relpath(path, selfdir).replace(os.sep, '/'))
-    
+
     html = unicode(soup)
     with open(target, 'wb') as f:
         f.write(html.encode('utf-8'))
-    
+
 class response(str):
-    
+
     def __new__(cls, *args):
         obj = super(response, cls).__new__(cls, *args)
         obj.newurl = None
         return obj
-    
+
 class DummyLock(object):
-    
+
     def __enter__(self, *args): return self
     def __exit__(self, *args): pass
 
 class RecursiveFetcher(object, LoggingInterface):
-    LINK_FILTER = tuple(re.compile(i, re.IGNORECASE) for i in 
+    LINK_FILTER = tuple(re.compile(i, re.IGNORECASE) for i in
                 ('.exe\s*$', '.mp3\s*$', '.ogg\s*$', '^\s*mailto:', '^\s*$'))
     #ADBLOCK_FILTER = tuple(re.compile(i, re.IGNORECASE) for it in
     #                        (
-    #                        
+    #
     #                        )
     #                        )
     CSS_IMPORT_PATTERN = re.compile(r'\@import\s+url\((.*?)\)', re.IGNORECASE)
     default_timeout = socket.getdefaulttimeout() # Needed here as it is used in __del__
     DUMMY_LOCK = DummyLock()
-    
+
     def __init__(self, options, logger, image_map={}, css_map={}, job_info=None):
         LoggingInterface.__init__(self, logger)
         self.base_dir = os.path.abspath(os.path.expanduser(options.dir))
@@ -123,19 +123,19 @@ class RecursiveFetcher(object, LoggingInterface):
         self.remove_tags_after = getattr(options, 'remove_tags_after', None)
         self.remove_tags_before = getattr(options, 'remove_tags_before', None)
         self.keep_only_tags = getattr(options, 'keep_only_tags', [])
-        self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup) 
+        self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup)
         self.postprocess_html_ext= getattr(options, 'postprocess_html', None)
         self.download_stylesheets = not options.no_stylesheets
         self.show_progress = True
         self.failed_links = []
         self.job_info = job_info
-        
+
     def get_soup(self, src):
         nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
         nmassage.extend(self.preprocess_regexps)
         nmassage += [(re.compile(r'<!DOCTYPE .+?>', re.DOTALL), lambda m: '')] # Some websites have buggy doctype declarations that mess up beautifulsoup
         soup = BeautifulSoup(xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0], markupMassage=nmassage)
-        
+
         if self.keep_only_tags:
             body = Tag(soup, 'body')
             try:
@@ -147,7 +147,7 @@ class RecursiveFetcher(object, LoggingInterface):
                 soup.find('body').replaceWith(body)
             except AttributeError: # soup has no body element
                 pass
-            
+
         def remove_beyond(tag, next):
             while tag is not None and tag.name != 'body':
                 after = getattr(tag, next)
@@ -156,31 +156,34 @@ class RecursiveFetcher(object, LoggingInterface):
                     after.extract()
                     after = ns
                 tag = tag.parent
-        
+
         if self.remove_tags_after is not None:
             rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
             for spec in rt:
                 tag = soup.find(**spec)
                 remove_beyond(tag, 'nextSibling')
-        
+
         if self.remove_tags_before is not None:
             tag = soup.find(**self.remove_tags_before)
             remove_beyond(tag, 'previousSibling')
-        
+
         for kwds in self.remove_tags:
             for tag in soup.findAll(**kwds):
                 tag.extract()
         return self.preprocess_html_ext(soup)
-        
-        
+
+
     def fetch_url(self, url):
         data = None
         self.log_debug('Fetching %s', url)
-        delta = time.time() - self.last_fetch_at 
+        delta = time.time() - self.last_fetch_at
         if delta < self.delay:
             time.sleep(delta)
-        if re.search(r'\s+', url) is not None:
-            url = quote(url)
+        if re.search(r'\s+|,', url) is not None:
+            purl = list(urlparse.urlparse(url))
+            for i in range(2, 6):
+                purl[i] = quote(purl[i])
+            url = urlparse.urlunparse(purl)
         with self.browser_lock:
             try:
                 with closing(self.browser.open(url)) as f:
@@ -196,38 +199,38 @@ class RecursiveFetcher(object, LoggingInterface):
                     with closing(self.browser.open(url)) as f:
                         data = response(f.read()+f.read())
                         data.newurl = f.geturl()
-                else: 
+                else:
                     raise err
             finally:
                 self.last_fetch_at = time.time()
         return data
 
-        
+
     def start_fetch(self, url):
         soup = BeautifulSoup(u'<a href="'+url+'" />')
         self.log_info('Downloading')
         res = self.process_links(soup, url, 0, into_dir='')
         self.log_info('%s saved to %s', url, res)
         return res
-        
+
     def is_link_ok(self, url):
         for i in self.__class__.LINK_FILTER:
             if i.search(url):
                 return False
         return True
-        
+
     def is_link_wanted(self, url):
         if self.filter_regexps:
             for f in self.filter_regexps:
                 if f.search(url):
-                    return False 
+                    return False
         if self.match_regexps:
            for m in self.match_regexps:
                if m.search(url):
                    return True
            return False
        return True
-        
+
     def process_stylesheets(self, soup, baseurl):
         diskpath = unicode_path(os.path.join(self.current_dir, 'stylesheets'))
         if not os.path.exists(diskpath):
@@ -254,7 +257,7 @@ class RecursiveFetcher(object, LoggingInterface):
                     x.write(data)
                 tag['href'] = stylepath
             else:
-                for ns in tag.findAll(text=True): 
+                for ns in tag.findAll(text=True):
                     src = str(ns)
                     m = self.__class__.CSS_IMPORT_PATTERN.search(src)
                     if m:
@@ -278,9 +281,9 @@ class RecursiveFetcher(object, LoggingInterface):
                         with open(stylepath, 'wb') as x:
                             x.write(data)
                         ns.replaceWith(src.replace(m.group(1), stylepath))
-        
-        
-            
+
+
+
     def process_images(self, soup, baseurl):
         diskpath = unicode_path(os.path.join(self.current_dir, 'images'))
         if not os.path.exists(diskpath):
@@ -323,7 +326,7 @@ class RecursiveFetcher(object, LoggingInterface):
                     traceback.print_exc()
                 continue
 
-    def absurl(self, baseurl, tag, key, filter=True): 
+    def absurl(self, baseurl, tag, key, filter=True):
         iurl = tag[key]
         parts = urlparse.urlsplit(iurl)
         if not parts.netloc and not parts.path:
@@ -337,26 +340,26 @@ class RecursiveFetcher(object, LoggingInterface):
             self.log_debug('Filtered link: '+iurl)
             return None
         return iurl
-        
+
     def normurl(self, url):
         parts = list(urlparse.urlsplit(url))
         parts[4] = ''
         return urlparse.urlunsplit(parts)
-        
+
     def localize_link(self, tag, key, path):
         parts = urlparse.urlsplit(tag[key])
         suffix = '#'+parts.fragment if parts.fragment else ''
         tag[key] = path+suffix
-        
+
     def process_return_links(self, soup, baseurl):
         for tag in soup.findAll(lambda tag: tag.name.lower()=='a' and tag.has_key('href')):
-            iurl = self.absurl(baseurl, tag, 'href') 
+            iurl = self.absurl(baseurl, tag, 'href')
             if not iurl:
                 continue
             nurl = self.normurl(iurl)
             if self.filemap.has_key(nurl):
                 self.localize_link(tag, 'href', self.filemap[nurl])
-        
+
     def process_links(self, soup, baseurl, recursion_level, into_dir='links'):
         res = ''
         diskpath = os.path.join(self.current_dir, into_dir)
@@ -366,7 +369,7 @@ class RecursiveFetcher(object, LoggingInterface):
         try:
             self.current_dir = diskpath
             tags = list(soup.findAll('a', href=True))
-            
+
             for c, tag in enumerate(tags):
                 if self.show_progress:
                     print '.',
@@ -396,9 +399,9 @@ class RecursiveFetcher(object, LoggingInterface):
                         dsrc = dsrc.decode(self.encoding, 'ignore')
                     else:
                         dsrc = xml_to_unicode(dsrc, self.verbose)[0]
-                    
+
                     soup = self.get_soup(dsrc)
-                    
+
                     base = soup.find('base', href=True)
                     if base is not None:
                         newbaseurl = base['href']
@@ -406,7 +409,7 @@ class RecursiveFetcher(object, LoggingInterface):
                     self.process_images(soup, newbaseurl)
                     if self.download_stylesheets:
                         self.process_stylesheets(soup, newbaseurl)
-                    
+
                     _fname = basename(iurl)
                     if not isinstance(_fname, unicode):
                         _fname.decode('latin1', 'replace')
@@ -420,17 +423,17 @@ class RecursiveFetcher(object, LoggingInterface):
                         self.log_debug('Processing links...')
                         self.process_links(soup, newbaseurl, recursion_level+1)
                     else:
-                        self.process_return_links(soup, newbaseurl) 
+                        self.process_return_links(soup, newbaseurl)
                         self.log_debug('Recursion limit reached. Skipping links in %s', iurl)
-                    
+
                     if callable(self.postprocess_html_ext):
-                        soup = self.postprocess_html_ext(soup, 
+                        soup = self.postprocess_html_ext(soup,
                                 c==0 and recursion_level==0 and not getattr(self, 'called_first', False),
                                 self.job_info)
-                    
+
                     if c==0 and recursion_level == 0:
                         self.called_first = True
-                    
+
                     save_soup(soup, res)
                     self.localize_link(tag, 'href', res)
                 except Exception, err:
@@ -439,34 +442,34 @@ class RecursiveFetcher(object, LoggingInterface):
                     self.log_debug('Error: %s', str(err), exc_info=True)
                 finally:
                     self.current_dir = diskpath
-                    self.files += 1 
+                    self.files += 1
         finally:
             self.current_dir = prev_dir
             if self.show_progress:
                 print
         return res
-        
+
     def __del__(self):
         dt = getattr(self, 'default_timeout', None)
         if dt is not None: socket.setdefaulttimeout(dt)
-    
+
 
 def option_parser(usage=_('%prog URL\n\nWhere URL is for example http://google.com')):
     parser = OptionParser(usage=usage)
-    parser.add_option('-d', '--base-dir', 
+    parser.add_option('-d', '--base-dir',
                       help=_('Base directory into which URL is saved. Default is %default'),
                       default='.', type='string', dest='dir')
-    parser.add_option('-t', '--timeout', 
+    parser.add_option('-t', '--timeout',
                       help=_('Timeout in seconds to wait for a response from the server. Default: %default s'),
                       default=10.0, type='float', dest='timeout')
-    parser.add_option('-r', '--max-recursions', default=1, 
+    parser.add_option('-r', '--max-recursions', default=1,
                       help=_('Maximum number of levels to recurse i.e. depth of links to follow. Default %default'),
                       type='int', dest='max_recursions')
     parser.add_option('-n', '--max-files', default=sys.maxint, type='int', dest='max_files',
                       help=_('The maximum number of files to download. This only applies to files from <a href> tags. Default is %default'))
     parser.add_option('--delay', default=0, dest='delay', type='int',
                       help=_('Minimum interval in seconds between consecutive fetches. Default is %default s'))
-    parser.add_option('--encoding', default=None, 
+    parser.add_option('--encoding', default=None,
                       help=_('The character encoding for the websites you are trying to download. The default is to try and guess the encoding.'))
     parser.add_option('--match-regexp', default=[], action='append', dest='match_regexps',
                       help=_('Only links that match this regular expression will be followed. This option can be specified multiple times, in which case as long as a link matches any one regexp, it will be followed. By default all links are followed.'))
@@ -487,15 +490,15 @@ def create_fetcher(options, logger=None, image_map={}):
     return RecursiveFetcher(options, logger, image_map={})
 
 def main(args=sys.argv):
-    parser = option_parser()    
+    parser = option_parser()
     options, args = parser.parse_args(args)
     if len(args) != 2:
         parser.print_help()
         return 1
 
-    
-    fetcher = create_fetcher(options)
-    fetcher.start_fetch(args[1])
-    
-if __name__ == '__main__':
+    fetcher = create_fetcher(options)
+    fetcher.start_fetch(args[1])
+
+
+if __name__ == '__main__':
     sys.exit(main())
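
The only substantive change above is the hunk in fetch_url(): instead of percent-quoting the whole URL, the patched code quotes only the path, params, query and fragment components (indices 2-5 of the 6-tuple returned by urlparse.urlparse), leaving the scheme and host untouched, and it now triggers on commas as well as whitespace. Below is a minimal standalone sketch of that logic for illustration only; the helper name and sample URL are invented and are not part of the patch.

# Python 2, matching the patched module; mirrors the new quoting logic
# in fetch_url() above (sketch only, not part of the patch).
import re
import urlparse
from urllib import quote

def quote_url_components(url):
    # Quote only URLs that contain whitespace or a comma, as the patch does.
    if re.search(r'\s+|,', url) is None:
        return url
    purl = list(urlparse.urlparse(url))
    # Indices 2-5 are path, params, query and fragment; scheme (0) and
    # netloc (1) are left alone so 'http://' and the host are not mangled.
    for i in range(2, 6):
        purl[i] = quote(purl[i])
    return urlparse.urlunparse(purl)

print quote_url_components('http://example.com/images/cover, page 1.jpg')
# -> http://example.com/images/cover%2C%20page%201.jpg

Because quote() treats '/' as safe by default, path separators survive, so quoting component by component handles image names containing commas or spaces without breaking the rest of the URL, whereas quoting the whole string (the old code) would also escape the ':' of the scheme.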