Fix #2255 (calibre does not handle images with comma in their name)

Kovid Goyal 2009-04-09 12:47:26 -07:00
parent 74cf23ddc8
commit 596b52afac
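
The change below makes fetch_url percent-encode the path, params, query and fragment of any URL containing whitespace or a comma, instead of the old behaviour of quoting the whole URL string only when it contained whitespace. A minimal sketch of the idea, using the same Python 2 urlparse/urllib calls the module already relies on (the sample URL is made up for illustration):

    import urllib
    import urlparse

    url = 'http://example.com/images/cover,small.jpg'  # hypothetical image URL with a comma

    purl = list(urlparse.urlparse(url))
    for i in range(2, 6):                  # quote only path, params, query and fragment
        purl[i] = urllib.quote(purl[i])
    print(urlparse.urlunparse(purl))       # -> http://example.com/images/cover%2Csmall.jpg

Quoting component by component leaves the scheme and host untouched, so only the comma (and any whitespace) in the file name is rewritten as a percent escape.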


@@ -28,10 +28,10 @@ class closing(object):
def __init__(self, thing):
self.thing = thing
def __enter__(self):
return self.thing
def __exit__(self, *exc_info):
try:
self.thing.close()
@@ -55,43 +55,43 @@ def save_soup(soup, target):
for meta in metas:
if 'charset' in meta.get('content', '').lower():
meta.replaceWith(nm)
selfdir = os.path.dirname(target)
for tag in soup.findAll(['img', 'link', 'a']):
for key in ('src', 'href'):
path = tag.get(key, None)
if path and os.path.isfile(path) and os.path.exists(path) and os.path.isabs(path):
tag[key] = unicode_path(relpath(path, selfdir).replace(os.sep, '/'))
html = unicode(soup)
with open(target, 'wb') as f:
f.write(html.encode('utf-8'))
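
For context on the rewriting in save_soup above: absolute paths to already-downloaded local files are converted back into relative references before the HTML is written out. A small standalone sketch of that step, using os.path.relpath as a stand-in for the relpath helper the module imports (the paths are made up):

    import os

    target = '/tmp/feed/index.html'             # hypothetical output HTML file
    selfdir = os.path.dirname(target)
    img = '/tmp/feed/images/cover,1.jpg'        # hypothetical local image file

    # Same transformation as in save_soup: make the path relative to the HTML file's
    # directory and normalise separators to '/' for use in src/href attributes.
    print(os.path.relpath(img, selfdir).replace(os.sep, '/'))   # -> images/cover,1.jpg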
class response(str):
def __new__(cls, *args):
obj = super(response, cls).__new__(cls, *args)
obj.newurl = None
return obj
class DummyLock(object):
def __enter__(self, *args): return self
def __exit__(self, *args): pass
class RecursiveFetcher(object, LoggingInterface):
LINK_FILTER = tuple(re.compile(i, re.IGNORECASE) for i in
('.exe\s*$', '.mp3\s*$', '.ogg\s*$', '^\s*mailto:', '^\s*$'))
#ADBLOCK_FILTER = tuple(re.compile(i, re.IGNORECASE) for it in
# (
#
#
# )
# )
CSS_IMPORT_PATTERN = re.compile(r'\@import\s+url\((.*?)\)', re.IGNORECASE)
default_timeout = socket.getdefaulttimeout() # Needed here as it is used in __del__
DUMMY_LOCK = DummyLock()
def __init__(self, options, logger, image_map={}, css_map={}, job_info=None):
LoggingInterface.__init__(self, logger)
self.base_dir = os.path.abspath(os.path.expanduser(options.dir))
@@ -123,19 +123,19 @@ class RecursiveFetcher(object, LoggingInterface):
self.remove_tags_after = getattr(options, 'remove_tags_after', None)
self.remove_tags_before = getattr(options, 'remove_tags_before', None)
self.keep_only_tags = getattr(options, 'keep_only_tags', [])
self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup)
self.postprocess_html_ext= getattr(options, 'postprocess_html', None)
self.download_stylesheets = not options.no_stylesheets
self.show_progress = True
self.failed_links = []
self.job_info = job_info
def get_soup(self, src):
nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
nmassage.extend(self.preprocess_regexps)
nmassage += [(re.compile(r'<!DOCTYPE .+?>', re.DOTALL), lambda m: '')] # Some websites have buggy doctype declarations that mess up beautifulsoup
soup = BeautifulSoup(xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0], markupMassage=nmassage)
if self.keep_only_tags:
body = Tag(soup, 'body')
try:
@@ -147,7 +147,7 @@ class RecursiveFetcher(object, LoggingInterface):
soup.find('body').replaceWith(body)
except AttributeError: # soup has no body element
pass
def remove_beyond(tag, next):
while tag is not None and tag.name != 'body':
after = getattr(tag, next)
@@ -156,31 +156,34 @@ class RecursiveFetcher(object, LoggingInterface):
after.extract()
after = ns
tag = tag.parent
if self.remove_tags_after is not None:
rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
for spec in rt:
tag = soup.find(**spec)
remove_beyond(tag, 'nextSibling')
if self.remove_tags_before is not None:
tag = soup.find(**self.remove_tags_before)
remove_beyond(tag, 'previousSibling')
for kwds in self.remove_tags:
for tag in soup.findAll(**kwds):
tag.extract()
return self.preprocess_html_ext(soup)
def fetch_url(self, url):
data = None
self.log_debug('Fetching %s', url)
delta = time.time() - self.last_fetch_at
if delta < self.delay:
time.sleep(delta)
-if re.search(r'\s+', url) is not None:
-    url = quote(url)
+if re.search(r'\s+|,', url) is not None:
+    purl = list(urlparse.urlparse(url))
+    for i in range(2, 6):
+        purl[i] = quote(purl[i])
+    url = urlparse.urlunparse(purl)
with self.browser_lock:
try:
with closing(self.browser.open(url)) as f:
@@ -196,38 +199,38 @@ class RecursiveFetcher(object, LoggingInterface):
with closing(self.browser.open(url)) as f:
data = response(f.read()+f.read())
data.newurl = f.geturl()
else:
raise err
finally:
self.last_fetch_at = time.time()
return data
def start_fetch(self, url):
soup = BeautifulSoup(u'<a href="'+url+'" />')
self.log_info('Downloading')
res = self.process_links(soup, url, 0, into_dir='')
self.log_info('%s saved to %s', url, res)
return res
def is_link_ok(self, url):
for i in self.__class__.LINK_FILTER:
if i.search(url):
return False
return True
def is_link_wanted(self, url):
if self.filter_regexps:
for f in self.filter_regexps:
if f.search(url):
return False
if self.match_regexps:
for m in self.match_regexps:
if m.search(url):
return True
return False
return True
def process_stylesheets(self, soup, baseurl):
diskpath = unicode_path(os.path.join(self.current_dir, 'stylesheets'))
if not os.path.exists(diskpath):
@@ -254,7 +257,7 @@ class RecursiveFetcher(object, LoggingInterface):
x.write(data)
tag['href'] = stylepath
else:
for ns in tag.findAll(text=True):
src = str(ns)
m = self.__class__.CSS_IMPORT_PATTERN.search(src)
if m:
@@ -278,9 +281,9 @@ class RecursiveFetcher(object, LoggingInterface):
with open(stylepath, 'wb') as x:
x.write(data)
ns.replaceWith(src.replace(m.group(1), stylepath))
def process_images(self, soup, baseurl):
diskpath = unicode_path(os.path.join(self.current_dir, 'images'))
if not os.path.exists(diskpath):
@@ -323,7 +326,7 @@ class RecursiveFetcher(object, LoggingInterface):
traceback.print_exc()
continue
def absurl(self, baseurl, tag, key, filter=True):
iurl = tag[key]
parts = urlparse.urlsplit(iurl)
if not parts.netloc and not parts.path:
@@ -337,26 +340,26 @@ class RecursiveFetcher(object, LoggingInterface):
self.log_debug('Filtered link: '+iurl)
return None
return iurl
def normurl(self, url):
parts = list(urlparse.urlsplit(url))
parts[4] = ''
return urlparse.urlunsplit(parts)
def localize_link(self, tag, key, path):
parts = urlparse.urlsplit(tag[key])
suffix = '#'+parts.fragment if parts.fragment else ''
tag[key] = path+suffix
def process_return_links(self, soup, baseurl):
for tag in soup.findAll(lambda tag: tag.name.lower()=='a' and tag.has_key('href')):
iurl = self.absurl(baseurl, tag, 'href')
if not iurl:
continue
nurl = self.normurl(iurl)
if self.filemap.has_key(nurl):
self.localize_link(tag, 'href', self.filemap[nurl])
def process_links(self, soup, baseurl, recursion_level, into_dir='links'):
res = ''
diskpath = os.path.join(self.current_dir, into_dir)
@@ -366,7 +369,7 @@ class RecursiveFetcher(object, LoggingInterface):
try:
self.current_dir = diskpath
tags = list(soup.findAll('a', href=True))
for c, tag in enumerate(tags):
if self.show_progress:
print '.',
@@ -396,9 +399,9 @@ class RecursiveFetcher(object, LoggingInterface):
dsrc = dsrc.decode(self.encoding, 'ignore')
else:
dsrc = xml_to_unicode(dsrc, self.verbose)[0]
soup = self.get_soup(dsrc)
base = soup.find('base', href=True)
if base is not None:
newbaseurl = base['href']
@@ -406,7 +409,7 @@ class RecursiveFetcher(object, LoggingInterface):
self.process_images(soup, newbaseurl)
if self.download_stylesheets:
self.process_stylesheets(soup, newbaseurl)
_fname = basename(iurl)
if not isinstance(_fname, unicode):
_fname.decode('latin1', 'replace')
@@ -420,17 +423,17 @@ class RecursiveFetcher(object, LoggingInterface):
self.log_debug('Processing links...')
self.process_links(soup, newbaseurl, recursion_level+1)
else:
self.process_return_links(soup, newbaseurl)
self.log_debug('Recursion limit reached. Skipping links in %s', iurl)
if callable(self.postprocess_html_ext):
soup = self.postprocess_html_ext(soup,
c==0 and recursion_level==0 and not getattr(self, 'called_first', False),
self.job_info)
if c==0 and recursion_level == 0:
self.called_first = True
save_soup(soup, res)
self.localize_link(tag, 'href', res)
except Exception, err:
@@ -439,34 +442,34 @@ class RecursiveFetcher(object, LoggingInterface):
self.log_debug('Error: %s', str(err), exc_info=True)
finally:
self.current_dir = diskpath
self.files += 1
finally:
self.current_dir = prev_dir
if self.show_progress:
print
return res
def __del__(self):
dt = getattr(self, 'default_timeout', None)
if dt is not None:
socket.setdefaulttimeout(dt)
def option_parser(usage=_('%prog URL\n\nWhere URL is for example http://google.com')):
parser = OptionParser(usage=usage)
parser.add_option('-d', '--base-dir',
help=_('Base directory into which URL is saved. Default is %default'),
default='.', type='string', dest='dir')
parser.add_option('-t', '--timeout',
help=_('Timeout in seconds to wait for a response from the server. Default: %default s'),
default=10.0, type='float', dest='timeout')
parser.add_option('-r', '--max-recursions', default=1,
help=_('Maximum number of levels to recurse i.e. depth of links to follow. Default %default'),
type='int', dest='max_recursions')
parser.add_option('-n', '--max-files', default=sys.maxint, type='int', dest='max_files',
help=_('The maximum number of files to download. This only applies to files from <a href> tags. Default is %default'))
parser.add_option('--delay', default=0, dest='delay', type='int',
help=_('Minimum interval in seconds between consecutive fetches. Default is %default s'))
parser.add_option('--encoding', default=None,
help=_('The character encoding for the websites you are trying to download. The default is to try and guess the encoding.'))
parser.add_option('--match-regexp', default=[], action='append', dest='match_regexps',
help=_('Only links that match this regular expression will be followed. This option can be specified multiple times, in which case as long as a link matches any one regexp, it will be followed. By default all links are followed.'))
@@ -487,15 +490,15 @@ def create_fetcher(options, logger=None, image_map={}):
return RecursiveFetcher(options, logger, image_map={})
def main(args=sys.argv):
parser = option_parser()
options, args = parser.parse_args(args)
if len(args) != 2:
parser.print_help()
return 1
fetcher = create_fetcher(options)
fetcher.start_fetch(args[1])
if __name__ == '__main__':
sys.exit(main())
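
For reference, main() above already shows the intended programmatic use; a hedged sketch of driving the fetcher directly, assuming option_parser and create_fetcher are imported from this module and that create_fetcher builds a default logger when none is passed (the URL and option values are invented):

    parser = option_parser()
    # parse_args mirrors main(): the first element plays the role of sys.argv[0]
    options, args = parser.parse_args(['prog', '--max-recursions', '1', 'http://example.com'])
    fetcher = create_fetcher(options)
    index = fetcher.start_fetch(args[1])   # recursively downloads and returns the saved root page's path
    print(index)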