mirror of
				https://github.com/immich-app/immich.git
				synced 2025-10-26 08:12:33 -04:00 
			
		
		
		
	* feat(ml): introduce support of onnxruntime-rocm for AMD GPU
* try mutex for algo cache
use OrtMutex
* bump versions, run on mich
use 3.12
use 1.19.2
* acquire lock before any changes can be made
guard algo benchmark results
mark mutex as mutable
re-add /bin/sh (?)
use 3.10
use 6.1.2
* use composite cache key
1.19.2
fix variable name
fix variable reference
aaaaaaaaaaaaaaaaaaaa
* bump deps
* disable algo caching
* fix gha
* try ubuntu runner
* actually fix the gha
* update patch
* skip mimalloc preload for rocm
* increase build threads
* increase timeout for rocm
* Revert "increase timeout for rocm"
This reverts commit 2c4452f5d132198ed381a7b262b4a5cab5114b5f.
* attempt migraphx
* set migraphx_home
* Revert "set migraphx_home"
This reverts commit c121d3e48754b3bce100636f8d666deec58a44b7.
* Revert "attempt migraphx"
This reverts commit 521f9fb72dbe506dc6cb8faeb6494817d87265c6.
* migraphx, take two
* bump rocm
* allow cpu
* try only targeting migraphx
* skip tests
* migraph ❌
* known issues
* target gfx900 and gfx1102
* mention `HSA_USE_SVM`
* update lock
* set device id for rocm
---------
Co-authored-by: Mehdi GHESH <mehdi.ghesh@hotmail.fr>
		
	
			
		
			
				
	
	
		
			180 lines
		
	
	
		
			7.3 KiB
		
	
	
	
		
			Diff
		
	
	
	
	
	
			
		
		
	
	
			180 lines
		
	
	
		
			7.3 KiB
		
	
	
	
		
			Diff
		
	
	
	
	
	
| commit 16839b58d9b3c3162a67ce5d776b36d4d24e801f
 | |
| Author: mertalev <101130780+mertalev@users.noreply.github.com>
 | |
| Date:   Wed Mar 5 11:25:38 2025 -0500
 | |
| 
 | |
|     disable algo caching (attributed to @dmnieto in https://github.com/microsoft/onnxruntime/pull/19567)
 | |
| 
 | |
| diff --git a/onnxruntime/core/providers/rocm/nn/conv.cc b/onnxruntime/core/providers/rocm/nn/conv.cc
 | |
| index d7f47d07a8..4060a2af52 100644
 | |
| --- a/onnxruntime/core/providers/rocm/nn/conv.cc
 | |
| +++ b/onnxruntime/core/providers/rocm/nn/conv.cc
 | |
| @@ -127,7 +127,6 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)
 | |
| 
 | |
|      if (w_dims_changed) {
 | |
|        s_.last_w_dims = gsl::make_span(w_dims);
 | |
| -      s_.cached_benchmark_fwd_results.clear();
 | |
|      }
 | |
| 
 | |
|      ORT_RETURN_IF_ERROR(conv_attrs_.ValidateInputShape(X->Shape(), W->Shape(), channels_last, channels_last));
 | |
| @@ -277,35 +276,6 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)
 | |
|        HIP_CALL_THROW(hipMalloc(&s_.b_zero, malloc_size));
 | |
|        HIP_CALL_THROW(hipMemsetAsync(s_.b_zero, 0, malloc_size, Stream(context)));
 | |
|      }
 | |
| -
 | |
| -    if (!s_.cached_benchmark_fwd_results.contains(x_dims_miopen)) {
 | |
| -      miopenConvAlgoPerf_t perf;
 | |
| -      int algo_count = 1;
 | |
| -      const ROCMExecutionProvider* rocm_ep = static_cast<const ROCMExecutionProvider*>(this->Info().GetExecutionProvider());
 | |
| -      static constexpr int num_algos = MIOPEN_CONVOLUTION_FWD_ALGO_COUNT;
 | |
| -      size_t max_ws_size = rocm_ep->GetMiopenConvUseMaxWorkspace() ? GetMaxWorkspaceSize(GetMiopenHandle(context), s_, kAllAlgos, num_algos, rocm_ep->GetDeviceId())
 | |
| -                                                                   : AlgoSearchWorkspaceSize;
 | |
| -      IAllocatorUniquePtr<void> algo_search_workspace = GetTransientScratchBuffer<void>(max_ws_size);
 | |
| -      MIOPEN_RETURN_IF_ERROR(miopenFindConvolutionForwardAlgorithm(
 | |
| -          GetMiopenHandle(context),
 | |
| -          s_.x_tensor,
 | |
| -          s_.x_data,
 | |
| -          s_.w_desc,
 | |
| -          s_.w_data,
 | |
| -          s_.conv_desc,
 | |
| -          s_.y_tensor,
 | |
| -          s_.y_data,
 | |
| -          1,            // requestedAlgoCount
 | |
| -          &algo_count,  // returnedAlgoCount
 | |
| -          &perf,
 | |
| -          algo_search_workspace.get(),
 | |
| -          max_ws_size,
 | |
| -          false));  // Do not do exhaustive algo search.
 | |
| -      s_.cached_benchmark_fwd_results.insert(x_dims_miopen, {perf.fwd_algo, perf.memory});
 | |
| -    }
 | |
| -    const auto& perf = s_.cached_benchmark_fwd_results.at(x_dims_miopen);
 | |
| -    s_.fwd_algo = perf.fwd_algo;
 | |
| -    s_.workspace_bytes = perf.memory;
 | |
|    } else {
 | |
|      // set Y
 | |
|      s_.Y = context->Output(0, TensorShape(s_.y_dims));
 | |
| @@ -319,6 +289,31 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)
 | |
|        s_.y_data = reinterpret_cast<HipT*>(s_.Y->MutableData<T>());
 | |
|      }
 | |
|    }
 | |
| +
 | |
| +  miopenConvAlgoPerf_t perf;
 | |
| +  int algo_count = 1;
 | |
| +  const ROCMExecutionProvider* rocm_ep = static_cast<const ROCMExecutionProvider*>(this->Info().GetExecutionProvider());
 | |
| +  static constexpr int num_algos = MIOPEN_CONVOLUTION_FWD_ALGO_COUNT;
 | |
| +  size_t max_ws_size = rocm_ep->GetMiopenConvUseMaxWorkspace() ? GetMaxWorkspaceSize(GetMiopenHandle(context), s_, kAllAlgos, num_algos, rocm_ep->GetDeviceId())
 | |
| +                                                                : AlgoSearchWorkspaceSize;
 | |
| +  IAllocatorUniquePtr<void> algo_search_workspace = GetTransientScratchBuffer<void>(max_ws_size);
 | |
| +  MIOPEN_RETURN_IF_ERROR(miopenFindConvolutionForwardAlgorithm(
 | |
| +      GetMiopenHandle(context),
 | |
| +      s_.x_tensor,
 | |
| +      s_.x_data,
 | |
| +      s_.w_desc,
 | |
| +      s_.w_data,
 | |
| +      s_.conv_desc,
 | |
| +      s_.y_tensor,
 | |
| +      s_.y_data,
 | |
| +      1,            // requestedAlgoCount
 | |
| +      &algo_count,  // returnedAlgoCount
 | |
| +      &perf,
 | |
| +      algo_search_workspace.get(),
 | |
| +      max_ws_size,
 | |
| +      false));  // Do not do exhaustive algo search.
 | |
| +  s_.fwd_algo = perf.fwd_algo;
 | |
| +  s_.workspace_bytes = perf.memory;
 | |
|    return Status::OK();
 | |
|  }
 | |
| 
 | |
| diff --git a/onnxruntime/core/providers/rocm/nn/conv.h b/onnxruntime/core/providers/rocm/nn/conv.h
 | |
| index bc9846203e..d54218f258 100644
 | |
| --- a/onnxruntime/core/providers/rocm/nn/conv.h
 | |
| +++ b/onnxruntime/core/providers/rocm/nn/conv.h
 | |
| @@ -108,9 +108,6 @@ class lru_unordered_map {
 | |
|    list_type lru_list_;
 | |
|  };
 | |
| 
 | |
| -// cached miopen descriptors
 | |
| -constexpr size_t MAX_CACHED_ALGO_PERF_RESULTS = 10000;
 | |
| -
 | |
|  template <typename AlgoPerfType>
 | |
|  struct MiopenConvState {
 | |
|    // if x/w dims changed, update algo and miopenTensors
 | |
| @@ -148,9 +145,6 @@ struct MiopenConvState {
 | |
|      decltype(AlgoPerfType().memory) memory;
 | |
|    };
 | |
| 
 | |
| -  lru_unordered_map<TensorShapeVector, PerfFwdResultParams, vector_hash> cached_benchmark_fwd_results{MAX_CACHED_ALGO_PERF_RESULTS};
 | |
| -  lru_unordered_map<TensorShapeVector, PerfBwdResultParams, vector_hash> cached_benchmark_bwd_results{MAX_CACHED_ALGO_PERF_RESULTS};
 | |
| -
 | |
|    // Some properties needed to support asymmetric padded Conv nodes
 | |
|    bool post_slicing_required;
 | |
|    TensorShapeVector slice_starts;
 | |
| diff --git a/onnxruntime/core/providers/rocm/nn/conv_transpose.cc b/onnxruntime/core/providers/rocm/nn/conv_transpose.cc
 | |
| index 7447113fdf..a662e35b2e 100644
 | |
| --- a/onnxruntime/core/providers/rocm/nn/conv_transpose.cc
 | |
| +++ b/onnxruntime/core/providers/rocm/nn/conv_transpose.cc
 | |
| @@ -76,7 +76,6 @@ Status ConvTranspose<T, NHWC>::DoConvTranspose(OpKernelContext* context, bool dy
 | |
| 
 | |
|        if (w_dims_changed) {
 | |
|          s_.last_w_dims = gsl::make_span(w_dims);
 | |
| -        s_.cached_benchmark_bwd_results.clear();
 | |
|        }
 | |
| 
 | |
|        ConvTransposeAttributes::Prepare p;
 | |
| @@ -126,35 +125,29 @@ Status ConvTranspose<T, NHWC>::DoConvTranspose(OpKernelContext* context, bool dy
 | |
|        }
 | |
| 
 | |
|        y_data = reinterpret_cast<HipT*>(p.Y->MutableData<T>());
 | |
| -
 | |
| -      if (!s_.cached_benchmark_bwd_results.contains(x_dims)) {
 | |
| -        IAllocatorUniquePtr<void> algo_search_workspace = GetScratchBuffer<void>(AlgoSearchWorkspaceSize, context->GetComputeStream());
 | |
| -
 | |
| -        miopenConvAlgoPerf_t perf;
 | |
| -        int algo_count = 1;
 | |
| -        MIOPEN_RETURN_IF_ERROR(miopenFindConvolutionBackwardDataAlgorithm(
 | |
| -            GetMiopenHandle(context),
 | |
| -            s_.x_tensor,
 | |
| -            x_data,
 | |
| -            s_.w_desc,
 | |
| -            w_data,
 | |
| -            s_.conv_desc,
 | |
| -            s_.y_tensor,
 | |
| -            y_data,
 | |
| -            1,
 | |
| -            &algo_count,
 | |
| -            &perf,
 | |
| -            algo_search_workspace.get(),
 | |
| -            AlgoSearchWorkspaceSize,
 | |
| -            false));
 | |
| -        s_.cached_benchmark_bwd_results.insert(x_dims, {perf.bwd_data_algo, perf.memory});
 | |
| -      }
 | |
| -
 | |
| -      const auto& perf = s_.cached_benchmark_bwd_results.at(x_dims);
 | |
| -      s_.bwd_data_algo = perf.bwd_data_algo;
 | |
| -      s_.workspace_bytes = perf.memory;
 | |
|      }
 | |
| 
 | |
| +    IAllocatorUniquePtr<void> algo_search_workspace = GetScratchBuffer<void>(AlgoSearchWorkspaceSize, context->GetComputeStream());
 | |
| +    miopenConvAlgoPerf_t perf;
 | |
| +    int algo_count = 1;
 | |
| +    MIOPEN_RETURN_IF_ERROR(miopenFindConvolutionBackwardDataAlgorithm(
 | |
| +        GetMiopenHandle(context),
 | |
| +        s_.x_tensor,
 | |
| +        x_data,
 | |
| +        s_.w_desc,
 | |
| +        w_data,
 | |
| +        s_.conv_desc,
 | |
| +        s_.y_tensor,
 | |
| +        y_data,
 | |
| +        1,
 | |
| +        &algo_count,
 | |
| +        &perf,
 | |
| +        algo_search_workspace.get(),
 | |
| +        AlgoSearchWorkspaceSize,
 | |
| +        false));
 | |
| +    s_.bwd_data_algo = perf.bwd_data_algo;
 | |
| +    s_.workspace_bytes = perf.memory;
 | |
| +
 | |
|      // The following block will be executed in case there has been no change in the shapes of the
 | |
|      // input and the filter compared to the previous run
 | |
|      if (!y_data) {
 |