try mutex for algo cache

use OrtMutex
2025-11-18 20:43:20 -05:00 · 2024-12-19 20:33:31 -05:00 · 2024-12-19 20:33:31 -05:00 · f30fac971a
commit f30fac971a
parent fe26ccd1b7
3 changed files with 32 additions and 183 deletions
--- a/machine-learning/0001-fix-avoid-race-condition-for-rocm-conv-algo-caching.patch
+++ b/machine-learning/0001-fix-avoid-race-condition-for-rocm-conv-algo-caching.patch
@ -0,0 +1,25 @@
 From e267bc9bab8b3873dba57323ddcd9a9d09a1211e Mon Sep 17 00:00:00 2001
 From: mertalev <101130780+mertalev@users.noreply.github.com>
 Date: Fri, 20 Dec 2024 00:59:21 -0500
 Subject: [PATCH] fix: avoid race condition for rocm conv algo caching
 ---
 onnxruntime/core/providers/rocm/nn/conv.cc | 2 ++
 1 file changed, 2 insertions(+)
 diff --git a/onnxruntime/core/providers/rocm/nn/conv.cc b/onnxruntime/core/providers/rocm/nn/conv.cc
 index d7f47d07a8..ec438287ac 100644
 --- a/onnxruntime/core/providers/rocm/nn/conv.cc
 +++ b/onnxruntime/core/providers/rocm/nn/conv.cc
@@ -278,6 +278,8 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)
       HIP_CALL_THROW(hipMemsetAsync(s_.b_zero, 0, malloc_size, Stream(context)));
     }
 +    // lock is needed to avoid race condition during algo search
 +    std::lock_guard<OrtMutex> lock(s_.mutex);
     if (!s_.cached_benchmark_fwd_results.contains(x_dims_miopen)) {
       miopenConvAlgoPerf_t perf;
       int algo_count = 1;
 -- 
 2.43.0
--- a/machine-learning/Dockerfile
+++ b/machine-learning/Dockerfile
@ -23,7 +23,7 @@ WORKDIR /code
 RUN apt-get update && apt-get install -y --no-install-recommends wget git python3.10-venv
 # Install same version as the Dockerfile provided by onnxruntime
-RUN wget https://github.com/Kitware/CMake/releases/download/v3.27.3/cmake-3.27.3-linux-x86_64.sh && \
+RUN wget -nv https://github.com/Kitware/CMake/releases/download/v3.27.3/cmake-3.27.3-linux-x86_64.sh && \
    chmod +x cmake-3.27.3-linux-x86_64.sh && \
    mkdir -p /code/cmake-3.27.3-linux-x86_64 && \
    ./cmake-3.27.3-linux-x86_64.sh --skip-license --prefix=/code/cmake-3.27.3-linux-x86_64 && \
@ -37,8 +37,8 @@ WORKDIR /code/onnxruntime
 # EDIT PR
 # While there's still this PR open, we need to compile on the branch of the PR
 # https://github.com/microsoft/onnxruntime/pull/19567
-COPY ./rocm-PR19567.patch /tmp/
+COPY ./0001-fix-avoid-race-condition-for-rocm-conv-algo-caching.patch /tmp/
-RUN git apply /tmp/rocm-PR19567.patch
+RUN git apply /tmp/0001-fix-avoid-race-condition-for-rocm-conv-algo-caching.patch
 # END EDIT PR
 RUN /bin/sh ./dockerfiles/scripts/install_common_deps.sh
 # I ran into a compilation error when parallelizing the build
@ -77,10 +77,10 @@ FROM prod-cpu AS prod-openvino
 RUN apt-get update && \
    apt-get install --no-install-recommends -yqq ocl-icd-libopencl1 wget && \
-    wget https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.17384.11/intel-igc-core_1.0.17384.11_amd64.deb && \
+    wget -nv https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.17384.11/intel-igc-core_1.0.17384.11_amd64.deb && \
-    wget https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.17384.11/intel-igc-opencl_1.0.17384.11_amd64.deb && \
+    wget -nv https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.17384.11/intel-igc-opencl_1.0.17384.11_amd64.deb && \
-    wget https://github.com/intel/compute-runtime/releases/download/24.31.30508.7/intel-opencl-icd_24.31.30508.7_amd64.deb && \
+    wget -nv https://github.com/intel/compute-runtime/releases/download/24.31.30508.7/intel-opencl-icd_24.31.30508.7_amd64.deb && \
-    wget https://github.com/intel/compute-runtime/releases/download/24.31.30508.7/libigdgmm12_22.4.1_amd64.deb && \
+    wget -nv https://github.com/intel/compute-runtime/releases/download/24.31.30508.7/libigdgmm12_22.4.1_amd64.deb && \
    dpkg -i *.deb && \
    rm *.deb && \
    apt-get remove wget -yqq && \
--- a/machine-learning/rocm-PR19567.patch
+++ b/machine-learning/rocm-PR19567.patch
@ -1,176 +0,0 @@
 From a598a88db258f82a6e4bca75810921bd6bcee7e0 Mon Sep 17 00:00:00 2001
 From: David Nieto <dmnieto@gmail.com>
 Date: Sat, 17 Feb 2024 11:23:12 -0800
 Subject: [PATCH] Disable algo caching in ROCM EP
 Similar to the work done by Liangxijun-1001 in
 https://github.com/apache/tvm/pull/16178 the ROCM spec mandates calling
 miopenFindConvolution*Algorithm() before using any Convolution API
 This is the link to the porting guide describing this requirement
 https://rocmdocs.amd.com/projects/MIOpen/en/latest/MIOpen_Porting_Guide.html
 Thus, this change disables the algo cache and enforces the official
 API semantics
 Signed-off-by: David Nieto <dmnieto@gmail.com>
 ---
 onnxruntime/core/providers/rocm/nn/conv.cc    | 61 +++++++++----------
 onnxruntime/core/providers/rocm/nn/conv.h     |  6 --
 .../core/providers/rocm/nn/conv_transpose.cc  | 17 +++---
 3 files changed, 36 insertions(+), 48 deletions(-)
 diff --git a/onnxruntime/core/providers/rocm/nn/conv.cc b/onnxruntime/core/providers/rocm/nn/conv.cc
 index 6214ec7bc0ea..b08aceca48b1 100644
 --- a/onnxruntime/core/providers/rocm/nn/conv.cc
 +++ b/onnxruntime/core/providers/rocm/nn/conv.cc
@@ -125,10 +125,8 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)
     if (input_dims_changed)
       s_.last_x_dims = gsl::make_span(x_dims);
 -    if (w_dims_changed) {
 +    if (w_dims_changed)
       s_.last_w_dims = gsl::make_span(w_dims);
 -      s_.cached_benchmark_fwd_results.clear();
 -    }
     ORT_RETURN_IF_ERROR(conv_attrs_.ValidateInputShape(X->Shape(), W->Shape(), channels_last, channels_last));
@@ -277,35 +275,6 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)
       HIP_CALL_THROW(hipMalloc(&s_.b_zero, malloc_size));
       HIP_CALL_THROW(hipMemsetAsync(s_.b_zero, 0, malloc_size, Stream(context)));
     }
 -
 -    if (!s_.cached_benchmark_fwd_results.contains(x_dims_miopen)) {
 -      miopenConvAlgoPerf_t perf;
 -      int algo_count = 1;
 -      const ROCMExecutionProvider* rocm_ep = static_cast<const ROCMExecutionProvider*>(this->Info().GetExecutionProvider());
 -      static constexpr int num_algos = MIOPEN_CONVOLUTION_FWD_ALGO_COUNT;
 -      size_t max_ws_size = rocm_ep->GetMiopenConvUseMaxWorkspace() ? GetMaxWorkspaceSize(GetMiopenHandle(context), s_, kAllAlgos, num_algos)
 -                                                                   : AlgoSearchWorkspaceSize;
 -      IAllocatorUniquePtr<void> algo_search_workspace = GetTransientScratchBuffer<void>(max_ws_size);
 -      MIOPEN_RETURN_IF_ERROR(miopenFindConvolutionForwardAlgorithm(
 -          GetMiopenHandle(context),
 -          s_.x_tensor,
 -          s_.x_data,
 -          s_.w_desc,
 -          s_.w_data,
 -          s_.conv_desc,
 -          s_.y_tensor,
 -          s_.y_data,
 -          1,            // requestedAlgoCount
 -          &algo_count,  // returnedAlgoCount
 -          &perf,
 -          algo_search_workspace.get(),
 -          max_ws_size,
 -          false));  // Do not do exhaustive algo search.
 -      s_.cached_benchmark_fwd_results.insert(x_dims_miopen, {perf.fwd_algo, perf.memory});
 -    }
 -    const auto& perf = s_.cached_benchmark_fwd_results.at(x_dims_miopen);
 -    s_.fwd_algo = perf.fwd_algo;
 -    s_.workspace_bytes = perf.memory;
   } else {
     // set Y
     s_.Y = context->Output(0, TensorShape(s_.y_dims));
@@ -319,6 +288,34 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)
       s_.y_data = reinterpret_cast<HipT*>(s_.Y->MutableData<T>());
     }
   }
 +  {
 +    /* FindConvolution must always be called by the runtime */
 +    TensorShapeVector x_dims_miopen{x_dims.begin(), x_dims.end()};
 +    miopenConvAlgoPerf_t perf;
 +    int algo_count = 1;
 +    const ROCMExecutionProvider* rocm_ep = static_cast<const ROCMExecutionProvider*>(this->Info().GetExecutionProvider());
 +    static constexpr int num_algos = MIOPEN_CONVOLUTION_FWD_ALGO_COUNT;
 +    size_t max_ws_size = rocm_ep->GetMiopenConvUseMaxWorkspace() ? GetMaxWorkspaceSize(GetMiopenHandle(context), s_, kAllAlgos, num_algos)
 +                                                                   : AlgoSearchWorkspaceSize;
 +    IAllocatorUniquePtr<void> algo_search_workspace = GetTransientScratchBuffer<void>(max_ws_size);
 +    MIOPEN_RETURN_IF_ERROR(miopenFindConvolutionForwardAlgorithm(
 +        GetMiopenHandle(context),
 +        s_.x_tensor,
 +        s_.x_data,
 +        s_.w_desc,
 +        s_.w_data,
 +        s_.conv_desc,
 +        s_.y_tensor,
 +        s_.y_data,
 +        1,            // requestedAlgoCount
 +        &algo_count,  // returnedAlgoCount
 +        &perf,
 +        algo_search_workspace.get(),
 +        max_ws_size,
 +        false));  // Do not do exhaustive algo search.
 +    s_.fwd_algo = perf.fwd_algo;
 +    s_.workspace_bytes = perf.memory;
 +  }
   return Status::OK();
 }
 diff --git a/onnxruntime/core/providers/rocm/nn/conv.h b/onnxruntime/core/providers/rocm/nn/conv.h
 index bc9846203e57..d54218f25854 100644
 --- a/onnxruntime/core/providers/rocm/nn/conv.h
 +++ b/onnxruntime/core/providers/rocm/nn/conv.h
@@ -108,9 +108,6 @@ class lru_unordered_map {
   list_type lru_list_;
 };
 -// cached miopen descriptors
 -constexpr size_t MAX_CACHED_ALGO_PERF_RESULTS = 10000;
 -
 template <typename AlgoPerfType>
 struct MiopenConvState {
   // if x/w dims changed, update algo and miopenTensors
@@ -148,9 +145,6 @@ struct MiopenConvState {
     decltype(AlgoPerfType().memory) memory;
   };
 -  lru_unordered_map<TensorShapeVector, PerfFwdResultParams, vector_hash> cached_benchmark_fwd_results{MAX_CACHED_ALGO_PERF_RESULTS};
 -  lru_unordered_map<TensorShapeVector, PerfBwdResultParams, vector_hash> cached_benchmark_bwd_results{MAX_CACHED_ALGO_PERF_RESULTS};
 -
   // Some properties needed to support asymmetric padded Conv nodes
   bool post_slicing_required;
   TensorShapeVector slice_starts;
 diff --git a/onnxruntime/core/providers/rocm/nn/conv_transpose.cc b/onnxruntime/core/providers/rocm/nn/conv_transpose.cc
 index 7447113fdf84..45ed4c8ac37a 100644
 --- a/onnxruntime/core/providers/rocm/nn/conv_transpose.cc
 +++ b/onnxruntime/core/providers/rocm/nn/conv_transpose.cc
@@ -76,7 +76,6 @@ Status ConvTranspose<T, NHWC>::DoConvTranspose(OpKernelContext* context, bool dy
       if (w_dims_changed) {
         s_.last_w_dims = gsl::make_span(w_dims);
 -        s_.cached_benchmark_bwd_results.clear();
       }
       ConvTransposeAttributes::Prepare p;
@@ -127,12 +126,13 @@ Status ConvTranspose<T, NHWC>::DoConvTranspose(OpKernelContext* context, bool dy
       y_data = reinterpret_cast<HipT*>(p.Y->MutableData<T>());
 -      if (!s_.cached_benchmark_bwd_results.contains(x_dims)) {
 -        IAllocatorUniquePtr<void> algo_search_workspace = GetScratchBuffer<void>(AlgoSearchWorkspaceSize, context->GetComputeStream());
 -
 -        miopenConvAlgoPerf_t perf;
 -        int algo_count = 1;
 -        MIOPEN_RETURN_IF_ERROR(miopenFindConvolutionBackwardDataAlgorithm(
 +    }
 +    // The following is required before calling convolution, we cannot cache the results
 +    {
 +      IAllocatorUniquePtr<void> algo_search_workspace = GetScratchBuffer<void>(AlgoSearchWorkspaceSize, context->GetComputeStream());
 +      miopenConvAlgoPerf_t perf;
 +      int algo_count = 1;
 +      MIOPEN_RETURN_IF_ERROR(miopenFindConvolutionBackwardDataAlgorithm(
             GetMiopenHandle(context),
             s_.x_tensor,
             x_data,
@@ -147,10 +147,7 @@ Status ConvTranspose<T, NHWC>::DoConvTranspose(OpKernelContext* context, bool dy
             algo_search_workspace.get(),
             AlgoSearchWorkspaceSize,
             false));
 -        s_.cached_benchmark_bwd_results.insert(x_dims, {perf.bwd_data_algo, perf.memory});
 -      }
 -      const auto& perf = s_.cached_benchmark_bwd_results.at(x_dims);
       s_.bwd_data_algo = perf.bwd_data_algo;
       s_.workspace_bytes = perf.memory;
     }