From f30fac971aad970ac154e2b880b8cb4226c9cddf Mon Sep 17 00:00:00 2001
From: mertalev <101130780+mertalev@users.noreply.github.com>
Date: Thu, 19 Dec 2024 20:33:31 -0500
Subject: [PATCH] try mutex for algo cache

use OrtMutex
---
 ...condition-for-rocm-conv-algo-caching.patch |  25 +++
 machine-learning/Dockerfile                   |  14 +-
 machine-learning/rocm-PR19567.patch           | 176 ------------------
 3 files changed, 32 insertions(+), 183 deletions(-)
 create mode 100644 machine-learning/0001-fix-avoid-race-condition-for-rocm-conv-algo-caching.patch
 delete mode 100644 machine-learning/rocm-PR19567.patch

diff --git a/machine-learning/0001-fix-avoid-race-condition-for-rocm-conv-algo-caching.patch b/machine-learning/0001-fix-avoid-race-condition-for-rocm-conv-algo-caching.patch
new file mode 100644
index 0000000000..a8fa9df0e2
--- /dev/null
+++ b/machine-learning/0001-fix-avoid-race-condition-for-rocm-conv-algo-caching.patch
@@ -0,0 +1,25 @@
+From e267bc9bab8b3873dba57323ddcd9a9d09a1211e Mon Sep 17 00:00:00 2001
+From: mertalev <101130780+mertalev@users.noreply.github.com>
+Date: Fri, 20 Dec 2024 00:59:21 -0500
+Subject: [PATCH] fix: avoid race condition for rocm conv algo caching
+
+---
+ onnxruntime/core/providers/rocm/nn/conv.cc | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/onnxruntime/core/providers/rocm/nn/conv.cc b/onnxruntime/core/providers/rocm/nn/conv.cc
+index d7f47d07a8..ec438287ac 100644
+--- a/onnxruntime/core/providers/rocm/nn/conv.cc
++++ b/onnxruntime/core/providers/rocm/nn/conv.cc
+@@ -278,6 +278,8 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)
+       HIP_CALL_THROW(hipMemsetAsync(s_.b_zero, 0, malloc_size, Stream(context)));
+     }
+ 
++    // lock is needed to avoid race condition during algo search
++    std::lock_guard<OrtMutex> lock(s_.mutex);
+     if (!s_.cached_benchmark_fwd_results.contains(x_dims_miopen)) {
+       miopenConvAlgoPerf_t perf;
+       int algo_count = 1;
+-- 
+2.43.0
+
diff --git a/machine-learning/Dockerfile b/machine-learning/Dockerfile
index 4724e52bb5..3d7e0c2cd7 100644
--- a/machine-learning/Dockerfile
+++ b/machine-learning/Dockerfile
@@ -23,7 +23,7 @@ WORKDIR /code
 
 RUN apt-get update && apt-get install -y --no-install-recommends wget git python3.10-venv
 # Install same version as the Dockerfile provided by onnxruntime
-RUN wget https://github.com/Kitware/CMake/releases/download/v3.27.3/cmake-3.27.3-linux-x86_64.sh && \
+RUN wget -nv https://github.com/Kitware/CMake/releases/download/v3.27.3/cmake-3.27.3-linux-x86_64.sh && \
     chmod +x cmake-3.27.3-linux-x86_64.sh && \
     mkdir -p /code/cmake-3.27.3-linux-x86_64 && \
     ./cmake-3.27.3-linux-x86_64.sh --skip-license --prefix=/code/cmake-3.27.3-linux-x86_64 && \
@@ -37,8 +37,8 @@ WORKDIR /code/onnxruntime
 # EDIT PR
 # While there's still this PR open, we need to compile on the branch of the PR
 # https://github.com/microsoft/onnxruntime/pull/19567
-COPY ./rocm-PR19567.patch /tmp/
-RUN git apply /tmp/rocm-PR19567.patch
+COPY ./0001-fix-avoid-race-condition-for-rocm-conv-algo-caching.patch /tmp/
+RUN git apply /tmp/0001-fix-avoid-race-condition-for-rocm-conv-algo-caching.patch
 # END EDIT PR
 RUN /bin/sh ./dockerfiles/scripts/install_common_deps.sh
 # I ran into a compilation error when parallelizing the build
@@ -77,10 +77,10 @@ FROM prod-cpu AS prod-openvino
 
 RUN apt-get update && \
     apt-get install --no-install-recommends -yqq ocl-icd-libopencl1 wget && \
-    wget https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.17384.11/intel-igc-core_1.0.17384.11_amd64.deb && \
-    wget https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.17384.11/intel-igc-opencl_1.0.17384.11_amd64.deb && \
-    wget https://github.com/intel/compute-runtime/releases/download/24.31.30508.7/intel-opencl-icd_24.31.30508.7_amd64.deb && \
-    wget https://github.com/intel/compute-runtime/releases/download/24.31.30508.7/libigdgmm12_22.4.1_amd64.deb && \
+    wget -nv https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.17384.11/intel-igc-core_1.0.17384.11_amd64.deb && \
+    wget -nv https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.17384.11/intel-igc-opencl_1.0.17384.11_amd64.deb && \
+    wget -nv https://github.com/intel/compute-runtime/releases/download/24.31.30508.7/intel-opencl-icd_24.31.30508.7_amd64.deb && \
+    wget -nv https://github.com/intel/compute-runtime/releases/download/24.31.30508.7/libigdgmm12_22.4.1_amd64.deb && \
     dpkg -i *.deb && \
     rm *.deb && \
     apt-get remove wget -yqq && \
diff --git a/machine-learning/rocm-PR19567.patch b/machine-learning/rocm-PR19567.patch
deleted file mode 100644
index 04ad8412eb..0000000000
--- a/machine-learning/rocm-PR19567.patch
+++ /dev/null
@@ -1,176 +0,0 @@
-From a598a88db258f82a6e4bca75810921bd6bcee7e0 Mon Sep 17 00:00:00 2001
-From: David Nieto
-Date: Sat, 17 Feb 2024 11:23:12 -0800
-Subject: [PATCH] Disable algo caching in ROCM EP
-
-Similar to the work done by Liangxijun-1001 in
-https://github.com/apache/tvm/pull/16178 the ROCM spec mandates calling
-miopenFindConvolution*Algorithm() before using any Convolution API
-
-This is the link to the porting guide describing this requirement
-https://rocmdocs.amd.com/projects/MIOpen/en/latest/MIOpen_Porting_Guide.html
-
-Thus, this change disables the algo cache and enforces the official
-API semantics
-
-Signed-off-by: David Nieto
----
- onnxruntime/core/providers/rocm/nn/conv.cc   | 61 +++++++++----------
- onnxruntime/core/providers/rocm/nn/conv.h    |  6 --
- .../core/providers/rocm/nn/conv_transpose.cc | 17 +++---
- 3 files changed, 36 insertions(+), 48 deletions(-)
-
-diff --git a/onnxruntime/core/providers/rocm/nn/conv.cc b/onnxruntime/core/providers/rocm/nn/conv.cc
-index 6214ec7bc0ea..b08aceca48b1 100644
---- a/onnxruntime/core/providers/rocm/nn/conv.cc
-+++ b/onnxruntime/core/providers/rocm/nn/conv.cc
-@@ -125,10 +125,8 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)
-   if (input_dims_changed)
-     s_.last_x_dims = gsl::make_span(x_dims);
- 
--  if (w_dims_changed) {
-+  if (w_dims_changed)
-     s_.last_w_dims = gsl::make_span(w_dims);
--    s_.cached_benchmark_fwd_results.clear();
--  }
- 
-   ORT_RETURN_IF_ERROR(conv_attrs_.ValidateInputShape(X->Shape(), W->Shape(), channels_last, channels_last));
- 
-@@ -277,35 +275,6 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)
-       HIP_CALL_THROW(hipMalloc(&s_.b_zero, malloc_size));
-       HIP_CALL_THROW(hipMemsetAsync(s_.b_zero, 0, malloc_size, Stream(context)));
-     }
--
--    if (!s_.cached_benchmark_fwd_results.contains(x_dims_miopen)) {
--      miopenConvAlgoPerf_t perf;
--      int algo_count = 1;
--      const ROCMExecutionProvider* rocm_ep = static_cast<const ROCMExecutionProvider*>(this->Info().GetExecutionProvider());
--      static constexpr int num_algos = MIOPEN_CONVOLUTION_FWD_ALGO_COUNT;
--      size_t max_ws_size = rocm_ep->GetMiopenConvUseMaxWorkspace() ? GetMaxWorkspaceSize(GetMiopenHandle(context), s_, kAllAlgos, num_algos)
--                                                                   : AlgoSearchWorkspaceSize;
--      IAllocatorUniquePtr<void> algo_search_workspace = GetTransientScratchBuffer<void>(max_ws_size);
--      MIOPEN_RETURN_IF_ERROR(miopenFindConvolutionForwardAlgorithm(
--          GetMiopenHandle(context),
--          s_.x_tensor,
--          s_.x_data,
--          s_.w_desc,
--          s_.w_data,
--          s_.conv_desc,
--          s_.y_tensor,
--          s_.y_data,
--          1,            // requestedAlgoCount
--          &algo_count,  // returnedAlgoCount
--          &perf,
--          algo_search_workspace.get(),
--          max_ws_size,
--          false));  // Do not do exhaustive algo search.
--      s_.cached_benchmark_fwd_results.insert(x_dims_miopen, {perf.fwd_algo, perf.memory});
--    }
--    const auto& perf = s_.cached_benchmark_fwd_results.at(x_dims_miopen);
--    s_.fwd_algo = perf.fwd_algo;
--    s_.workspace_bytes = perf.memory;
-   } else {
-     // set Y
-     s_.Y = context->Output(0, TensorShape(s_.y_dims));
-@@ -319,6 +288,34 @@
-       s_.y_data = reinterpret_cast<HipT*>(s_.Y->MutableData());
-     }
-   }
-+  {
-+    /* FindConvolution must always be called by the runtime */
-+    TensorShapeVector x_dims_miopen{x_dims.begin(), x_dims.end()};
-+    miopenConvAlgoPerf_t perf;
-+    int algo_count = 1;
-+    const ROCMExecutionProvider* rocm_ep = static_cast<const ROCMExecutionProvider*>(this->Info().GetExecutionProvider());
-+    static constexpr int num_algos = MIOPEN_CONVOLUTION_FWD_ALGO_COUNT;
-+    size_t max_ws_size = rocm_ep->GetMiopenConvUseMaxWorkspace() ? GetMaxWorkspaceSize(GetMiopenHandle(context), s_, kAllAlgos, num_algos)
-+                                                                 : AlgoSearchWorkspaceSize;
-+    IAllocatorUniquePtr<void> algo_search_workspace = GetTransientScratchBuffer<void>(max_ws_size);
-+    MIOPEN_RETURN_IF_ERROR(miopenFindConvolutionForwardAlgorithm(
-+        GetMiopenHandle(context),
-+        s_.x_tensor,
-+        s_.x_data,
-+        s_.w_desc,
-+        s_.w_data,
-+        s_.conv_desc,
-+        s_.y_tensor,
-+        s_.y_data,
-+        1,            // requestedAlgoCount
-+        &algo_count,  // returnedAlgoCount
-+        &perf,
-+        algo_search_workspace.get(),
-+        max_ws_size,
-+        false));  // Do not do exhaustive algo search.
-+    s_.fwd_algo = perf.fwd_algo;
-+    s_.workspace_bytes = perf.memory;
-+  }
-   return Status::OK();
- }
- 
-diff --git a/onnxruntime/core/providers/rocm/nn/conv.h b/onnxruntime/core/providers/rocm/nn/conv.h
-index bc9846203e57..d54218f25854 100644
---- a/onnxruntime/core/providers/rocm/nn/conv.h
-+++ b/onnxruntime/core/providers/rocm/nn/conv.h
-@@ -108,9 +108,6 @@ class lru_unordered_map {
-   list_type lru_list_;
- };
- 
--// cached miopen descriptors
--constexpr size_t MAX_CACHED_ALGO_PERF_RESULTS = 10000;
--
- template <typename AlgoPerfType>
- struct MiopenConvState {
-   // if x/w dims changed, update algo and miopenTensors
-@@ -148,9 +145,6 @@
-     decltype(AlgoPerfType().memory) memory;
-   };
- 
--  lru_unordered_map cached_benchmark_fwd_results{MAX_CACHED_ALGO_PERF_RESULTS};
--  lru_unordered_map cached_benchmark_bwd_results{MAX_CACHED_ALGO_PERF_RESULTS};
--
-   // Some properties needed to support asymmetric padded Conv nodes
-   bool post_slicing_required;
-   TensorShapeVector slice_starts;
-diff --git a/onnxruntime/core/providers/rocm/nn/conv_transpose.cc b/onnxruntime/core/providers/rocm/nn/conv_transpose.cc
-index 7447113fdf84..45ed4c8ac37a 100644
---- a/onnxruntime/core/providers/rocm/nn/conv_transpose.cc
-+++ b/onnxruntime/core/providers/rocm/nn/conv_transpose.cc
-@@ -76,7 +76,6 @@ Status ConvTranspose<T, NHWC>::DoConvTranspose(OpKernelContext* context, bool dy
- 
-   if (w_dims_changed) {
-     s_.last_w_dims = gsl::make_span(w_dims);
--    s_.cached_benchmark_bwd_results.clear();
-   }
- 
-   ConvTransposeAttributes::Prepare p;
-@@ -127,12 +126,13 @@ Status ConvTranspose<T, NHWC>::DoConvTranspose(OpKernelContext* context, bool dy
- 
-     y_data = reinterpret_cast<HipT*>(p.Y->MutableData());
- 
--    if (!s_.cached_benchmark_bwd_results.contains(x_dims)) {
--      IAllocatorUniquePtr<void> algo_search_workspace = GetScratchBuffer<void>(AlgoSearchWorkspaceSize, context->GetComputeStream());
--
--      miopenConvAlgoPerf_t perf;
--      int algo_count = 1;
--      MIOPEN_RETURN_IF_ERROR(miopenFindConvolutionBackwardDataAlgorithm(
-+  }
-+  // The following is required before calling convolution, we cannot cache the results
-+  {
-+    IAllocatorUniquePtr<void> algo_search_workspace = GetScratchBuffer<void>(AlgoSearchWorkspaceSize, context->GetComputeStream());
-+    miopenConvAlgoPerf_t perf;
-+    int algo_count = 1;
-+    MIOPEN_RETURN_IF_ERROR(miopenFindConvolutionBackwardDataAlgorithm(
-           GetMiopenHandle(context),
-           s_.x_tensor,
-           x_data,
-@@ -147,10 +147,7 @@ Status ConvTranspose<T, NHWC>::DoConvTranspose(OpKernelContext* context, bool dy
-           algo_search_workspace.get(),
-           AlgoSearchWorkspaceSize,
-           false));
--      s_.cached_benchmark_bwd_results.insert(x_dims, {perf.bwd_data_algo, perf.memory});
--    }
- 
--    const auto& perf = s_.cached_benchmark_bwd_results.at(x_dims);
-     s_.bwd_data_algo = perf.bwd_data_algo;
-     s_.workspace_bytes = perf.memory;
-   }