From f19cf206baae585aa0b65f370caf1d19a72efdf9 Mon Sep 17 00:00:00 2001
From: mertalev <101130780+mertalev@users.noreply.github.com>
Date: Wed, 5 Mar 2025 09:36:27 -0500
Subject: [PATCH] disable algo caching

---
 .../0001-fix-rocm-conv-thread-safety.patch    | 150 ---------------
 machine-learning/Dockerfile                   |  13 +-
 machine-learning/rocm-PR19567.patch           | 176 ++++++++++++++++++
 3 files changed, 181 insertions(+), 158 deletions(-)
 delete mode 100644 machine-learning/0001-fix-rocm-conv-thread-safety.patch
 create mode 100644 machine-learning/rocm-PR19567.patch

diff --git a/machine-learning/0001-fix-rocm-conv-thread-safety.patch b/machine-learning/0001-fix-rocm-conv-thread-safety.patch
deleted file mode 100644
index bd0e759f69..0000000000
--- a/machine-learning/0001-fix-rocm-conv-thread-safety.patch
+++ /dev/null
@@ -1,150 +0,0 @@
-From 350e3237eadb738a0d96295a62f2eed96653c315 Mon Sep 17 00:00:00 2001
-From: mertalev <101130780+mertalev@users.noreply.github.com>
-Date: Fri, 20 Dec 2024 00:59:21 -0500
-Subject: [PATCH 1/1] fix: avoid race condition for rocm conv algo caching
-
----
- onnxruntime/core/providers/rocm/nn/conv.cc    |  8 ++++----
- onnxruntime/core/providers/rocm/nn/conv.h     | 14 ++++++++++++--
- .../core/providers/rocm/nn/conv_transpose.cc  |  8 ++++----
- 3 files changed, 20 insertions(+), 10 deletions(-)
-
-diff --git a/onnxruntime/core/providers/rocm/nn/conv.cc b/onnxruntime/core/providers/rocm/nn/conv.cc
-index d7f47d07a8..98b6b69212 100644
---- a/onnxruntime/core/providers/rocm/nn/conv.cc
-+++ b/onnxruntime/core/providers/rocm/nn/conv.cc
-@@ -127,7 +127,6 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)
- 
-   if (w_dims_changed) {
-     s_.last_w_dims = gsl::make_span(w_dims);
--    s_.cached_benchmark_fwd_results.clear();
-   }
- 
-   ORT_RETURN_IF_ERROR(conv_attrs_.ValidateInputShape(X->Shape(), W->Shape(), channels_last, channels_last));
- 
-@@ -278,7 +277,8 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)
-     HIP_CALL_THROW(hipMemsetAsync(s_.b_zero, 0, malloc_size, Stream(context)));
-   }
- 
--  if (!s_.cached_benchmark_fwd_results.contains(x_dims_miopen)) {
-+  const std::size_t algo_key = HashConvAlgoKey(x_dims_miopen, w_dims);
-+  if (!s_.cached_benchmark_fwd_results.contains(algo_key)) {
-     miopenConvAlgoPerf_t perf;
-     int algo_count = 1;
-     const ROCMExecutionProvider* rocm_ep = static_cast<const ROCMExecutionProvider*>(this->Info().GetExecutionProvider());
-@@ -301,9 +301,9 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)
-         algo_search_workspace.get(),
-         max_ws_size,
-         false));  // Do not do exhaustive algo search.
--    s_.cached_benchmark_fwd_results.insert(x_dims_miopen, {perf.fwd_algo, perf.memory});
-+    s_.cached_benchmark_fwd_results.insert(algo_key, {perf.fwd_algo, perf.memory});
-   }
--  const auto& perf = s_.cached_benchmark_fwd_results.at(x_dims_miopen);
-+  const auto& perf = s_.cached_benchmark_fwd_results.at(algo_key);
-   s_.fwd_algo = perf.fwd_algo;
-   s_.workspace_bytes = perf.memory;
-   } else {
-diff --git a/onnxruntime/core/providers/rocm/nn/conv.h b/onnxruntime/core/providers/rocm/nn/conv.h
-index bc9846203e..b1ca5f8e4b 100644
---- a/onnxruntime/core/providers/rocm/nn/conv.h
-+++ b/onnxruntime/core/providers/rocm/nn/conv.h
-@@ -43,6 +43,11 @@ struct vector_hash {
-   }
- };
- 
-+inline std::size_t HashConvAlgoKey(const TensorShapeVector& x_dims, const TensorShapeVector& w_dims) {
-+  vector_hash vh;
-+  return vh(x_dims) ^ vh(w_dims);
-+}
-+
- template <typename Key, typename T, typename Hash = std::hash<Key>,
-           typename KeyEqual = std::equal_to<Key>,
-           typename ListAllocator = std::allocator<Key>>
-@@ -52,6 +57,7 @@ class lru_unordered_map {
-   lru_unordered_map(size_t max_size) : max_size_(max_size) {}
- 
-   void insert(const Key& key, const T& value) {
-+    std::lock_guard<std::mutex> guard(mutex_);
-     auto it = items_.find(key);
-     if (it != items_.end()) {
-       it->second.value = value;
-@@ -69,6 +75,7 @@ class lru_unordered_map {
-   }
- 
-   T& at(const Key& key) {
-+    std::lock_guard<std::mutex> guard(mutex_);
-     auto it = items_.find(key);
-     if (it == items_.end()) {
-       throw std::out_of_range("There is no such key in cache");
-@@ -78,6 +85,7 @@ class lru_unordered_map {
-   }
- 
-   bool contains(const Key& key) const {
-+    std::lock_guard<std::mutex> guard(mutex_);
-     return items_.find(key) != items_.end();
-   }
- 
-@@ -86,6 +94,7 @@ class lru_unordered_map {
-   }
- 
-   void clear() {
-+    std::lock_guard<std::mutex> guard(mutex_);
-     items_.clear();
-     lru_list_.clear();
-   }
-@@ -106,6 +115,7 @@ class lru_unordered_map {
-   size_t max_size_;
-   std::unordered_map<Key, item, Hash, KeyEqual> items_;
-   list_type lru_list_;
-+  mutable std::mutex mutex_;
- };
- 
- // cached miopen descriptors
-@@ -148,8 +158,8 @@ struct MiopenConvState {
-     decltype(AlgoPerfType().memory) memory;
-   };
- 
--  lru_unordered_map<TensorShapeVector, PerfFwdResultParams, vector_hash> cached_benchmark_fwd_results{MAX_CACHED_ALGO_PERF_RESULTS};
--  lru_unordered_map<TensorShapeVector, PerfBwdResultParams, vector_hash> cached_benchmark_bwd_results{MAX_CACHED_ALGO_PERF_RESULTS};
-+  lru_unordered_map<std::size_t, PerfFwdResultParams> cached_benchmark_fwd_results{MAX_CACHED_ALGO_PERF_RESULTS};
-+  lru_unordered_map<std::size_t, PerfBwdResultParams> cached_benchmark_bwd_results{MAX_CACHED_ALGO_PERF_RESULTS};
- 
-   // Some properties needed to support asymmetric padded Conv nodes
-   bool post_slicing_required;
-diff --git a/onnxruntime/core/providers/rocm/nn/conv_transpose.cc b/onnxruntime/core/providers/rocm/nn/conv_transpose.cc
-index 7447113fdf..dea9bf2a05 100644
---- a/onnxruntime/core/providers/rocm/nn/conv_transpose.cc
-+++ b/onnxruntime/core/providers/rocm/nn/conv_transpose.cc
-@@ -76,7 +76,6 @@ Status ConvTranspose<T, NHWC>::DoConvTranspose(OpKernelContext* context, bool dy
- 
-   if (w_dims_changed) {
-     s_.last_w_dims = gsl::make_span(w_dims);
--    s_.cached_benchmark_bwd_results.clear();
-   }
- 
-   ConvTransposeAttributes::Prepare p;
-@@ -127,7 +126,8 @@ Status ConvTranspose<T, NHWC>::DoConvTranspose(OpKernelContext* context, bool dy
- 
-     y_data = reinterpret_cast<HipT*>(p.Y->MutableData());
- 
--    if (!s_.cached_benchmark_bwd_results.contains(x_dims)) {
-+    const std::size_t algo_key = HashConvAlgoKey(x_dims, w_dims);
-+    if (!s_.cached_benchmark_bwd_results.contains(algo_key)) {
-       IAllocatorUniquePtr<void> algo_search_workspace = GetScratchBuffer<void>(AlgoSearchWorkspaceSize, context->GetComputeStream());
- 
-       miopenConvAlgoPerf_t perf;
-       int algo_count = 1;
-       MIOPEN_RETURN_IF_ERROR(miopenFindConvolutionBackwardDataAlgorithm(
-@@ -147,10 +147,10 @@ Status ConvTranspose<T, NHWC>::DoConvTranspose(OpKernelContext* context, bool dy
-          algo_search_workspace.get(),
-          AlgoSearchWorkspaceSize,
-          false));
--      s_.cached_benchmark_bwd_results.insert(x_dims, {perf.bwd_data_algo, perf.memory});
-+      s_.cached_benchmark_bwd_results.insert(algo_key, {perf.bwd_data_algo, perf.memory});
-     }
- 
--    const auto& perf = s_.cached_benchmark_bwd_results.at(x_dims);
-+    const auto& perf = s_.cached_benchmark_bwd_results.at(algo_key);
-     s_.bwd_data_algo = perf.bwd_data_algo;
-     s_.workspace_bytes = perf.memory;
-   }
--- 
-2.43.0
-
diff --git a/machine-learning/Dockerfile b/machine-learning/Dockerfile
index 88bedd8e49..2a82715e3a 100644
--- a/machine-learning/Dockerfile
+++ b/machine-learning/Dockerfile
@@ -15,15 +15,13 @@ RUN mkdir /opt/armnn && \
     cd /opt/ann && \
     sh build.sh
 
-# Warning: 26.3Gb of disk space required to pull this image
-# https://github.com/microsoft/onnxruntime/blob/main/dockerfiles/Dockerfile.rocm
-# 6.2 or later fails to build as of writing
+# Warning: 25GiB+ disk space required to pull this image
+# TODO: find a way to reduce the image size
 FROM rocm/dev-ubuntu-22.04:6.3.1-complete AS builder-rocm
 
 WORKDIR /code
 
 RUN apt-get update && apt-get install -y --no-install-recommends wget git python3.10-venv
-# Install same version as the Dockerfile provided by onnxruntime
 RUN wget -nv https://github.com/Kitware/CMake/releases/download/v3.30.1/cmake-3.30.1-linux-x86_64.sh && \
     chmod +x cmake-3.30.1-linux-x86_64.sh && \
     mkdir -p /code/cmake-3.30.1-linux-x86_64 && \
@@ -32,13 +30,12 @@ RUN wget -nv https://github.com/Kitware/CMake/releases/download/v3.30.1/cmake-3.
 
 ENV PATH=/code/cmake-3.30.1-linux-x86_64/bin:${PATH}
 
-# Prepare onnxruntime repository & build onnxruntime
-# 1.20.1 fails to build as of writing
 RUN git clone --single-branch --branch v1.20.1 --recursive "https://github.com/Microsoft/onnxruntime" onnxruntime
 WORKDIR /code/onnxruntime
 # Fix for multi-threading based on comments in https://github.com/microsoft/onnxruntime/pull/19567
-COPY ./0001-fix-rocm-conv-thread-safety.patch /tmp/
-RUN git apply /tmp/0001-fix-rocm-conv-thread-safety.patch
+# TODO: find a way to fix this without disabling algo caching
+COPY ./rocm-PR19567.patch /tmp/
+RUN git apply /tmp/rocm-PR19567.patch
 RUN /bin/sh ./dockerfiles/scripts/install_common_deps.sh
 # Note: the `parallel` setting uses a substantial amount of RAM
diff --git a/machine-learning/rocm-PR19567.patch b/machine-learning/rocm-PR19567.patch
new file mode 100644
index 0000000000..04ad8412eb
--- /dev/null
+++ b/machine-learning/rocm-PR19567.patch
@@ -0,0 +1,176 @@
+From a598a88db258f82a6e4bca75810921bd6bcee7e0 Mon Sep 17 00:00:00 2001
+From: David Nieto
+Date: Sat, 17 Feb 2024 11:23:12 -0800
+Subject: [PATCH] Disable algo caching in ROCM EP
+
+Similar to the work done by Liangxijun-1001 in
+https://github.com/apache/tvm/pull/16178, the ROCM spec mandates calling
+miopenFindConvolution*Algorithm() before using any Convolution API.
+
+This is the link to the porting guide describing this requirement:
+https://rocmdocs.amd.com/projects/MIOpen/en/latest/MIOpen_Porting_Guide.html
+
+Thus, this change disables the algo cache and enforces the official
+API semantics.
+
+Signed-off-by: David Nieto
+---
+ onnxruntime/core/providers/rocm/nn/conv.cc    | 61 +++++++++----------
+ onnxruntime/core/providers/rocm/nn/conv.h     |  6 --
+ .../core/providers/rocm/nn/conv_transpose.cc  | 17 +++---
+ 3 files changed, 36 insertions(+), 48 deletions(-)
+
+diff --git a/onnxruntime/core/providers/rocm/nn/conv.cc b/onnxruntime/core/providers/rocm/nn/conv.cc
+index 6214ec7bc0ea..b08aceca48b1 100644
+--- a/onnxruntime/core/providers/rocm/nn/conv.cc
++++ b/onnxruntime/core/providers/rocm/nn/conv.cc
+@@ -125,10 +125,8 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)
+   if (input_dims_changed)
+     s_.last_x_dims = gsl::make_span(x_dims);
+ 
+-  if (w_dims_changed) {
++  if (w_dims_changed)
+     s_.last_w_dims = gsl::make_span(w_dims);
+-    s_.cached_benchmark_fwd_results.clear();
+-  }
+ 
+   ORT_RETURN_IF_ERROR(conv_attrs_.ValidateInputShape(X->Shape(), W->Shape(), channels_last, channels_last));
+ 
+@@ -277,35 +275,6 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)
+     HIP_CALL_THROW(hipMalloc(&s_.b_zero, malloc_size));
+     HIP_CALL_THROW(hipMemsetAsync(s_.b_zero, 0, malloc_size, Stream(context)));
+   }
+-
+-  if (!s_.cached_benchmark_fwd_results.contains(x_dims_miopen)) {
+-    miopenConvAlgoPerf_t perf;
+-    int algo_count = 1;
+-    const ROCMExecutionProvider* rocm_ep = static_cast<const ROCMExecutionProvider*>(this->Info().GetExecutionProvider());
+-    static constexpr int num_algos = MIOPEN_CONVOLUTION_FWD_ALGO_COUNT;
+-    size_t max_ws_size = rocm_ep->GetMiopenConvUseMaxWorkspace() ? GetMaxWorkspaceSize(GetMiopenHandle(context), s_, kAllAlgos, num_algos)
+-                                                                 : AlgoSearchWorkspaceSize;
+-    IAllocatorUniquePtr<void> algo_search_workspace = GetTransientScratchBuffer<void>(max_ws_size);
+-    MIOPEN_RETURN_IF_ERROR(miopenFindConvolutionForwardAlgorithm(
+-        GetMiopenHandle(context),
+-        s_.x_tensor,
+-        s_.x_data,
+-        s_.w_desc,
+-        s_.w_data,
+-        s_.conv_desc,
+-        s_.y_tensor,
+-        s_.y_data,
+-        1,            // requestedAlgoCount
+-        &algo_count,  // returnedAlgoCount
+-        &perf,
+-        algo_search_workspace.get(),
+-        max_ws_size,
+-        false));  // Do not do exhaustive algo search.
+-    s_.cached_benchmark_fwd_results.insert(x_dims_miopen, {perf.fwd_algo, perf.memory});
+-  }
+-  const auto& perf = s_.cached_benchmark_fwd_results.at(x_dims_miopen);
+-  s_.fwd_algo = perf.fwd_algo;
+-  s_.workspace_bytes = perf.memory;
+   } else {
+     // set Y
+     s_.Y = context->Output(0, TensorShape(s_.y_dims));
+@@ -319,6 +288,34 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)
+       s_.y_data = reinterpret_cast<HipT*>(s_.Y->MutableData());
+     }
+   }
++  {
++    /* FindConvolution must always be called by the runtime */
++    TensorShapeVector x_dims_miopen{x_dims.begin(), x_dims.end()};
++    miopenConvAlgoPerf_t perf;
++    int algo_count = 1;
++    const ROCMExecutionProvider* rocm_ep = static_cast<const ROCMExecutionProvider*>(this->Info().GetExecutionProvider());
++    static constexpr int num_algos = MIOPEN_CONVOLUTION_FWD_ALGO_COUNT;
++    size_t max_ws_size = rocm_ep->GetMiopenConvUseMaxWorkspace() ? GetMaxWorkspaceSize(GetMiopenHandle(context), s_, kAllAlgos, num_algos)
++                                                                 : AlgoSearchWorkspaceSize;
++    IAllocatorUniquePtr<void> algo_search_workspace = GetTransientScratchBuffer<void>(max_ws_size);
++    MIOPEN_RETURN_IF_ERROR(miopenFindConvolutionForwardAlgorithm(
++        GetMiopenHandle(context),
++        s_.x_tensor,
++        s_.x_data,
++        s_.w_desc,
++        s_.w_data,
++        s_.conv_desc,
++        s_.y_tensor,
++        s_.y_data,
++        1,            // requestedAlgoCount
++        &algo_count,  // returnedAlgoCount
++        &perf,
++        algo_search_workspace.get(),
++        max_ws_size,
++        false));  // Do not do exhaustive algo search.
++    s_.fwd_algo = perf.fwd_algo;
++    s_.workspace_bytes = perf.memory;
++  }
+   return Status::OK();
+ }
+ 
+diff --git a/onnxruntime/core/providers/rocm/nn/conv.h b/onnxruntime/core/providers/rocm/nn/conv.h
+index bc9846203e57..d54218f25854 100644
+--- a/onnxruntime/core/providers/rocm/nn/conv.h
++++ b/onnxruntime/core/providers/rocm/nn/conv.h
+@@ -108,9 +108,6 @@ class lru_unordered_map {
+   list_type lru_list_;
+ };
+ 
+-// cached miopen descriptors
+-constexpr size_t MAX_CACHED_ALGO_PERF_RESULTS = 10000;
+-
+ template <typename AlgoPerfType>
+ struct MiopenConvState {
+   // if x/w dims changed, update algo and miopenTensors
+@@ -148,9 +145,6 @@ struct MiopenConvState {
+     decltype(AlgoPerfType().memory) memory;
+   };
+ 
+-  lru_unordered_map<TensorShapeVector, PerfFwdResultParams, vector_hash> cached_benchmark_fwd_results{MAX_CACHED_ALGO_PERF_RESULTS};
+-  lru_unordered_map<TensorShapeVector, PerfBwdResultParams, vector_hash> cached_benchmark_bwd_results{MAX_CACHED_ALGO_PERF_RESULTS};
+-
+   // Some properties needed to support asymmetric padded Conv nodes
+   bool post_slicing_required;
+   TensorShapeVector slice_starts;
+diff --git a/onnxruntime/core/providers/rocm/nn/conv_transpose.cc b/onnxruntime/core/providers/rocm/nn/conv_transpose.cc
+index 7447113fdf84..45ed4c8ac37a 100644
+--- a/onnxruntime/core/providers/rocm/nn/conv_transpose.cc
++++ b/onnxruntime/core/providers/rocm/nn/conv_transpose.cc
+@@ -76,7 +76,6 @@ Status ConvTranspose<T, NHWC>::DoConvTranspose(OpKernelContext* context, bool dy
+ 
+   if (w_dims_changed) {
+     s_.last_w_dims = gsl::make_span(w_dims);
+-    s_.cached_benchmark_bwd_results.clear();
+   }
+ 
+   ConvTransposeAttributes::Prepare p;
+@@ -127,12 +126,13 @@ Status ConvTranspose<T, NHWC>::DoConvTranspose(OpKernelContext* context, bool dy
+ 
+     y_data = reinterpret_cast<HipT*>(p.Y->MutableData());
+ 
+-    if (!s_.cached_benchmark_bwd_results.contains(x_dims)) {
+-      IAllocatorUniquePtr<void> algo_search_workspace = GetScratchBuffer<void>(AlgoSearchWorkspaceSize, context->GetComputeStream());
+-
+-      miopenConvAlgoPerf_t perf;
+-      int algo_count = 1;
+-      MIOPEN_RETURN_IF_ERROR(miopenFindConvolutionBackwardDataAlgorithm(
++  }
++  // The following is required before calling convolution, we cannot cache the results
++  {
++    IAllocatorUniquePtr<void> algo_search_workspace = GetScratchBuffer<void>(AlgoSearchWorkspaceSize, context->GetComputeStream());
++    miopenConvAlgoPerf_t perf;
++    int algo_count = 1;
++    MIOPEN_RETURN_IF_ERROR(miopenFindConvolutionBackwardDataAlgorithm(
+       GetMiopenHandle(context),
+       s_.x_tensor,
+       x_data,
+@@ -147,10 +147,7 @@ Status ConvTranspose<T, NHWC>::DoConvTranspose(OpKernelContext* context, bool dy
+       algo_search_workspace.get(),
+       AlgoSearchWorkspaceSize,
+       false));
+-      s_.cached_benchmark_bwd_results.insert(x_dims, {perf.bwd_data_algo, perf.memory});
+-    }
+ 
+-    const auto& perf = s_.cached_benchmark_bwd_results.at(x_dims);
+   s_.bwd_data_algo = perf.bwd_data_algo;
+   s_.workspace_bytes = perf.memory;
+ }