From fe2ddc364435e8cbd463e55218393e6eff217787 Mon Sep 17 00:00:00 2001
From: mertalev <101130780+mertalev@users.noreply.github.com>
Date: Mon, 30 Dec 2024 17:02:35 -0500
Subject: [PATCH] use composite cache key for ROCm conv algo caching

---
 .../0001-fix-rocm-conv-thread-safety.patch    | 150 ++++++++++++++++++
 .../0001-guard-algo-benchmark-results.patch   |  58 -------
 machine-learning/Dockerfile                   |   8 +-
 3 files changed, 155 insertions(+), 61 deletions(-)
 create mode 100644 machine-learning/0001-fix-rocm-conv-thread-safety.patch
 delete mode 100644 machine-learning/0001-guard-algo-benchmark-results.patch

diff --git a/machine-learning/0001-fix-rocm-conv-thread-safety.patch b/machine-learning/0001-fix-rocm-conv-thread-safety.patch
new file mode 100644
index 0000000000..bd0e759f69
--- /dev/null
+++ b/machine-learning/0001-fix-rocm-conv-thread-safety.patch
@@ -0,0 +1,150 @@
+From 350e3237eadb738a0d96295a62f2eed96653c315 Mon Sep 17 00:00:00 2001
+From: mertalev <101130780+mertalev@users.noreply.github.com>
+Date: Fri, 20 Dec 2024 00:59:21 -0500
+Subject: [PATCH 1/1] fix: avoid race condition for rocm conv algo caching
+
+---
+ onnxruntime/core/providers/rocm/nn/conv.cc       |  8 ++++----
+ onnxruntime/core/providers/rocm/nn/conv.h        | 14 ++++++++++++--
+ .../core/providers/rocm/nn/conv_transpose.cc     |  8 ++++----
+ 3 files changed, 20 insertions(+), 10 deletions(-)
+
+diff --git a/onnxruntime/core/providers/rocm/nn/conv.cc b/onnxruntime/core/providers/rocm/nn/conv.cc
+index d7f47d07a8..98b6b69212 100644
+--- a/onnxruntime/core/providers/rocm/nn/conv.cc
++++ b/onnxruntime/core/providers/rocm/nn/conv.cc
+@@ -127,7 +127,6 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)
+ 
+   if (w_dims_changed) {
+     s_.last_w_dims = gsl::make_span(w_dims);
+-    s_.cached_benchmark_fwd_results.clear();
+   }
+ 
+   ORT_RETURN_IF_ERROR(conv_attrs_.ValidateInputShape(X->Shape(), W->Shape(), channels_last, channels_last));
+@@ -278,7 +277,8 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)
+       HIP_CALL_THROW(hipMemsetAsync(s_.b_zero, 0, malloc_size, Stream(context)));
+     }
+ 
+-    if (!s_.cached_benchmark_fwd_results.contains(x_dims_miopen)) {
++    const std::size_t algo_key = HashConvAlgoKey(x_dims_miopen, w_dims);
++    if (!s_.cached_benchmark_fwd_results.contains(algo_key)) {
+       miopenConvAlgoPerf_t perf;
+       int algo_count = 1;
+       const ROCMExecutionProvider* rocm_ep = static_cast<const ROCMExecutionProvider*>(this->Info().GetExecutionProvider());
+@@ -301,9 +301,9 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)
+                                                       algo_search_workspace.get(),
+                                                       max_ws_size,
+                                                       false));  // Do not do exhaustive algo search.
+-      s_.cached_benchmark_fwd_results.insert(x_dims_miopen, {perf.fwd_algo, perf.memory});
++      s_.cached_benchmark_fwd_results.insert(algo_key, {perf.fwd_algo, perf.memory});
+     }
+-    const auto& perf = s_.cached_benchmark_fwd_results.at(x_dims_miopen);
++    const auto& perf = s_.cached_benchmark_fwd_results.at(algo_key);
+     s_.fwd_algo = perf.fwd_algo;
+     s_.workspace_bytes = perf.memory;
+   } else {
+diff --git a/onnxruntime/core/providers/rocm/nn/conv.h b/onnxruntime/core/providers/rocm/nn/conv.h
+index bc9846203e..b1ca5f8e4b 100644
+--- a/onnxruntime/core/providers/rocm/nn/conv.h
++++ b/onnxruntime/core/providers/rocm/nn/conv.h
+@@ -43,6 +43,11 @@ struct vector_hash {
+   }
+ };
+ 
++inline std::size_t HashConvAlgoKey(const TensorShapeVector& x_dims, const TensorShapeVector& w_dims) {
++  vector_hash vh;
++  return vh(x_dims) ^ vh(w_dims);
++}
++
+ template <typename Key, typename T,
+           typename Hash = std::hash<Key>,
+           typename KeyEqual = std::equal_to<Key>,
+@@ -52,6 +57,7 @@ class lru_unordered_map {
+   lru_unordered_map(size_t max_size) : max_size_(max_size) {}
+ 
+   void insert(const Key& key, const T& value) {
++    std::lock_guard guard(mutex_);
+     auto it = items_.find(key);
+     if (it != items_.end()) {
+       it->second.value = value;
+@@ -69,6 +75,7 @@ class lru_unordered_map {
+   }
+ 
+   T& at(const Key& key) {
++    std::lock_guard guard(mutex_);
+     auto it = items_.find(key);
+     if (it == items_.end()) {
+       throw std::out_of_range("There is no such key in cache");
+@@ -78,6 +85,7 @@ class lru_unordered_map {
+   }
+ 
+   bool contains(const Key& key) const {
++    std::lock_guard guard(mutex_);
+     return items_.find(key) != items_.end();
+   }
+ 
+@@ -86,6 +94,7 @@ class lru_unordered_map {
+   }
+ 
+   void clear() {
++    std::lock_guard guard(mutex_);
+     items_.clear();
+     lru_list_.clear();
+   }
+@@ -106,6 +115,7 @@ class lru_unordered_map {
+   size_t max_size_;
+   std::unordered_map items_;
+   list_type lru_list_;
++  mutable std::mutex mutex_;
+ };
+ 
+ // cached miopen descriptors
+@@ -148,8 +158,8 @@ struct MiopenConvState {
+     decltype(AlgoPerfType().memory) memory;
+   };
+ 
+-  lru_unordered_map<TensorShapeVector, PerfFwdResultParams, vector_hash> cached_benchmark_fwd_results{MAX_CACHED_ALGO_PERF_RESULTS};
+-  lru_unordered_map<TensorShapeVector, PerfBwdResultParams, vector_hash> cached_benchmark_bwd_results{MAX_CACHED_ALGO_PERF_RESULTS};
++  lru_unordered_map<std::size_t, PerfFwdResultParams> cached_benchmark_fwd_results{MAX_CACHED_ALGO_PERF_RESULTS};
++  lru_unordered_map<std::size_t, PerfBwdResultParams> cached_benchmark_bwd_results{MAX_CACHED_ALGO_PERF_RESULTS};
+ 
+   // Some properties needed to support asymmetric padded Conv nodes
+   bool post_slicing_required;
+diff --git a/onnxruntime/core/providers/rocm/nn/conv_transpose.cc b/onnxruntime/core/providers/rocm/nn/conv_transpose.cc
+index 7447113fdf..dea9bf2a05 100644
+--- a/onnxruntime/core/providers/rocm/nn/conv_transpose.cc
++++ b/onnxruntime/core/providers/rocm/nn/conv_transpose.cc
+@@ -76,7 +76,6 @@ Status ConvTranspose<T, NHWC>::DoConvTranspose(OpKernelContext* context, bool dy
+ 
+   if (w_dims_changed) {
+     s_.last_w_dims = gsl::make_span(w_dims);
+-    s_.cached_benchmark_bwd_results.clear();
+   }
+ 
+   ConvTransposeAttributes::Prepare p;
+@@ -127,7 +126,8 @@ Status ConvTranspose<T, NHWC>::DoConvTranspose(OpKernelContext* context, bool dy
+ 
+   y_data = reinterpret_cast<HipT*>(p.Y->MutableData<T>());
+ 
+-  if (!s_.cached_benchmark_bwd_results.contains(x_dims)) {
++  const std::size_t algo_key = HashConvAlgoKey(x_dims, w_dims);
++  if (!s_.cached_benchmark_bwd_results.contains(algo_key)) {
+     IAllocatorUniquePtr<void> algo_search_workspace = GetScratchBuffer<void>(AlgoSearchWorkspaceSize, context->GetComputeStream());
+ 
+     miopenConvAlgoPerf_t perf;
+@@ -147,10 +147,10 @@ Status ConvTranspose<T, NHWC>::DoConvTranspose(OpKernelContext* context, bool dy
+                                                                      algo_search_workspace.get(),
+                                                                      AlgoSearchWorkspaceSize,
+                                                                      false));
+-    s_.cached_benchmark_bwd_results.insert(x_dims, {perf.bwd_data_algo, perf.memory});
++    s_.cached_benchmark_bwd_results.insert(algo_key, {perf.bwd_data_algo, perf.memory});
+   }
+ 
+-  const auto& perf = s_.cached_benchmark_bwd_results.at(x_dims);
++  const auto& perf = s_.cached_benchmark_bwd_results.at(algo_key);
+   s_.bwd_data_algo = perf.bwd_data_algo;
+   s_.workspace_bytes = perf.memory;
+ }
+-- 
+2.43.0
+
diff --git a/machine-learning/0001-guard-algo-benchmark-results.patch b/machine-learning/0001-guard-algo-benchmark-results.patch
deleted file mode 100644
index 70c7fe18e0..0000000000
--- a/machine-learning/0001-guard-algo-benchmark-results.patch
+++ /dev/null
@@ -1,58 +0,0 @@
-From 1f5d6323fa69ee16feab25f8e1398c1aed03ee08 Mon Sep 17 00:00:00 2001
-From: mertalev <101130780+mertalev@users.noreply.github.com>
-Date: Sun, 29 Dec 2024 14:07:51 -0500
-Subject: [PATCH] guard algo benchmark results
-
----
- onnxruntime/core/providers/rocm/nn/conv.h | 6 ++++++
- 1 file changed, 6 insertions(+)
-
-diff --git a/onnxruntime/core/providers/rocm/nn/conv.h b/onnxruntime/core/providers/rocm/nn/conv.h
-index bc9846203e..0086e064f1 100644
---- a/onnxruntime/core/providers/rocm/nn/conv.h
-+++ b/onnxruntime/core/providers/rocm/nn/conv.h
-@@ -52,6 +52,7 @@ class lru_unordered_map {
-   lru_unordered_map(size_t max_size) : max_size_(max_size) {}
- 
-   void insert(const Key& key, const T& value) {
-+    std::lock_guard guard(mutex_);
-     auto it = items_.find(key);
-     if (it != items_.end()) {
-       it->second.value = value;
-@@ -69,6 +70,7 @@ class lru_unordered_map {
-   }
- 
-   T& at(const Key& key) {
-+    std::lock_guard guard(mutex_);
-     auto it = items_.find(key);
-     if (it == items_.end()) {
-       throw std::out_of_range("There is no such key in cache");
-@@ -78,14 +80,17 @@ class lru_unordered_map {
-   }
- 
-   bool contains(const Key& key) const {
-+    std::lock_guard guard(mutex_);
-     return items_.find(key) != items_.end();
-   }
- 
-   size_t size() const {
-+    std::lock_guard guard(mutex_);
-     return items_.size();
-   }
- 
-   void clear() {
-+    std::lock_guard guard(mutex_);
-     items_.clear();
-     lru_list_.clear();
-   }
-@@ -106,6 +111,7 @@ class lru_unordered_map {
-   size_t max_size_;
-   std::unordered_map items_;
-   list_type lru_list_;
-+  mutable std::mutex mutex_;
- };
- 
- // cached miopen descriptors
--- 
-2.43.0
-
diff --git a/machine-learning/Dockerfile b/machine-learning/Dockerfile
index 381ae7055b..628801d3db 100644
--- a/machine-learning/Dockerfile
+++ b/machine-learning/Dockerfile
@@ -17,6 +17,7 @@ RUN mkdir /opt/armnn && \
 
 # Warning: 26.3Gb of disk space required to pull this image
 # https://github.com/microsoft/onnxruntime/blob/main/dockerfiles/Dockerfile.rocm
+# 6.2 or later fails to build as of writing
 FROM rocm/dev-ubuntu-22.04:6.1.2-complete AS builder-rocm
 
 WORKDIR /code
@@ -32,11 +33,12 @@ RUN wget -nv https://github.com/Kitware/CMake/releases/download/v3.27.3/cmake-3.
 ENV PATH /code/cmake-3.27.3-linux-x86_64/bin:${PATH}
 
 # Prepare onnxruntime repository & build onnxruntime
+# 1.20.1 fails to build as of writing
 RUN git clone --single-branch --branch v1.19.2 --recursive "https://github.com/Microsoft/onnxruntime" onnxruntime
 WORKDIR /code/onnxruntime
 
 # Fix for multi-threading based on comments in https://github.com/microsoft/onnxruntime/pull/19567
-COPY ./0001-guard-algo-benchmark-results.patch /tmp/
-RUN git apply /tmp/0001-guard-algo-benchmark-results.patch
+COPY ./0001-fix-rocm-conv-thread-safety.patch /tmp/
+RUN git apply /tmp/0001-fix-rocm-conv-thread-safety.patch
 RUN /bin/sh ./dockerfiles/scripts/install_common_deps.sh
 # Note: the `parallel` setting uses a substantial amount of RAM
@@ -112,7 +114,7 @@ COPY --from=builder-armnn \
   /opt/ann/build.sh \
   /opt/armnn/
 
-FROM rocm/dev-ubuntu-24.04:6.2.4-complete AS prod-rocm
+FROM rocm/dev-ubuntu-22.04:6.1.2-complete AS prod-rocm
 
 FROM prod-${DEVICE} AS prod
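
Note (illustration only, not part of the patch content above): the change replaces the per-shape TensorShapeVector cache key with a single composite hash of the input and weight shapes, and guards the LRU cache with a mutex so concurrent Compute() calls cannot race while MIOpen algo benchmark results are inserted. The minimal, self-contained C++ sketch below shows the same idea under stated assumptions; the names Dims, HashDims, AlgoKey, and GuardedLruCache are hypothetical stand-ins, not onnxruntime's actual types.

#include <cstddef>
#include <cstdint>
#include <functional>
#include <list>
#include <mutex>
#include <unordered_map>
#include <vector>

using Dims = std::vector<int64_t>;

// Boost-style hash combine over the dimensions (similar in spirit to vector_hash in conv.h).
inline std::size_t HashDims(const Dims& dims) {
  std::size_t seed = dims.size();
  for (int64_t v : dims) {
    seed ^= std::hash<int64_t>()(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
  }
  return seed;
}

// Composite key combining input and weight shapes, mirroring what HashConvAlgoKey does.
inline std::size_t AlgoKey(const Dims& x_dims, const Dims& w_dims) {
  return HashDims(x_dims) ^ HashDims(w_dims);
}

// Small cache keyed by the composite hash; every operation takes the mutex. Eviction here
// is by insertion order only (the real lru_unordered_map also tracks recency of use).
template <typename T>
class GuardedLruCache {
 public:
  explicit GuardedLruCache(std::size_t max_size) : max_size_(max_size) {}

  void insert(std::size_t key, const T& value) {
    std::lock_guard<std::mutex> guard(mutex_);
    auto it = items_.find(key);
    if (it != items_.end()) {
      it->second = value;  // already cached: refresh the stored value
      return;
    }
    order_.push_front(key);
    items_.emplace(key, value);
    if (items_.size() > max_size_) {
      items_.erase(order_.back());  // evict the oldest entry
      order_.pop_back();
    }
  }

  bool contains(std::size_t key) const {
    std::lock_guard<std::mutex> guard(mutex_);
    return items_.count(key) != 0;
  }

 private:
  std::size_t max_size_;
  std::unordered_map<std::size_t, T> items_;
  std::list<std::size_t> order_;
  mutable std::mutex mutex_;
};

With this shape, two threads calling contains() and insert() at the same time serialize on mutex_, which is the property the patch relies on during MIOpen algo search.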