From f19cf206baae585aa0b65f370caf1d19a72efdf9 Mon Sep 17 00:00:00 2001
From: mertalev <101130780+mertalev@users.noreply.github.com>
Date: Wed, 5 Mar 2025 09:36:27 -0500
Subject: [PATCH] disable algo caching

---
 .../0001-fix-rocm-conv-thread-safety.patch    | 150 ---------------
 machine-learning/Dockerfile                   |  13 +-
 machine-learning/rocm-PR19567.patch           | 176 ++++++++++++++++++
 3 files changed, 181 insertions(+), 158 deletions(-)
 delete mode 100644 machine-learning/0001-fix-rocm-conv-thread-safety.patch
 create mode 100644 machine-learning/rocm-PR19567.patch

diff --git a/machine-learning/0001-fix-rocm-conv-thread-safety.patch b/machine-learning/0001-fix-rocm-conv-thread-safety.patch
deleted file mode 100644
index bd0e759f69..0000000000
--- a/machine-learning/0001-fix-rocm-conv-thread-safety.patch
+++ /dev/null
@@ -1,150 +0,0 @@
-From 350e3237eadb738a0d96295a62f2eed96653c315 Mon Sep 17 00:00:00 2001
-From: mertalev <101130780+mertalev@users.noreply.github.com>
-Date: Fri, 20 Dec 2024 00:59:21 -0500
-Subject: [PATCH 1/1] fix: avoid race condition for rocm conv algo caching
-
----
- onnxruntime/core/providers/rocm/nn/conv.cc    |  8 ++++----
- onnxruntime/core/providers/rocm/nn/conv.h     | 14 ++++++++++++--
- .../core/providers/rocm/nn/conv_transpose.cc  |  8 ++++----
- 3 files changed, 20 insertions(+), 10 deletions(-)
-
-diff --git a/onnxruntime/core/providers/rocm/nn/conv.cc b/onnxruntime/core/providers/rocm/nn/conv.cc
-index d7f47d07a8..98b6b69212 100644
---- a/onnxruntime/core/providers/rocm/nn/conv.cc
-+++ b/onnxruntime/core/providers/rocm/nn/conv.cc
-@@ -127,7 +127,6 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)
- 
-   if (w_dims_changed) {
-     s_.last_w_dims = gsl::make_span(w_dims);
--    s_.cached_benchmark_fwd_results.clear();
-   }
- 
-   ORT_RETURN_IF_ERROR(conv_attrs_.ValidateInputShape(X->Shape(), W->Shape(), channels_last, channels_last));
- 
-@@ -278,7 +277,8 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)
-     HIP_CALL_THROW(hipMemsetAsync(s_.b_zero, 0, malloc_size, Stream(context)));
-   }
- 
--  if (!s_.cached_benchmark_fwd_results.contains(x_dims_miopen)) {
-+  const std::size_t algo_key = HashConvAlgoKey(x_dims_miopen, w_dims);
-+  if (!s_.cached_benchmark_fwd_results.contains(algo_key)) {
-     miopenConvAlgoPerf_t perf;
-     int algo_count = 1;
-     const ROCMExecutionProvider* rocm_ep = static_cast<const ROCMExecutionProvider*>(this->Info().GetExecutionProvider());
-@@ -301,9 +301,9 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)
-         algo_search_workspace.get(),
-         max_ws_size,
-         false));  // Do not do exhaustive algo search.
--    s_.cached_benchmark_fwd_results.insert(x_dims_miopen, {perf.fwd_algo, perf.memory});
-+    s_.cached_benchmark_fwd_results.insert(algo_key, {perf.fwd_algo, perf.memory});
-   }
--  const auto& perf = s_.cached_benchmark_fwd_results.at(x_dims_miopen);
-+  const auto& perf = s_.cached_benchmark_fwd_results.at(algo_key);
-   s_.fwd_algo = perf.fwd_algo;
-   s_.workspace_bytes = perf.memory;
-   } else {
-diff --git a/onnxruntime/core/providers/rocm/nn/conv.h b/onnxruntime/core/providers/rocm/nn/conv.h
-index bc9846203e..b1ca5f8e4b 100644
---- a/onnxruntime/core/providers/rocm/nn/conv.h
-+++ b/onnxruntime/core/providers/rocm/nn/conv.h
-@@ -43,6 +43,11 @@ struct vector_hash {
-   }
- };
- 
-+inline std::size_t HashConvAlgoKey(const TensorShapeVector& x_dims, const TensorShapeVector& w_dims) {
-+  vector_hash vh;
-+  return vh(x_dims) ^ vh(w_dims);
-+}
-+
- template <typename Key, typename T, typename Hash = std::hash<Key>,
-           typename KeyEqual = std::equal_to<Key>,
-           typename ListAllocator = std::allocator<Key>>
-@@ -52,6 +57,7 @@ class lru_unordered_map {
-   lru_unordered_map(size_t max_size) : max_size_(max_size) {}
- 
-   void insert(const Key& key, const T& value) {
-+    std::lock_guard<std::mutex> guard(mutex_);
-     auto it = items_.find(key);
-     if (it != items_.end()) {
-       it->second.value = value;
-@@ -69,6 +75,7 @@ class lru_unordered_map {
-   }
- 
-   T& at(const Key& key) {
-+    std::lock_guard<std::mutex> guard(mutex_);
-     auto it = items_.find(key);
-     if (it == items_.end()) {
-       throw std::out_of_range("There is no such key in cache");
-@@ -78,6 +85,7 @@ class lru_unordered_map {
-   }
- 
-   bool contains(const Key& key) const {
-+    std::lock_guard<std::mutex> guard(mutex_);
-     return items_.find(key) != items_.end();
-   }
- 
-@@ -86,6 +94,7 @@ class lru_unordered_map {
-   }
- 
-   void clear() {
-+    std::lock_guard<std::mutex> guard(mutex_);
-     items_.clear();
-     lru_list_.clear();
-   }
-@@ -106,6 +115,7 @@ class lru_unordered_map {
-   size_t max_size_;
-   std::unordered_map<Key, item, Hash, KeyEqual> items_;
-   list_type lru_list_;
-+  mutable std::mutex mutex_;
- };
- 
- // cached miopen descriptors
-@@ -148,8 +158,8 @@ struct MiopenConvState {
-     decltype(AlgoPerfType().memory) memory;
-   };
- 
--  lru_unordered_map<TensorShapeVector, PerfFwdResultParams, vector_hash> cached_benchmark_fwd_results{MAX_CACHED_ALGO_PERF_RESULTS};
--  lru_unordered_map<TensorShapeVector, PerfBwdResultParams, vector_hash> cached_benchmark_bwd_results{MAX_CACHED_ALGO_PERF_RESULTS};
-+  lru_unordered_map<std::size_t, PerfFwdResultParams> cached_benchmark_fwd_results{MAX_CACHED_ALGO_PERF_RESULTS};
-+  lru_unordered_map<std::size_t, PerfBwdResultParams> cached_benchmark_bwd_results{MAX_CACHED_ALGO_PERF_RESULTS};
- 
-   // Some properties needed to support asymmetric padded Conv nodes
-   bool post_slicing_required;
-diff --git a/onnxruntime/core/providers/rocm/nn/conv_transpose.cc b/onnxruntime/core/providers/rocm/nn/conv_transpose.cc
-index 7447113fdf..dea9bf2a05 100644
---- a/onnxruntime/core/providers/rocm/nn/conv_transpose.cc
-+++ b/onnxruntime/core/providers/rocm/nn/conv_transpose.cc
-@@ -76,7 +76,6 @@ Status ConvTranspose<T, NHWC>::DoConvTranspose(OpKernelContext* context, bool dy
- 
-   if (w_dims_changed) {
-     s_.last_w_dims = gsl::make_span(w_dims);
--    s_.cached_benchmark_bwd_results.clear();
-   }
- 
-   ConvTransposeAttributes::Prepare p;
-@@ -127,7 +126,8 @@ Status ConvTranspose<T, NHWC>::DoConvTranspose(OpKernelContext* context, bool dy
- 
-     y_data = reinterpret_cast<HipT*>(p.Y->MutableData());
- 
--    if (!s_.cached_benchmark_bwd_results.contains(x_dims)) {
-+    const std::size_t algo_key = HashConvAlgoKey(x_dims, w_dims);
-+    if (!s_.cached_benchmark_bwd_results.contains(algo_key)) {
-       IAllocatorUniquePtr<void> algo_search_workspace = GetScratchBuffer<void>(AlgoSearchWorkspaceSize, context->GetComputeStream());
- 
-       miopenConvAlgoPerf_t perf;
-       int algo_count = 1;
-       MIOPEN_RETURN_IF_ERROR(miopenFindConvolutionBackwardDataAlgorithm(
-@@ -147,10 +147,10 @@ Status ConvTranspose<T, NHWC>::DoConvTranspose(OpKernelContext* context, bool dy
-          algo_search_workspace.get(),
-          AlgoSearchWorkspaceSize,
-          false));
--      s_.cached_benchmark_bwd_results.insert(x_dims, {perf.bwd_data_algo, perf.memory});
-+      s_.cached_benchmark_bwd_results.insert(algo_key, {perf.bwd_data_algo, perf.memory});
-     }
- 
--    const auto& perf = s_.cached_benchmark_bwd_results.at(x_dims);
-+    const auto& perf = s_.cached_benchmark_bwd_results.at(algo_key);
-     s_.bwd_data_algo = perf.bwd_data_algo;
-     s_.workspace_bytes = perf.memory;
-   }
--- 
-2.43.0
-
diff --git a/machine-learning/Dockerfile b/machine-learning/Dockerfile
index 88bedd8e49..2a82715e3a 100644
--- a/machine-learning/Dockerfile
+++ b/machine-learning/Dockerfile
@@ -15,15 +15,13 @@ RUN mkdir /opt/armnn && \
     cd /opt/ann && \
     sh build.sh
 
-# Warning: 26.3Gb of disk space required to pull this image
-# https://github.com/microsoft/onnxruntime/blob/main/dockerfiles/Dockerfile.rocm
-# 6.2 or later fails to build as of writing
+# Warning: 25GiB+ disk space required to pull this image
+# TODO: find a way to reduce the image size
 FROM rocm/dev-ubuntu-22.04:6.3.1-complete AS builder-rocm
 
 WORKDIR /code
 
 RUN apt-get update && apt-get install -y --no-install-recommends wget git python3.10-venv
-# Install same version as the Dockerfile provided by onnxruntime
 RUN wget -nv https://github.com/Kitware/CMake/releases/download/v3.30.1/cmake-3.30.1-linux-x86_64.sh && \
     chmod +x cmake-3.30.1-linux-x86_64.sh && \
     mkdir -p /code/cmake-3.30.1-linux-x86_64 && \
@@ -32,13 +30,12 @@ RUN wget -nv https://github.com/Kitware/CMake/releases/download/v3.30.1/cmake-3.
 
 ENV PATH=/code/cmake-3.30.1-linux-x86_64/bin:${PATH}
 
-# Prepare onnxruntime repository & build onnxruntime
-# 1.20.1 fails to build as of writing
 RUN git clone --single-branch --branch v1.20.1 --recursive "https://github.com/Microsoft/onnxruntime" onnxruntime
 WORKDIR /code/onnxruntime
 # Fix for multi-threading based on comments in https://github.com/microsoft/onnxruntime/pull/19567
-COPY ./0001-fix-rocm-conv-thread-safety.patch /tmp/
-RUN git apply /tmp/0001-fix-rocm-conv-thread-safety.patch
+# TODO: find a way to fix this without disabling algo caching
+COPY ./rocm-PR19567.patch /tmp/
+RUN git apply /tmp/rocm-PR19567.patch
 RUN /bin/sh ./dockerfiles/scripts/install_common_deps.sh
 # Note: the `parallel` setting uses a substantial amount of RAM
diff --git a/machine-learning/rocm-PR19567.patch b/machine-learning/rocm-PR19567.patch
new file mode 100644
index 0000000000..04ad8412eb
--- /dev/null
+++ b/machine-learning/rocm-PR19567.patch
@@ -0,0 +1,176 @@
+From a598a88db258f82a6e4bca75810921bd6bcee7e0 Mon Sep 17 00:00:00 2001
+From: David Nieto
+Date: Sat, 17 Feb 2024 11:23:12 -0800
+Subject: [PATCH] Disable algo caching in ROCM EP
+
+Similar to the work done by Liangxijun-1001 in
+https://github.com/apache/tvm/pull/16178, the ROCM spec mandates calling
+miopenFindConvolution*Algorithm() before using any Convolution API.
+
+This is the link to the porting guide describing this requirement:
+https://rocmdocs.amd.com/projects/MIOpen/en/latest/MIOpen_Porting_Guide.html
+
+Thus, this change disables the algo cache and enforces the official
+API semantics.
+
+Signed-off-by: David Nieto
+---
+ onnxruntime/core/providers/rocm/nn/conv.cc    | 61 +++++++++----------
+ onnxruntime/core/providers/rocm/nn/conv.h     |  6 --
+ .../core/providers/rocm/nn/conv_transpose.cc  | 17 +++---
+ 3 files changed, 36 insertions(+), 48 deletions(-)
+
+diff --git a/onnxruntime/core/providers/rocm/nn/conv.cc b/onnxruntime/core/providers/rocm/nn/conv.cc
+index 6214ec7bc0ea..b08aceca48b1 100644
+--- a/onnxruntime/core/providers/rocm/nn/conv.cc
++++ b/onnxruntime/core/providers/rocm/nn/conv.cc
+@@ -125,10 +125,8 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)
+   if (input_dims_changed)
+     s_.last_x_dims = gsl::make_span(x_dims);
+ 
+-  if (w_dims_changed) {
++  if (w_dims_changed)
+     s_.last_w_dims = gsl::make_span(w_dims);
+-    s_.cached_benchmark_fwd_results.clear();
+-  }
+ 
+   ORT_RETURN_IF_ERROR(conv_attrs_.ValidateInputShape(X->Shape(), W->Shape(), channels_last, channels_last));
+ 
+@@ -277,35 +275,6 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)
+     HIP_CALL_THROW(hipMalloc(&s_.b_zero, malloc_size));
+     HIP_CALL_THROW(hipMemsetAsync(s_.b_zero, 0, malloc_size, Stream(context)));
+   }
+-
+-  if (!s_.cached_benchmark_fwd_results.contains(x_dims_miopen)) {
+-    miopenConvAlgoPerf_t perf;
+-    int algo_count = 1;
+-    const ROCMExecutionProvider* rocm_ep = static_cast<const ROCMExecutionProvider*>(this->Info().GetExecutionProvider());
+-    static constexpr int num_algos = MIOPEN_CONVOLUTION_FWD_ALGO_COUNT;
+-    size_t max_ws_size = rocm_ep->GetMiopenConvUseMaxWorkspace() ? GetMaxWorkspaceSize(GetMiopenHandle(context), s_, kAllAlgos, num_algos)
+-                                                                 : AlgoSearchWorkspaceSize;
+-    IAllocatorUniquePtr<void> algo_search_workspace = GetTransientScratchBuffer<void>(max_ws_size);
+-    MIOPEN_RETURN_IF_ERROR(miopenFindConvolutionForwardAlgorithm(
+-        GetMiopenHandle(context),
+-        s_.x_tensor,
+-        s_.x_data,
+-        s_.w_desc,
+-        s_.w_data,
+-        s_.conv_desc,
+-        s_.y_tensor,
+-        s_.y_data,
+-        1,            // requestedAlgoCount
+-        &algo_count,  // returnedAlgoCount
+-        &perf,
+-        algo_search_workspace.get(),
+-        max_ws_size,
+-        false));  // Do not do exhaustive algo search.
+-    s_.cached_benchmark_fwd_results.insert(x_dims_miopen, {perf.fwd_algo, perf.memory});
+-  }
+-  const auto& perf = s_.cached_benchmark_fwd_results.at(x_dims_miopen);
+-  s_.fwd_algo = perf.fwd_algo;
+-  s_.workspace_bytes = perf.memory;
+   } else {
+     // set Y
+     s_.Y = context->Output(0, TensorShape(s_.y_dims));
+@@ -319,6 +288,34 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)
+       s_.y_data = reinterpret_cast<HipT*>(s_.Y->MutableData());
+     }
+   }
++  {
++    /* FindConvolution must always be called by the runtime */
++    TensorShapeVector x_dims_miopen{x_dims.begin(), x_dims.end()};
++    miopenConvAlgoPerf_t perf;
++    int algo_count = 1;
++    const ROCMExecutionProvider* rocm_ep = static_cast<const ROCMExecutionProvider*>(this->Info().GetExecutionProvider());
++    static constexpr int num_algos = MIOPEN_CONVOLUTION_FWD_ALGO_COUNT;
++    size_t max_ws_size = rocm_ep->GetMiopenConvUseMaxWorkspace() ? GetMaxWorkspaceSize(GetMiopenHandle(context), s_, kAllAlgos, num_algos)
++                                                                 : AlgoSearchWorkspaceSize;
++    IAllocatorUniquePtr<void> algo_search_workspace = GetTransientScratchBuffer<void>(max_ws_size);
++    MIOPEN_RETURN_IF_ERROR(miopenFindConvolutionForwardAlgorithm(
++        GetMiopenHandle(context),
++        s_.x_tensor,
++        s_.x_data,
++        s_.w_desc,
++        s_.w_data,
++        s_.conv_desc,
++        s_.y_tensor,
++        s_.y_data,
++        1,            // requestedAlgoCount
++        &algo_count,  // returnedAlgoCount
++        &perf,
++        algo_search_workspace.get(),
++        max_ws_size,
++        false));  // Do not do exhaustive algo search.
++    s_.fwd_algo = perf.fwd_algo;
++    s_.workspace_bytes = perf.memory;
++  }
+   return Status::OK();
+ }
+ 
+diff --git a/onnxruntime/core/providers/rocm/nn/conv.h b/onnxruntime/core/providers/rocm/nn/conv.h
+index bc9846203e57..d54218f25854 100644
+--- a/onnxruntime/core/providers/rocm/nn/conv.h
++++ b/onnxruntime/core/providers/rocm/nn/conv.h
+@@ -108,9 +108,6 @@ class lru_unordered_map {
+   list_type lru_list_;
+ };
+ 
+-// cached miopen descriptors
+-constexpr size_t MAX_CACHED_ALGO_PERF_RESULTS = 10000;
+-
+ template <typename AlgoPerfType>
+ struct MiopenConvState {
+   // if x/w dims changed, update algo and miopenTensors
+@@ -148,9 +145,6 @@ struct MiopenConvState {
+     decltype(AlgoPerfType().memory) memory;
+   };
+ 
+-  lru_unordered_map<TensorShapeVector, PerfFwdResultParams, vector_hash> cached_benchmark_fwd_results{MAX_CACHED_ALGO_PERF_RESULTS};
+-  lru_unordered_map<TensorShapeVector, PerfBwdResultParams, vector_hash> cached_benchmark_bwd_results{MAX_CACHED_ALGO_PERF_RESULTS};
+-
+   // Some properties needed to support asymmetric padded Conv nodes
+   bool post_slicing_required;
+   TensorShapeVector slice_starts;
+diff --git a/onnxruntime/core/providers/rocm/nn/conv_transpose.cc b/onnxruntime/core/providers/rocm/nn/conv_transpose.cc
+index 7447113fdf84..45ed4c8ac37a 100644
+--- a/onnxruntime/core/providers/rocm/nn/conv_transpose.cc
++++ b/onnxruntime/core/providers/rocm/nn/conv_transpose.cc
+@@ -76,7 +76,6 @@ Status ConvTranspose<T, NHWC>::DoConvTranspose(OpKernelContext* context, bool dy
+ 
+   if (w_dims_changed) {
+     s_.last_w_dims = gsl::make_span(w_dims);
+-    s_.cached_benchmark_bwd_results.clear();
+   }
+ 
+   ConvTransposeAttributes::Prepare p;
+@@ -127,12 +126,13 @@ Status ConvTranspose<T, NHWC>::DoConvTranspose(OpKernelContext* context, bool dy
+ 
+     y_data = reinterpret_cast<HipT*>(p.Y->MutableData());
+ 
+-    if (!s_.cached_benchmark_bwd_results.contains(x_dims)) {
+-      IAllocatorUniquePtr<void> algo_search_workspace = GetScratchBuffer<void>(AlgoSearchWorkspaceSize, context->GetComputeStream());
+-
+-      miopenConvAlgoPerf_t perf;
+-      int algo_count = 1;
+-      MIOPEN_RETURN_IF_ERROR(miopenFindConvolutionBackwardDataAlgorithm(
++  }
++  // The following is required before calling convolution, we cannot cache the results
++  {
++    IAllocatorUniquePtr<void> algo_search_workspace = GetScratchBuffer<void>(AlgoSearchWorkspaceSize, context->GetComputeStream());
++    miopenConvAlgoPerf_t perf;
++    int algo_count = 1;
++    MIOPEN_RETURN_IF_ERROR(miopenFindConvolutionBackwardDataAlgorithm(
+       GetMiopenHandle(context),
+       s_.x_tensor,
+       x_data,
+@@ -147,10 +147,7 @@ Status ConvTranspose<T, NHWC>::DoConvTranspose(OpKernelContext* context, bool dy
+       algo_search_workspace.get(),
+       AlgoSearchWorkspaceSize,
+       false));
+-      s_.cached_benchmark_bwd_results.insert(x_dims, {perf.bwd_data_algo, perf.memory});
+-    }
+ 
+-    const auto& perf = s_.cached_benchmark_bwd_results.at(x_dims);
+   s_.bwd_data_algo = perf.bwd_data_algo;
+   s_.workspace_bytes = perf.memory;
+ }