diff --git a/machine-learning/0001-fix-avoid-race-condition-for-rocm-conv-algo-caching.patch b/machine-learning/0001-fix-avoid-race-condition-for-rocm-conv-algo-caching.patch deleted file mode 100644 index a8fa9df0e2..0000000000 --- a/machine-learning/0001-fix-avoid-race-condition-for-rocm-conv-algo-caching.patch +++ /dev/null @@ -1,25 +0,0 @@ -From e267bc9bab8b3873dba57323ddcd9a9d09a1211e Mon Sep 17 00:00:00 2001 -From: mertalev <101130780+mertalev@users.noreply.github.com> -Date: Fri, 20 Dec 2024 00:59:21 -0500 -Subject: [PATCH] fix: avoid race condition for rocm conv algo caching - ---- - onnxruntime/core/providers/rocm/nn/conv.cc | 2 ++ - 1 file changed, 2 insertions(+) - -diff --git a/onnxruntime/core/providers/rocm/nn/conv.cc b/onnxruntime/core/providers/rocm/nn/conv.cc -index d7f47d07a8..ec438287ac 100644 ---- a/onnxruntime/core/providers/rocm/nn/conv.cc -+++ b/onnxruntime/core/providers/rocm/nn/conv.cc -@@ -278,6 +278,8 @@ Status Conv::UpdateState(OpKernelContext* context, bool bias_expected) - HIP_CALL_THROW(hipMemsetAsync(s_.b_zero, 0, malloc_size, Stream(context))); - } - -+ // lock is needed to avoid race condition during algo search -+ std::lock_guard lock(s_.mutex); - if (!s_.cached_benchmark_fwd_results.contains(x_dims_miopen)) { - miopenConvAlgoPerf_t perf; - int algo_count = 1; --- -2.43.0 - diff --git a/machine-learning/0001-guard-algo-benchmark-results.patch b/machine-learning/0001-guard-algo-benchmark-results.patch new file mode 100644 index 0000000000..70c7fe18e0 --- /dev/null +++ b/machine-learning/0001-guard-algo-benchmark-results.patch @@ -0,0 +1,58 @@ +From 1f5d6323fa69ee16feab25f8e1398c1aed03ee08 Mon Sep 17 00:00:00 2001 +From: mertalev <101130780+mertalev@users.noreply.github.com> +Date: Sun, 29 Dec 2024 14:07:51 -0500 +Subject: [PATCH] guard algo benchmark results + +--- + onnxruntime/core/providers/rocm/nn/conv.h | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/onnxruntime/core/providers/rocm/nn/conv.h b/onnxruntime/core/providers/rocm/nn/conv.h +index bc9846203e..0086e064f1 100644 +--- a/onnxruntime/core/providers/rocm/nn/conv.h ++++ b/onnxruntime/core/providers/rocm/nn/conv.h +@@ -52,6 +52,7 @@ class lru_unordered_map { + lru_unordered_map(size_t max_size) : max_size_(max_size) {} + + void insert(const Key& key, const T& value) { ++ std::lock_guard guard(mutex_); + auto it = items_.find(key); + if (it != items_.end()) { + it->second.value = value; +@@ -69,6 +70,7 @@ class lru_unordered_map { + } + + T& at(const Key& key) { ++ std::lock_guard guard(mutex_); + auto it = items_.find(key); + if (it == items_.end()) { + throw std::out_of_range("There is no such key in cache"); +@@ -78,14 +80,17 @@ class lru_unordered_map { + } + + bool contains(const Key& key) const { ++ std::lock_guard guard(mutex_); + return items_.find(key) != items_.end(); + } + + size_t size() const { ++ std::lock_guard guard(mutex_); + return items_.size(); + } + + void clear() { ++ std::lock_guard guard(mutex_); + items_.clear(); + lru_list_.clear(); + } +@@ -106,6 +111,7 @@ class lru_unordered_map { + size_t max_size_; + std::unordered_map items_; + list_type lru_list_; ++ mutable std::mutex mutex_; + }; + + // cached miopen descriptors +-- +2.43.0 + diff --git a/machine-learning/Dockerfile b/machine-learning/Dockerfile index 2644514015..381ae7055b 100644 --- a/machine-learning/Dockerfile +++ b/machine-learning/Dockerfile @@ -17,11 +17,11 @@ RUN mkdir /opt/armnn && \ # Warning: 26.3Gb of disk space required to pull this image # https://github.com/microsoft/onnxruntime/blob/main/dockerfiles/Dockerfile.rocm -FROM rocm/dev-ubuntu-24.04:6.2.4-complete AS builder-rocm +FROM rocm/dev-ubuntu-22.04:6.1.2-complete AS builder-rocm WORKDIR /code -RUN apt-get update && apt-get install -y --no-install-recommends wget git python3.12-venv +RUN apt-get update && apt-get install -y --no-install-recommends wget git python3.10-venv # Install same version as the Dockerfile provided by onnxruntime RUN wget -nv https://github.com/Kitware/CMake/releases/download/v3.27.3/cmake-3.27.3-linux-x86_64.sh && \ chmod +x cmake-3.27.3-linux-x86_64.sh && \ @@ -32,12 +32,11 @@ RUN wget -nv https://github.com/Kitware/CMake/releases/download/v3.27.3/cmake-3. ENV PATH /code/cmake-3.27.3-linux-x86_64/bin:${PATH} # Prepare onnxruntime repository & build onnxruntime -# Note: cannot upgrade from 1.19.2 as of writing until upstream updates the ROCm CI RUN git clone --single-branch --branch v1.19.2 --recursive "https://github.com/Microsoft/onnxruntime" onnxruntime WORKDIR /code/onnxruntime # Fix for multi-threading based on comments in https://github.com/microsoft/onnxruntime/pull/19567 -COPY ./0001-fix-avoid-race-condition-for-rocm-conv-algo-caching.patch /tmp/ -RUN git apply /tmp/0001-fix-avoid-race-condition-for-rocm-conv-algo-caching.patch +COPY ./0001-guard-algo-benchmark-results.patch /tmp/ +RUN git apply /tmp/0001-guard-algo-benchmark-results.patch RUN /bin/sh ./dockerfiles/scripts/install_common_deps.sh # Note: the `parallel` setting uses a substantial amount of RAM