mirror of
https://github.com/immich-app/immich.git
synced 2025-07-07 10:14:08 -04:00
try mutex for algo cache
use OrtMutex
This commit is contained in:
parent
fe26ccd1b7
commit
f30fac971a
@ -0,0 +1,25 @@
|
|||||||
|
From e267bc9bab8b3873dba57323ddcd9a9d09a1211e Mon Sep 17 00:00:00 2001
|
||||||
|
From: mertalev <101130780+mertalev@users.noreply.github.com>
|
||||||
|
Date: Fri, 20 Dec 2024 00:59:21 -0500
|
||||||
|
Subject: [PATCH] fix: avoid race condition for rocm conv algo caching
|
||||||
|
|
||||||
|
---
|
||||||
|
onnxruntime/core/providers/rocm/nn/conv.cc | 2 ++
|
||||||
|
1 file changed, 2 insertions(+)
|
||||||
|
|
||||||
|
diff --git a/onnxruntime/core/providers/rocm/nn/conv.cc b/onnxruntime/core/providers/rocm/nn/conv.cc
|
||||||
|
index d7f47d07a8..ec438287ac 100644
|
||||||
|
--- a/onnxruntime/core/providers/rocm/nn/conv.cc
|
||||||
|
+++ b/onnxruntime/core/providers/rocm/nn/conv.cc
|
||||||
|
@@ -278,6 +278,8 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)
|
||||||
|
HIP_CALL_THROW(hipMemsetAsync(s_.b_zero, 0, malloc_size, Stream(context)));
|
||||||
|
}
|
||||||
|
|
||||||
|
+ // lock is needed to avoid race condition during algo search
|
||||||
|
+ std::lock_guard<OrtMutex> lock(s_.mutex);
|
||||||
|
if (!s_.cached_benchmark_fwd_results.contains(x_dims_miopen)) {
|
||||||
|
miopenConvAlgoPerf_t perf;
|
||||||
|
int algo_count = 1;
|
||||||
|
--
|
||||||
|
2.43.0
|
||||||
|
|
@ -23,7 +23,7 @@ WORKDIR /code
|
|||||||
|
|
||||||
RUN apt-get update && apt-get install -y --no-install-recommends wget git python3.10-venv
|
RUN apt-get update && apt-get install -y --no-install-recommends wget git python3.10-venv
|
||||||
# Install same version as the Dockerfile provided by onnxruntime
|
# Install same version as the Dockerfile provided by onnxruntime
|
||||||
RUN wget https://github.com/Kitware/CMake/releases/download/v3.27.3/cmake-3.27.3-linux-x86_64.sh && \
|
RUN wget -nv https://github.com/Kitware/CMake/releases/download/v3.27.3/cmake-3.27.3-linux-x86_64.sh && \
|
||||||
chmod +x cmake-3.27.3-linux-x86_64.sh && \
|
chmod +x cmake-3.27.3-linux-x86_64.sh && \
|
||||||
mkdir -p /code/cmake-3.27.3-linux-x86_64 && \
|
mkdir -p /code/cmake-3.27.3-linux-x86_64 && \
|
||||||
./cmake-3.27.3-linux-x86_64.sh --skip-license --prefix=/code/cmake-3.27.3-linux-x86_64 && \
|
./cmake-3.27.3-linux-x86_64.sh --skip-license --prefix=/code/cmake-3.27.3-linux-x86_64 && \
|
||||||
@ -37,8 +37,8 @@ WORKDIR /code/onnxruntime
|
|||||||
# EDIT PR
|
# EDIT PR
|
||||||
# While there's still this PR open, we need to compile on the branch of the PR
|
# While there's still this PR open, we need to compile on the branch of the PR
|
||||||
# https://github.com/microsoft/onnxruntime/pull/19567
|
# https://github.com/microsoft/onnxruntime/pull/19567
|
||||||
COPY ./rocm-PR19567.patch /tmp/
|
COPY ./0001-fix-avoid-race-condition-for-rocm-conv-algo-caching.patch /tmp/
|
||||||
RUN git apply /tmp/rocm-PR19567.patch
|
RUN git apply /tmp/0001-fix-avoid-race-condition-for-rocm-conv-algo-caching.patch
|
||||||
# END EDIT PR
|
# END EDIT PR
|
||||||
RUN /bin/sh ./dockerfiles/scripts/install_common_deps.sh
|
RUN /bin/sh ./dockerfiles/scripts/install_common_deps.sh
|
||||||
# I ran into a compilation error when parallelizing the build
|
# I ran into a compilation error when parallelizing the build
|
||||||
@ -77,10 +77,10 @@ FROM prod-cpu AS prod-openvino
|
|||||||
|
|
||||||
RUN apt-get update && \
|
RUN apt-get update && \
|
||||||
apt-get install --no-install-recommends -yqq ocl-icd-libopencl1 wget && \
|
apt-get install --no-install-recommends -yqq ocl-icd-libopencl1 wget && \
|
||||||
wget https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.17384.11/intel-igc-core_1.0.17384.11_amd64.deb && \
|
wget -nv https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.17384.11/intel-igc-core_1.0.17384.11_amd64.deb && \
|
||||||
wget https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.17384.11/intel-igc-opencl_1.0.17384.11_amd64.deb && \
|
wget -nv https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.17384.11/intel-igc-opencl_1.0.17384.11_amd64.deb && \
|
||||||
wget https://github.com/intel/compute-runtime/releases/download/24.31.30508.7/intel-opencl-icd_24.31.30508.7_amd64.deb && \
|
wget -nv https://github.com/intel/compute-runtime/releases/download/24.31.30508.7/intel-opencl-icd_24.31.30508.7_amd64.deb && \
|
||||||
wget https://github.com/intel/compute-runtime/releases/download/24.31.30508.7/libigdgmm12_22.4.1_amd64.deb && \
|
wget -nv https://github.com/intel/compute-runtime/releases/download/24.31.30508.7/libigdgmm12_22.4.1_amd64.deb && \
|
||||||
dpkg -i *.deb && \
|
dpkg -i *.deb && \
|
||||||
rm *.deb && \
|
rm *.deb && \
|
||||||
apt-get remove wget -yqq && \
|
apt-get remove wget -yqq && \
|
||||||
|
@ -1,176 +0,0 @@
|
|||||||
From a598a88db258f82a6e4bca75810921bd6bcee7e0 Mon Sep 17 00:00:00 2001
|
|
||||||
From: David Nieto <dmnieto@gmail.com>
|
|
||||||
Date: Sat, 17 Feb 2024 11:23:12 -0800
|
|
||||||
Subject: [PATCH] Disable algo caching in ROCM EP
|
|
||||||
|
|
||||||
Similar to the work done by Liangxijun-1001 in
|
|
||||||
https://github.com/apache/tvm/pull/16178 the ROCM spec mandates calling
|
|
||||||
miopenFindConvolution*Algorithm() before using any Convolution API
|
|
||||||
|
|
||||||
This is the link to the porting guide describing this requirement
|
|
||||||
https://rocmdocs.amd.com/projects/MIOpen/en/latest/MIOpen_Porting_Guide.html
|
|
||||||
|
|
||||||
Thus, this change disables the algo cache and enforces the official
|
|
||||||
API semantics
|
|
||||||
|
|
||||||
Signed-off-by: David Nieto <dmnieto@gmail.com>
|
|
||||||
---
|
|
||||||
onnxruntime/core/providers/rocm/nn/conv.cc | 61 +++++++++----------
|
|
||||||
onnxruntime/core/providers/rocm/nn/conv.h | 6 --
|
|
||||||
.../core/providers/rocm/nn/conv_transpose.cc | 17 +++---
|
|
||||||
3 files changed, 36 insertions(+), 48 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/onnxruntime/core/providers/rocm/nn/conv.cc b/onnxruntime/core/providers/rocm/nn/conv.cc
|
|
||||||
index 6214ec7bc0ea..b08aceca48b1 100644
|
|
||||||
--- a/onnxruntime/core/providers/rocm/nn/conv.cc
|
|
||||||
+++ b/onnxruntime/core/providers/rocm/nn/conv.cc
|
|
||||||
@@ -125,10 +125,8 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)
|
|
||||||
if (input_dims_changed)
|
|
||||||
s_.last_x_dims = gsl::make_span(x_dims);
|
|
||||||
|
|
||||||
- if (w_dims_changed) {
|
|
||||||
+ if (w_dims_changed)
|
|
||||||
s_.last_w_dims = gsl::make_span(w_dims);
|
|
||||||
- s_.cached_benchmark_fwd_results.clear();
|
|
||||||
- }
|
|
||||||
|
|
||||||
ORT_RETURN_IF_ERROR(conv_attrs_.ValidateInputShape(X->Shape(), W->Shape(), channels_last, channels_last));
|
|
||||||
|
|
||||||
@@ -277,35 +275,6 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)
|
|
||||||
HIP_CALL_THROW(hipMalloc(&s_.b_zero, malloc_size));
|
|
||||||
HIP_CALL_THROW(hipMemsetAsync(s_.b_zero, 0, malloc_size, Stream(context)));
|
|
||||||
}
|
|
||||||
-
|
|
||||||
- if (!s_.cached_benchmark_fwd_results.contains(x_dims_miopen)) {
|
|
||||||
- miopenConvAlgoPerf_t perf;
|
|
||||||
- int algo_count = 1;
|
|
||||||
- const ROCMExecutionProvider* rocm_ep = static_cast<const ROCMExecutionProvider*>(this->Info().GetExecutionProvider());
|
|
||||||
- static constexpr int num_algos = MIOPEN_CONVOLUTION_FWD_ALGO_COUNT;
|
|
||||||
- size_t max_ws_size = rocm_ep->GetMiopenConvUseMaxWorkspace() ? GetMaxWorkspaceSize(GetMiopenHandle(context), s_, kAllAlgos, num_algos)
|
|
||||||
- : AlgoSearchWorkspaceSize;
|
|
||||||
- IAllocatorUniquePtr<void> algo_search_workspace = GetTransientScratchBuffer<void>(max_ws_size);
|
|
||||||
- MIOPEN_RETURN_IF_ERROR(miopenFindConvolutionForwardAlgorithm(
|
|
||||||
- GetMiopenHandle(context),
|
|
||||||
- s_.x_tensor,
|
|
||||||
- s_.x_data,
|
|
||||||
- s_.w_desc,
|
|
||||||
- s_.w_data,
|
|
||||||
- s_.conv_desc,
|
|
||||||
- s_.y_tensor,
|
|
||||||
- s_.y_data,
|
|
||||||
- 1, // requestedAlgoCount
|
|
||||||
- &algo_count, // returnedAlgoCount
|
|
||||||
- &perf,
|
|
||||||
- algo_search_workspace.get(),
|
|
||||||
- max_ws_size,
|
|
||||||
- false)); // Do not do exhaustive algo search.
|
|
||||||
- s_.cached_benchmark_fwd_results.insert(x_dims_miopen, {perf.fwd_algo, perf.memory});
|
|
||||||
- }
|
|
||||||
- const auto& perf = s_.cached_benchmark_fwd_results.at(x_dims_miopen);
|
|
||||||
- s_.fwd_algo = perf.fwd_algo;
|
|
||||||
- s_.workspace_bytes = perf.memory;
|
|
||||||
} else {
|
|
||||||
// set Y
|
|
||||||
s_.Y = context->Output(0, TensorShape(s_.y_dims));
|
|
||||||
@@ -319,6 +288,34 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)
|
|
||||||
s_.y_data = reinterpret_cast<HipT*>(s_.Y->MutableData<T>());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
+ {
|
|
||||||
+ /* FindConvolution must always be called by the runtime */
|
|
||||||
+ TensorShapeVector x_dims_miopen{x_dims.begin(), x_dims.end()};
|
|
||||||
+ miopenConvAlgoPerf_t perf;
|
|
||||||
+ int algo_count = 1;
|
|
||||||
+ const ROCMExecutionProvider* rocm_ep = static_cast<const ROCMExecutionProvider*>(this->Info().GetExecutionProvider());
|
|
||||||
+ static constexpr int num_algos = MIOPEN_CONVOLUTION_FWD_ALGO_COUNT;
|
|
||||||
+ size_t max_ws_size = rocm_ep->GetMiopenConvUseMaxWorkspace() ? GetMaxWorkspaceSize(GetMiopenHandle(context), s_, kAllAlgos, num_algos)
|
|
||||||
+ : AlgoSearchWorkspaceSize;
|
|
||||||
+ IAllocatorUniquePtr<void> algo_search_workspace = GetTransientScratchBuffer<void>(max_ws_size);
|
|
||||||
+ MIOPEN_RETURN_IF_ERROR(miopenFindConvolutionForwardAlgorithm(
|
|
||||||
+ GetMiopenHandle(context),
|
|
||||||
+ s_.x_tensor,
|
|
||||||
+ s_.x_data,
|
|
||||||
+ s_.w_desc,
|
|
||||||
+ s_.w_data,
|
|
||||||
+ s_.conv_desc,
|
|
||||||
+ s_.y_tensor,
|
|
||||||
+ s_.y_data,
|
|
||||||
+ 1, // requestedAlgoCount
|
|
||||||
+ &algo_count, // returnedAlgoCount
|
|
||||||
+ &perf,
|
|
||||||
+ algo_search_workspace.get(),
|
|
||||||
+ max_ws_size,
|
|
||||||
+ false)); // Do not do exhaustive algo search.
|
|
||||||
+ s_.fwd_algo = perf.fwd_algo;
|
|
||||||
+ s_.workspace_bytes = perf.memory;
|
|
||||||
+ }
|
|
||||||
return Status::OK();
|
|
||||||
}
|
|
||||||
|
|
||||||
diff --git a/onnxruntime/core/providers/rocm/nn/conv.h b/onnxruntime/core/providers/rocm/nn/conv.h
|
|
||||||
index bc9846203e57..d54218f25854 100644
|
|
||||||
--- a/onnxruntime/core/providers/rocm/nn/conv.h
|
|
||||||
+++ b/onnxruntime/core/providers/rocm/nn/conv.h
|
|
||||||
@@ -108,9 +108,6 @@ class lru_unordered_map {
|
|
||||||
list_type lru_list_;
|
|
||||||
};
|
|
||||||
|
|
||||||
-// cached miopen descriptors
|
|
||||||
-constexpr size_t MAX_CACHED_ALGO_PERF_RESULTS = 10000;
|
|
||||||
-
|
|
||||||
template <typename AlgoPerfType>
|
|
||||||
struct MiopenConvState {
|
|
||||||
// if x/w dims changed, update algo and miopenTensors
|
|
||||||
@@ -148,9 +145,6 @@ struct MiopenConvState {
|
|
||||||
decltype(AlgoPerfType().memory) memory;
|
|
||||||
};
|
|
||||||
|
|
||||||
- lru_unordered_map<TensorShapeVector, PerfFwdResultParams, vector_hash> cached_benchmark_fwd_results{MAX_CACHED_ALGO_PERF_RESULTS};
|
|
||||||
- lru_unordered_map<TensorShapeVector, PerfBwdResultParams, vector_hash> cached_benchmark_bwd_results{MAX_CACHED_ALGO_PERF_RESULTS};
|
|
||||||
-
|
|
||||||
// Some properties needed to support asymmetric padded Conv nodes
|
|
||||||
bool post_slicing_required;
|
|
||||||
TensorShapeVector slice_starts;
|
|
||||||
diff --git a/onnxruntime/core/providers/rocm/nn/conv_transpose.cc b/onnxruntime/core/providers/rocm/nn/conv_transpose.cc
|
|
||||||
index 7447113fdf84..45ed4c8ac37a 100644
|
|
||||||
--- a/onnxruntime/core/providers/rocm/nn/conv_transpose.cc
|
|
||||||
+++ b/onnxruntime/core/providers/rocm/nn/conv_transpose.cc
|
|
||||||
@@ -76,7 +76,6 @@ Status ConvTranspose<T, NHWC>::DoConvTranspose(OpKernelContext* context, bool dy
|
|
||||||
|
|
||||||
if (w_dims_changed) {
|
|
||||||
s_.last_w_dims = gsl::make_span(w_dims);
|
|
||||||
- s_.cached_benchmark_bwd_results.clear();
|
|
||||||
}
|
|
||||||
|
|
||||||
ConvTransposeAttributes::Prepare p;
|
|
||||||
@@ -127,12 +126,13 @@ Status ConvTranspose<T, NHWC>::DoConvTranspose(OpKernelContext* context, bool dy
|
|
||||||
|
|
||||||
y_data = reinterpret_cast<HipT*>(p.Y->MutableData<T>());
|
|
||||||
|
|
||||||
- if (!s_.cached_benchmark_bwd_results.contains(x_dims)) {
|
|
||||||
- IAllocatorUniquePtr<void> algo_search_workspace = GetScratchBuffer<void>(AlgoSearchWorkspaceSize, context->GetComputeStream());
|
|
||||||
-
|
|
||||||
- miopenConvAlgoPerf_t perf;
|
|
||||||
- int algo_count = 1;
|
|
||||||
- MIOPEN_RETURN_IF_ERROR(miopenFindConvolutionBackwardDataAlgorithm(
|
|
||||||
+ }
|
|
||||||
+ // The following is required before calling convolution, we cannot cache the results
|
|
||||||
+ {
|
|
||||||
+ IAllocatorUniquePtr<void> algo_search_workspace = GetScratchBuffer<void>(AlgoSearchWorkspaceSize, context->GetComputeStream());
|
|
||||||
+ miopenConvAlgoPerf_t perf;
|
|
||||||
+ int algo_count = 1;
|
|
||||||
+ MIOPEN_RETURN_IF_ERROR(miopenFindConvolutionBackwardDataAlgorithm(
|
|
||||||
GetMiopenHandle(context),
|
|
||||||
s_.x_tensor,
|
|
||||||
x_data,
|
|
||||||
@@ -147,10 +147,7 @@ Status ConvTranspose<T, NHWC>::DoConvTranspose(OpKernelContext* context, bool dy
|
|
||||||
algo_search_workspace.get(),
|
|
||||||
AlgoSearchWorkspaceSize,
|
|
||||||
false));
|
|
||||||
- s_.cached_benchmark_bwd_results.insert(x_dims, {perf.bwd_data_algo, perf.memory});
|
|
||||||
- }
|
|
||||||
|
|
||||||
- const auto& perf = s_.cached_benchmark_bwd_results.at(x_dims);
|
|
||||||
s_.bwd_data_algo = perf.bwd_data_algo;
|
|
||||||
s_.workspace_bytes = perf.memory;
|
|
||||||
}
|
|
Loading…
x
Reference in New Issue
Block a user