diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 4007b07e2e..e7cf85392b 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -49,23 +49,23 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - suffix: ["", "-cuda", "-openvino", "-armnn","-rknn"] + suffix: ['', '-cuda', '-rocm', '-openvino', '-armnn', '-rknn'] steps: - - name: Login to GitHub Container Registry - uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} - - name: Re-tag image - run: | - REGISTRY_NAME="ghcr.io" - REPOSITORY=${{ github.repository_owner }}/immich-machine-learning - TAG_OLD=main${{ matrix.suffix }} - TAG_PR=${{ github.event.number == 0 && github.ref_name || format('pr-{0}', github.event.number) }}${{ matrix.suffix }} - TAG_COMMIT=commit-${{ github.event_name != 'pull_request' && github.sha || github.event.pull_request.head.sha }}${{ matrix.suffix }} - docker buildx imagetools create -t $REGISTRY_NAME/$REPOSITORY:$TAG_PR $REGISTRY_NAME/$REPOSITORY:$TAG_OLD - docker buildx imagetools create -t $REGISTRY_NAME/$REPOSITORY:$TAG_COMMIT $REGISTRY_NAME/$REPOSITORY:$TAG_OLD
+ - name: Login to GitHub Container Registry + uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: Re-tag image + run: | + REGISTRY_NAME="ghcr.io" + REPOSITORY=${{ github.repository_owner }}/immich-machine-learning + TAG_OLD=main${{ matrix.suffix }} + TAG_PR=${{ github.event.number == 0 && github.ref_name || format('pr-{0}', github.event.number) }}${{ matrix.suffix }} + TAG_COMMIT=commit-${{ github.event_name != 'pull_request' && github.sha || github.event.pull_request.head.sha }}${{ matrix.suffix }} + docker buildx imagetools create -t $REGISTRY_NAME/$REPOSITORY:$TAG_PR $REGISTRY_NAME/$REPOSITORY:$TAG_OLD + docker buildx imagetools create -t $REGISTRY_NAME/$REPOSITORY:$TAG_COMMIT $REGISTRY_NAME/$REPOSITORY:$TAG_OLD retag_server: name: Re-Tag Server @@ -74,7 +74,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - suffix: [""] + suffix: [''] steps: - name: Login to GitHub Container Registry uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3 @@ -120,6 +120,11 @@ jobs: device: cuda suffix: -cuda + - platform: linux/amd64 + runner: mich + device: rocm + suffix: -rocm + - platform: linux/amd64 runner: ubuntu-latest device: openvino @@ -129,7 +134,7 @@ jobs: runner: ubuntu-24.04-arm device: armnn suffix: -armnn - + - platform: linux/arm64 runner: ubuntu-24.04-arm device: rknn @@ -220,6 +225,8 @@ jobs: - device: cpu - device: cuda suffix: -cuda + - device: rocm + suffix: -rocm - device: openvino suffix: -openvino - device: armnn @@ -257,7 +264,7 @@ jobs: id: meta uses: docker/metadata-action@902fa8ec7d6ecbf8d84d538b9b233a880e428804 # v5 env: - DOCKER_METADATA_PR_HEAD_SHA: "true" + DOCKER_METADATA_PR_HEAD_SHA: 'true' with: flavor: | # Disable latest tag @@ -411,7 +418,7 @@ jobs: id: meta uses: 
docker/metadata-action@902fa8ec7d6ecbf8d84d538b9b233a880e428804 # v5 env: - DOCKER_METADATA_PR_HEAD_SHA: "true" + DOCKER_METADATA_PR_HEAD_SHA: 'true' with: flavor: | # Disable latest tag diff --git a/docker/docker-compose.dev.yml b/docker/docker-compose.dev.yml index 78254c7662..322c8ae8bc 100644 --- a/docker/docker-compose.dev.yml +++ b/docker/docker-compose.dev.yml @@ -95,12 +95,12 @@ services: image: immich-machine-learning-dev:latest # extends: # file: hwaccel.ml.yml - # service: cpu # set to one of [armnn, cuda, openvino, openvino-wsl, rknn] for accelerated inference + # service: cpu # set to one of [armnn, cuda, rocm, openvino, openvino-wsl, rknn] for accelerated inference build: context: ../machine-learning dockerfile: Dockerfile args: - - DEVICE=cpu # set to one of [armnn, cuda, openvino, openvino-wsl, rknn] for accelerated inference + - DEVICE=cpu # set to one of [armnn, cuda, rocm, openvino, openvino-wsl, rknn] for accelerated inference ports: - 3003:3003 volumes: diff --git a/docker/docker-compose.prod.yml b/docker/docker-compose.prod.yml index adb00dfbed..c0fcb079ad 100644 --- a/docker/docker-compose.prod.yml +++ b/docker/docker-compose.prod.yml @@ -38,12 +38,12 @@ services: image: immich-machine-learning:latest # extends: # file: hwaccel.ml.yml - # service: cpu # set to one of [armnn, cuda, openvino, openvino-wsl, rknn] for accelerated inference + # service: cpu # set to one of [armnn, cuda, rocm, openvino, openvino-wsl, rknn] for accelerated inference build: context: ../machine-learning dockerfile: Dockerfile args: - - DEVICE=cpu # set to one of [armnn, cuda, openvino, openvino-wsl, rknn] for accelerated inference + - DEVICE=cpu # set to one of [armnn, cuda, rocm, openvino, openvino-wsl, rknn] for accelerated inference ports: - 3003:3003 volumes: diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 6be3189b41..55db2a7cc5 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -33,12 +33,12 @@ services: 
immich-machine-learning: container_name: immich_machine_learning - # For hardware acceleration, add one of -[armnn, cuda, openvino, rknn] to the image tag. + # For hardware acceleration, add one of -[armnn, cuda, rocm, openvino, rknn] to the image tag. # Example tag: ${IMMICH_VERSION:-release}-cuda image: ghcr.io/immich-app/immich-machine-learning:${IMMICH_VERSION:-release} # extends: # uncomment this section for hardware acceleration - see https://immich.app/docs/features/ml-hardware-acceleration # file: hwaccel.ml.yml - # service: cpu # set to one of [armnn, cuda, openvino, openvino-wsl, rknn] for accelerated inference - use the `-wsl` version for WSL2 where applicable + # service: cpu # set to one of [armnn, cuda, rocm, openvino, openvino-wsl, rknn] for accelerated inference - use the `-wsl` version for WSL2 where applicable volumes: - model-cache:/cache env_file: diff --git a/docker/hwaccel.ml.yml b/docker/hwaccel.ml.yml index c7035047c7..111202d022 100644 --- a/docker/hwaccel.ml.yml +++ b/docker/hwaccel.ml.yml @@ -33,6 +33,13 @@ services: capabilities: - gpu + rocm: + group_add: + - video + devices: + - /dev/dri:/dev/dri + - /dev/kfd:/dev/kfd + openvino: device_cgroup_rules: - 'c 189:* rmw' diff --git a/docs/docs/features/ml-hardware-acceleration.md b/docs/docs/features/ml-hardware-acceleration.md index 31d9803c8f..7d001ca6e5 100644 --- a/docs/docs/features/ml-hardware-acceleration.md +++ b/docs/docs/features/ml-hardware-acceleration.md @@ -11,6 +11,7 @@ You do not need to redo any machine learning jobs after enabling hardware accele - ARM NN (Mali) - CUDA (NVIDIA GPUs with [compute capability](https://developer.nvidia.com/cuda-gpus) 5.2 or higher) +- ROCm (AMD GPUs) - OpenVINO (Intel GPUs such as Iris Xe and Arc) - RKNN (Rockchip) @@ -44,6 +45,12 @@ You do not need to redo any machine learning jobs after enabling hardware accele - The installed driver must be >= 535 (it must support CUDA 12.2). 
- On Linux (except for WSL2), you also need to have [NVIDIA Container Toolkit][nvct] installed. +#### ROCm + +- The GPU must be supported by ROCm. If it isn't officially supported, you can attempt to use the `HSA_OVERRIDE_GFX_VERSION` environment variable: `HSA_OVERRIDE_GFX_VERSION=<a supported version, e.g. 10.3.0>`. If this doesn't work, you might need to also set `HSA_USE_SVM=0`. +- The ROCm image is quite large and requires at least 35GiB of free disk space. However, pulling later updates to the service through Docker will generally only amount to a few hundred megabytes as the rest will be cached. +- This backend is new and may experience some issues. For example, GPU power consumption can be higher than usual after running inference, even if the machine learning service is idle. In this case, it will only go back to normal after being idle for 5 minutes (configurable with the [MACHINE_LEARNING_MODEL_TTL](/docs/install/environment-variables) setting). + #### OpenVINO - Integrated GPUs are more likely to experience issues than discrete GPUs, especially for older processors or servers with low RAM. @@ -64,12 +71,12 @@ You do not need to redo any machine learning jobs after enabling hardware accele 1. If you do not already have it, download the latest [`hwaccel.ml.yml`][hw-file] file and ensure it's in the same folder as the `docker-compose.yml`. 2. In the `docker-compose.yml` under `immich-machine-learning`, uncomment the `extends` section and change `cpu` to the appropriate backend. -3. Still in `immich-machine-learning`, add one of -[armnn, cuda, openvino] to the `image` section's tag at the end of the line. +3. Still in `immich-machine-learning`, add one of -[armnn, cuda, rocm, openvino] to the `image` section's tag at the end of the line. 4. Redeploy the `immich-machine-learning` container with these updated settings. ### Confirming Device Usage -You can confirm the device is being recognized and used by checking its utilization. 
There are many tools to display this, such as `nvtop` for NVIDIA or Intel and `intel_gpu_top` for Intel. +You can confirm the device is being recognized and used by checking its utilization. There are many tools to display this, such as `nvtop` for NVIDIA or Intel, `intel_gpu_top` for Intel, and `radeontop` for AMD. You can also check the logs of the `immich-machine-learning` container. When a Smart Search or Face Detection job begins, or when you search with text in Immich, you should either see a log for `Available ORT providers` containing the relevant provider (e.g. `CUDAExecutionProvider` in the case of CUDA), or a `Loaded ANN model` log entry without errors in the case of ARM NN. diff --git a/docs/docs/guides/remote-machine-learning.md b/docs/docs/guides/remote-machine-learning.md index 1abf7d4e54..d9b644f106 100644 --- a/docs/docs/guides/remote-machine-learning.md +++ b/docs/docs/guides/remote-machine-learning.md @@ -23,12 +23,12 @@ name: immich_remote_ml services: immich-machine-learning: container_name: immich_machine_learning - # For hardware acceleration, add one of -[armnn, cuda, openvino] to the image tag. + # For hardware acceleration, add one of -[armnn, cuda, rocm, openvino] to the image tag. 
# Example tag: ${IMMICH_VERSION:-release}-cuda image: ghcr.io/immich-app/immich-machine-learning:${IMMICH_VERSION:-release} # extends: # file: hwaccel.ml.yml - # service: # set to one of [armnn, cuda, openvino, openvino-wsl] for accelerated inference - use the `-wsl` version for WSL2 where applicable + # service: # set to one of [armnn, cuda, rocm, openvino, openvino-wsl] for accelerated inference - use the `-wsl` version for WSL2 where applicable volumes: - model-cache:/cache restart: always diff --git a/machine-learning/Dockerfile b/machine-learning/Dockerfile index 4129ffcb8f..a790999dc7 100644 --- a/machine-learning/Dockerfile +++ b/machine-learning/Dockerfile @@ -17,6 +17,34 @@ RUN mkdir /opt/armnn && \ FROM builder-cpu AS builder-rknn +# Warning: 25GiB+ disk space required to pull this image +# TODO: find a way to reduce the image size +FROM rocm/dev-ubuntu-22.04:6.3.4-complete AS builder-rocm + +WORKDIR /code + +RUN apt-get update && apt-get install -y --no-install-recommends wget git python3.10-venv +RUN wget -nv https://github.com/Kitware/CMake/releases/download/v3.30.1/cmake-3.30.1-linux-x86_64.sh && \ + chmod +x cmake-3.30.1-linux-x86_64.sh && \ + mkdir -p /code/cmake-3.30.1-linux-x86_64 && \ + ./cmake-3.30.1-linux-x86_64.sh --skip-license --prefix=/code/cmake-3.30.1-linux-x86_64 && \ + rm cmake-3.30.1-linux-x86_64.sh + +ENV PATH=/code/cmake-3.30.1-linux-x86_64/bin:${PATH} + +RUN git clone --single-branch --branch v1.20.1 --recursive "https://github.com/Microsoft/onnxruntime" onnxruntime +WORKDIR /code/onnxruntime +# Fix for multi-threading based on comments in https://github.com/microsoft/onnxruntime/pull/19567 +# TODO: find a way to fix this without disabling algo caching +COPY ./patches/* /tmp/ +RUN git apply /tmp/*.patch + +RUN /bin/sh ./dockerfiles/scripts/install_common_deps.sh +# Note: the `parallel` setting uses a substantial amount of RAM +RUN ./build.sh --allow_running_as_root --config Release --build_wheel --update --build --parallel 17 
--cmake_extra_defines\ + ONNXRUNTIME_VERSION=1.20.1 --skip_tests --use_rocm --rocm_home=/opt/rocm +RUN mv /code/onnxruntime/build/Linux/Release/dist/*.whl /opt/ + FROM builder-${DEVICE} AS builder ARG DEVICE @@ -32,6 +60,9 @@ RUN --mount=type=cache,target=/root/.cache/uv \ --mount=type=bind,source=uv.lock,target=uv.lock \ --mount=type=bind,source=pyproject.toml,target=pyproject.toml \ uv sync --frozen --extra ${DEVICE} --no-dev --no-editable --no-install-project --compile-bytecode --no-progress --active --link-mode copy +RUN if [ "$DEVICE" = "rocm" ]; then \ + uv pip install /opt/onnxruntime_rocm-*.whl; \ + fi FROM python:3.11-slim-bookworm@sha256:614c8691ab74150465ec9123378cd4dde7a6e57be9e558c3108df40664667a4c AS prod-cpu @@ -39,10 +70,10 @@ FROM prod-cpu AS prod-openvino RUN apt-get update && \ apt-get install --no-install-recommends -yqq ocl-icd-libopencl1 wget && \ - wget https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.17384.11/intel-igc-core_1.0.17384.11_amd64.deb && \ - wget https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.17384.11/intel-igc-opencl_1.0.17384.11_amd64.deb && \ - wget https://github.com/intel/compute-runtime/releases/download/24.31.30508.7/intel-opencl-icd_24.31.30508.7_amd64.deb && \ - wget https://github.com/intel/compute-runtime/releases/download/24.31.30508.7/libigdgmm12_22.4.1_amd64.deb && \ + wget -nv https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.17384.11/intel-igc-core_1.0.17384.11_amd64.deb && \ + wget -nv https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.17384.11/intel-igc-opencl_1.0.17384.11_amd64.deb && \ + wget -nv https://github.com/intel/compute-runtime/releases/download/24.31.30508.7/intel-opencl-icd_24.31.30508.7_amd64.deb && \ + wget -nv https://github.com/intel/compute-runtime/releases/download/24.31.30508.7/libigdgmm12_22.4.1_amd64.deb && \ dpkg -i *.deb && \ rm *.deb && \ apt-get remove wget -yqq && \ @@ -59,6 +90,8 @@ 
COPY --from=builder-cuda /usr/local/bin/python3 /usr/local/bin/python3 COPY --from=builder-cuda /usr/local/lib/python3.11 /usr/local/lib/python3.11 COPY --from=builder-cuda /usr/local/lib/libpython3.11.so /usr/local/lib/libpython3.11.so +FROM rocm/dev-ubuntu-22.04:6.3.4-complete AS prod-rocm + FROM prod-cpu AS prod-armnn ENV LD_LIBRARY_PATH=/opt/armnn @@ -81,13 +114,12 @@ COPY --from=builder-armnn \ FROM prod-cpu AS prod-rknn -ADD --checksum=sha256:73993ed4b440460825f21611731564503cc1d5a0c123746477da6cd574f34885 https://github.com/airockchip/rknn-toolkit2/raw/refs/tags/v2.3.0/rknpu2/runtime/Linux/librknn_api/aarch64/librknnrt.so /usr/lib/ - FROM prod-${DEVICE} AS prod + ARG DEVICE RUN apt-get update && \ - apt-get install -y --no-install-recommends tini $(if ! [ "$DEVICE" = "openvino" ]; then echo "libmimalloc2.0"; fi) && \ + apt-get install -y --no-install-recommends tini $(if ! [ "$DEVICE" = "openvino" ] && ! [ "$DEVICE" = "rocm" ]; then echo "libmimalloc2.0"; fi) && \ apt-get autoremove -yqq && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* diff --git a/machine-learning/README.md b/machine-learning/README.md index 4146a6f9de..3d1f09d70b 100644 --- a/machine-learning/README.md +++ b/machine-learning/README.md @@ -7,7 +7,7 @@ This project uses [uv](https://docs.astral.sh/uv/getting-started/installation/), so be sure to install it first. Running `uv sync --extra cpu` will install everything you need in an isolated virtual environment. -CUDA and OpenVINO are supported as acceleration APIs. To use them, you can replace `--group cpu` with either of `--group cuda` or `--group openvino`. In the case of CUDA, a [compute capability](https://developer.nvidia.com/cuda-gpus) of 5.2 or higher is required. +CUDA, ROCm and OpenVINO are supported as acceleration APIs. To use them, you can replace `--extra cpu` with one of `--extra cuda`, `--extra rocm` or `--extra openvino`. 
In the case of CUDA, a [compute capability](https://developer.nvidia.com/cuda-gpus) of 5.2 or higher is required. To add or remove dependencies, you can use the commands `uv add $PACKAGE_NAME` and `uv remove $PACKAGE_NAME`, respectively. Be sure to commit the `uv.lock` and `pyproject.toml` files with `uv lock` to reflect any changes in dependencies. diff --git a/machine-learning/app/models/constants.py b/machine-learning/app/models/constants.py index d04ed4d543..79020462a1 100644 --- a/machine-learning/app/models/constants.py +++ b/machine-learning/app/models/constants.py @@ -75,7 +75,12 @@ _INSIGHTFACE_MODELS = { } -SUPPORTED_PROVIDERS = ["CUDAExecutionProvider", "OpenVINOExecutionProvider", "CPUExecutionProvider"] +SUPPORTED_PROVIDERS = [ + "CUDAExecutionProvider", + "ROCMExecutionProvider", + "OpenVINOExecutionProvider", + "CPUExecutionProvider", +] RKNN_SUPPORTED_SOCS = ["rk3566", "rk3568", "rk3576", "rk3588"] RKNN_COREMASK_SUPPORTED_SOCS = ["rk3576", "rk3588"] diff --git a/machine-learning/app/sessions/ort.py b/machine-learning/app/sessions/ort.py index 00c7ad50a9..d15f2d3546 100644 --- a/machine-learning/app/sessions/ort.py +++ b/machine-learning/app/sessions/ort.py @@ -88,7 +88,7 @@ class OrtSession: match provider: case "CPUExecutionProvider": options = {"arena_extend_strategy": "kSameAsRequested"} - case "CUDAExecutionProvider": + case "CUDAExecutionProvider" | "ROCMExecutionProvider": options = {"arena_extend_strategy": "kSameAsRequested", "device_id": settings.device_id} case "OpenVINOExecutionProvider": options = { diff --git a/machine-learning/app/test_main.py b/machine-learning/app/test_main.py index 61424ea732..b8eea233d7 100644 --- a/machine-learning/app/test_main.py +++ b/machine-learning/app/test_main.py @@ -180,6 +180,7 @@ class TestOrtSession: OV_EP = ["OpenVINOExecutionProvider", "CPUExecutionProvider"] CUDA_EP_OUT_OF_ORDER = ["CPUExecutionProvider", "CUDAExecutionProvider"] TRT_EP = ["TensorrtExecutionProvider", "CUDAExecutionProvider", 
"CPUExecutionProvider"] + ROCM_EP = ["ROCMExecutionProvider", "CPUExecutionProvider"] @pytest.mark.providers(CPU_EP) def test_sets_cpu_provider(self, providers: list[str]) -> None: @@ -219,6 +220,12 @@ class TestOrtSession: assert session.providers == self.CUDA_EP + @pytest.mark.providers(ROCM_EP) + def test_uses_rocm(self, providers: list[str]) -> None: + session = OrtSession("ViT-B-32__openai") + + assert session.providers == self.ROCM_EP + def test_sets_provider_kwarg(self) -> None: providers = ["CUDAExecutionProvider"] session = OrtSession("ViT-B-32__openai", providers=providers) @@ -235,19 +242,33 @@ class TestOrtSession: {"arena_extend_strategy": "kSameAsRequested"}, ] - def test_sets_device_id_for_openvino(self) -> None: + def test_sets_provider_options_for_openvino(self) -> None: + model_path = "/cache/ViT-B-32__openai/textual/model.onnx" os.environ["MACHINE_LEARNING_DEVICE_ID"] = "1" - session = OrtSession("ViT-B-32__openai", providers=["OpenVINOExecutionProvider"]) + session = OrtSession(model_path, providers=["OpenVINOExecutionProvider"]) - assert session.provider_options[0]["device_type"] == "GPU.1" + assert session.provider_options == [ + { + "device_type": "GPU.1", + "precision": "FP32", + "cache_dir": "/cache/ViT-B-32__openai/textual/openvino", + } + ] - def test_sets_device_id_for_cuda(self) -> None: + def test_sets_provider_options_for_cuda(self) -> None: os.environ["MACHINE_LEARNING_DEVICE_ID"] = "1" session = OrtSession("ViT-B-32__openai", providers=["CUDAExecutionProvider"]) - assert session.provider_options[0]["device_id"] == "1" + assert session.provider_options == [{"arena_extend_strategy": "kSameAsRequested", "device_id": "1"}] + + def test_sets_provider_options_for_rocm(self) -> None: + os.environ["MACHINE_LEARNING_DEVICE_ID"] = "1" + + session = OrtSession("ViT-B-32__openai", providers=["ROCMExecutionProvider"]) + + assert session.provider_options == [{"arena_extend_strategy": "kSameAsRequested", "device_id": "1"}] def 
test_sets_provider_options_kwarg(self) -> None: session = OrtSession( diff --git a/machine-learning/patches/0001-disable-rocm-conv-algo-caching.patch b/machine-learning/patches/0001-disable-rocm-conv-algo-caching.patch new file mode 100644 index 0000000000..6627f67778 --- /dev/null +++ b/machine-learning/patches/0001-disable-rocm-conv-algo-caching.patch @@ -0,0 +1,179 @@ +commit 16839b58d9b3c3162a67ce5d776b36d4d24e801f +Author: mertalev <101130780+mertalev@users.noreply.github.com> +Date: Wed Mar 5 11:25:38 2025 -0500 + + disable algo caching (attributed to @dmnieto in https://github.com/microsoft/onnxruntime/pull/19567) + +diff --git a/onnxruntime/core/providers/rocm/nn/conv.cc b/onnxruntime/core/providers/rocm/nn/conv.cc +index d7f47d07a8..4060a2af52 100644 +--- a/onnxruntime/core/providers/rocm/nn/conv.cc ++++ b/onnxruntime/core/providers/rocm/nn/conv.cc +@@ -127,7 +127,6 @@ Status Conv::UpdateState(OpKernelContext* context, bool bias_expected) + + if (w_dims_changed) { + s_.last_w_dims = gsl::make_span(w_dims); +- s_.cached_benchmark_fwd_results.clear(); + } + + ORT_RETURN_IF_ERROR(conv_attrs_.ValidateInputShape(X->Shape(), W->Shape(), channels_last, channels_last)); +@@ -277,35 +276,6 @@ Status Conv::UpdateState(OpKernelContext* context, bool bias_expected) + HIP_CALL_THROW(hipMalloc(&s_.b_zero, malloc_size)); + HIP_CALL_THROW(hipMemsetAsync(s_.b_zero, 0, malloc_size, Stream(context))); + } +- +- if (!s_.cached_benchmark_fwd_results.contains(x_dims_miopen)) { +- miopenConvAlgoPerf_t perf; +- int algo_count = 1; +- const ROCMExecutionProvider* rocm_ep = static_cast(this->Info().GetExecutionProvider()); +- static constexpr int num_algos = MIOPEN_CONVOLUTION_FWD_ALGO_COUNT; +- size_t max_ws_size = rocm_ep->GetMiopenConvUseMaxWorkspace() ? 
GetMaxWorkspaceSize(GetMiopenHandle(context), s_, kAllAlgos, num_algos, rocm_ep->GetDeviceId()) +- : AlgoSearchWorkspaceSize; +- IAllocatorUniquePtr algo_search_workspace = GetTransientScratchBuffer(max_ws_size); +- MIOPEN_RETURN_IF_ERROR(miopenFindConvolutionForwardAlgorithm( +- GetMiopenHandle(context), +- s_.x_tensor, +- s_.x_data, +- s_.w_desc, +- s_.w_data, +- s_.conv_desc, +- s_.y_tensor, +- s_.y_data, +- 1, // requestedAlgoCount +- &algo_count, // returnedAlgoCount +- &perf, +- algo_search_workspace.get(), +- max_ws_size, +- false)); // Do not do exhaustive algo search. +- s_.cached_benchmark_fwd_results.insert(x_dims_miopen, {perf.fwd_algo, perf.memory}); +- } +- const auto& perf = s_.cached_benchmark_fwd_results.at(x_dims_miopen); +- s_.fwd_algo = perf.fwd_algo; +- s_.workspace_bytes = perf.memory; + } else { + // set Y + s_.Y = context->Output(0, TensorShape(s_.y_dims)); +@@ -319,6 +289,31 @@ Status Conv::UpdateState(OpKernelContext* context, bool bias_expected) + s_.y_data = reinterpret_cast(s_.Y->MutableData()); + } + } ++ ++ miopenConvAlgoPerf_t perf; ++ int algo_count = 1; ++ const ROCMExecutionProvider* rocm_ep = static_cast(this->Info().GetExecutionProvider()); ++ static constexpr int num_algos = MIOPEN_CONVOLUTION_FWD_ALGO_COUNT; ++ size_t max_ws_size = rocm_ep->GetMiopenConvUseMaxWorkspace() ? GetMaxWorkspaceSize(GetMiopenHandle(context), s_, kAllAlgos, num_algos, rocm_ep->GetDeviceId()) ++ : AlgoSearchWorkspaceSize; ++ IAllocatorUniquePtr algo_search_workspace = GetTransientScratchBuffer(max_ws_size); ++ MIOPEN_RETURN_IF_ERROR(miopenFindConvolutionForwardAlgorithm( ++ GetMiopenHandle(context), ++ s_.x_tensor, ++ s_.x_data, ++ s_.w_desc, ++ s_.w_data, ++ s_.conv_desc, ++ s_.y_tensor, ++ s_.y_data, ++ 1, // requestedAlgoCount ++ &algo_count, // returnedAlgoCount ++ &perf, ++ algo_search_workspace.get(), ++ max_ws_size, ++ false)); // Do not do exhaustive algo search. 
++ s_.fwd_algo = perf.fwd_algo; ++ s_.workspace_bytes = perf.memory; + return Status::OK(); + } + +diff --git a/onnxruntime/core/providers/rocm/nn/conv.h b/onnxruntime/core/providers/rocm/nn/conv.h +index bc9846203e..d54218f258 100644 +--- a/onnxruntime/core/providers/rocm/nn/conv.h ++++ b/onnxruntime/core/providers/rocm/nn/conv.h +@@ -108,9 +108,6 @@ class lru_unordered_map { + list_type lru_list_; + }; + +-// cached miopen descriptors +-constexpr size_t MAX_CACHED_ALGO_PERF_RESULTS = 10000; +- + template + struct MiopenConvState { + // if x/w dims changed, update algo and miopenTensors +@@ -148,9 +145,6 @@ struct MiopenConvState { + decltype(AlgoPerfType().memory) memory; + }; + +- lru_unordered_map cached_benchmark_fwd_results{MAX_CACHED_ALGO_PERF_RESULTS}; +- lru_unordered_map cached_benchmark_bwd_results{MAX_CACHED_ALGO_PERF_RESULTS}; +- + // Some properties needed to support asymmetric padded Conv nodes + bool post_slicing_required; + TensorShapeVector slice_starts; +diff --git a/onnxruntime/core/providers/rocm/nn/conv_transpose.cc b/onnxruntime/core/providers/rocm/nn/conv_transpose.cc +index 7447113fdf..a662e35b2e 100644 +--- a/onnxruntime/core/providers/rocm/nn/conv_transpose.cc ++++ b/onnxruntime/core/providers/rocm/nn/conv_transpose.cc +@@ -76,7 +76,6 @@ Status ConvTranspose::DoConvTranspose(OpKernelContext* context, bool dy + + if (w_dims_changed) { + s_.last_w_dims = gsl::make_span(w_dims); +- s_.cached_benchmark_bwd_results.clear(); + } + + ConvTransposeAttributes::Prepare p; +@@ -126,35 +125,29 @@ Status ConvTranspose::DoConvTranspose(OpKernelContext* context, bool dy + } + + y_data = reinterpret_cast(p.Y->MutableData()); +- +- if (!s_.cached_benchmark_bwd_results.contains(x_dims)) { +- IAllocatorUniquePtr algo_search_workspace = GetScratchBuffer(AlgoSearchWorkspaceSize, context->GetComputeStream()); +- +- miopenConvAlgoPerf_t perf; +- int algo_count = 1; +- MIOPEN_RETURN_IF_ERROR(miopenFindConvolutionBackwardDataAlgorithm( +- 
GetMiopenHandle(context), +- s_.x_tensor, +- x_data, +- s_.w_desc, +- w_data, +- s_.conv_desc, +- s_.y_tensor, +- y_data, +- 1, +- &algo_count, +- &perf, +- algo_search_workspace.get(), +- AlgoSearchWorkspaceSize, +- false)); +- s_.cached_benchmark_bwd_results.insert(x_dims, {perf.bwd_data_algo, perf.memory}); +- } +- +- const auto& perf = s_.cached_benchmark_bwd_results.at(x_dims); +- s_.bwd_data_algo = perf.bwd_data_algo; +- s_.workspace_bytes = perf.memory; + } + ++ IAllocatorUniquePtr algo_search_workspace = GetScratchBuffer(AlgoSearchWorkspaceSize, context->GetComputeStream()); ++ miopenConvAlgoPerf_t perf; ++ int algo_count = 1; ++ MIOPEN_RETURN_IF_ERROR(miopenFindConvolutionBackwardDataAlgorithm( ++ GetMiopenHandle(context), ++ s_.x_tensor, ++ x_data, ++ s_.w_desc, ++ w_data, ++ s_.conv_desc, ++ s_.y_tensor, ++ y_data, ++ 1, ++ &algo_count, ++ &perf, ++ algo_search_workspace.get(), ++ AlgoSearchWorkspaceSize, ++ false)); ++ s_.bwd_data_algo = perf.bwd_data_algo; ++ s_.workspace_bytes = perf.memory; ++ + // The following block will be executed in case there has been no change in the shapes of the + // input and the filter compared to the previous run + if (!y_data) { diff --git a/machine-learning/patches/0002-target-gfx900-gfx1102.patch b/machine-learning/patches/0002-target-gfx900-gfx1102.patch new file mode 100644 index 0000000000..fab7a62d8e --- /dev/null +++ b/machine-learning/patches/0002-target-gfx900-gfx1102.patch @@ -0,0 +1,13 @@ +diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt +index d90a2a355..bb1a7de12 100644 +--- a/cmake/CMakeLists.txt ++++ b/cmake/CMakeLists.txt +@@ -295,7 +295,7 @@ if (onnxruntime_USE_ROCM) + endif() + + if (NOT CMAKE_HIP_ARCHITECTURES) +- set(CMAKE_HIP_ARCHITECTURES "gfx908;gfx90a;gfx1030;gfx1100;gfx1101;gfx940;gfx941;gfx942;gfx1200;gfx1201") ++ set(CMAKE_HIP_ARCHITECTURES "gfx900;gfx908;gfx90a;gfx1030;gfx1100;gfx1101;gfx1102;gfx940;gfx941;gfx942;gfx1200;gfx1201") + endif() + + file(GLOB rocm_cmake_components 
${onnxruntime_ROCM_HOME}/lib/cmake/*) diff --git a/machine-learning/pyproject.toml b/machine-learning/pyproject.toml index 140f727de3..a68cd993ba 100644 --- a/machine-learning/pyproject.toml +++ b/machine-learning/pyproject.toml @@ -52,6 +52,7 @@ cuda = ["onnxruntime-gpu>=1.17.0,<2"] openvino = ["onnxruntime-openvino>=1.17.1,<1.19.0"] armnn = ["onnxruntime>=1.15.0,<2"] rknn = ["onnxruntime>=1.15.0,<2", "rknn-toolkit-lite2>=2.3.0,<3"] +rocm = [] [tool.uv] compile-bytecode = true diff --git a/machine-learning/start.sh b/machine-learning/start.sh index d2f5b94dc3..859183851c 100755 --- a/machine-learning/start.sh +++ b/machine-learning/start.sh @@ -2,16 +2,19 @@ echo "Initializing Immich ML $IMMICH_SOURCE_REF" -lib_path="/usr/lib/$(arch)-linux-gnu/libmimalloc.so.2" -# mimalloc seems to increase memory usage dramatically with openvino, need to investigate if ! [ "$DEVICE" = "openvino" ]; then - export LD_PRELOAD="$lib_path" - export LD_BIND_NOW=1 : "${MACHINE_LEARNING_WORKER_TIMEOUT:=120}" else : "${MACHINE_LEARNING_WORKER_TIMEOUT:=300}" fi +# mimalloc seems to increase memory usage dramatically with openvino, need to investigate +if ! [ "$DEVICE" = "openvino" ] && ! [ "$DEVICE" = "rocm" ]; then + lib_path="/usr/lib/$(arch)-linux-gnu/libmimalloc.so.2" + export LD_PRELOAD="$lib_path" + export LD_BIND_NOW=1 +fi + : "${IMMICH_HOST:=[::]}" : "${IMMICH_PORT:=3003}" : "${MACHINE_LEARNING_WORKERS:=1}" diff --git a/machine-learning/uv.lock b/machine-learning/uv.lock index 32ac09c7c6..5d7bc31b45 100644 --- a/machine-learning/uv.lock +++ b/machine-learning/uv.lock @@ -1180,7 +1180,7 @@ requires-dist = [ { name = "tokenizers", specifier = ">=0.15.0,<1.0" }, { name = "uvicorn", extras = ["standard"], specifier = ">=0.22.0,<1.0" }, ] -provides-extras = ["cpu", "cuda", "openvino", "armnn", "rknn"] +provides-extras = ["cpu", "cuda", "openvino", "armnn", "rknn", "rocm"] [package.metadata.requires-dev] dev = [