feat(ml): rocm (#16613)

* feat(ml): introduce support of onnxruntime-rocm for AMD GPU * try mutex for algo cache use OrtMutex * bump versions, run on mich use 3.12 use 1.19.2 * acquire lock before any changes can be made guard algo benchmark results mark mutex as mutable re-add /bin/sh (?) use 3.10 use 6.1.2 * use composite cache key 1.19.2 fix variable name fix variable reference aaaaaaaaaaaaaaaaaaaa * bump deps * disable algo caching * fix gha * try ubuntu runner * actually fix the gha * update patch * skip mimalloc preload for rocm * increase build threads * increase timeout for rocm * Revert "increase timeout for rocm" This reverts commit 2c4452f5d132198ed381a7b262b4a5cab5114b5f. * attempt migraphx * set migraphx_home * Revert "set migraphx_home" This reverts commit c121d3e48754b3bce100636f8d666deec58a44b7. * Revert "attempt migraphx" This reverts commit 521f9fb72dbe506dc6cb8faeb6494817d87265c6. * migraphx, take two * bump rocm * allow cpu * try only targeting migraphx * skip tests * migraph ❌ * known issues * target gfx900 and gfx1102 * mention `HSA_USE_SVM` * update lock * set device id for rocm --------- Co-authored-by: Mehdi GHESH <mehdi.ghesh@hotmail.fr>
2026-02-04 08:49:01 +03:00 · 2025-03-17 17:08:19 -04:00
parent 6a40aa83b7
commit 2b37caba03
17 changed files with 340 additions and 50 deletions
--- a/machine-learning/Dockerfile
+++ b/machine-learning/Dockerfile
@@ -17,6 +17,34 @@ RUN mkdir /opt/armnn && \

 FROM builder-cpu AS builder-rknn

+# Warning: 25GiB+ disk space required to pull this image
+# TODO: find a way to reduce the image size
+FROM rocm/dev-ubuntu-22.04:6.3.4-complete AS builder-rocm
+
+WORKDIR /code
+
+RUN apt-get update && apt-get install -y --no-install-recommends wget git python3.10-venv
+RUN wget -nv https://github.com/Kitware/CMake/releases/download/v3.30.1/cmake-3.30.1-linux-x86_64.sh && \
+    chmod +x cmake-3.30.1-linux-x86_64.sh && \
+    mkdir -p /code/cmake-3.30.1-linux-x86_64 && \
+    ./cmake-3.30.1-linux-x86_64.sh --skip-license --prefix=/code/cmake-3.30.1-linux-x86_64 && \
+    rm cmake-3.30.1-linux-x86_64.sh
+
+ENV PATH=/code/cmake-3.30.1-linux-x86_64/bin:${PATH}
+
+RUN git clone --single-branch --branch v1.20.1 --recursive "https://github.com/Microsoft/onnxruntime" onnxruntime
+WORKDIR /code/onnxruntime
+# Fix for multi-threading based on comments in https://github.com/microsoft/onnxruntime/pull/19567
+# TODO: find a way to fix this without disabling algo caching
+COPY ./patches/* /tmp/
+RUN git apply /tmp/*.patch
+
+RUN /bin/sh ./dockerfiles/scripts/install_common_deps.sh
+# Note: the `parallel` setting uses a substantial amount of RAM
+RUN ./build.sh --allow_running_as_root --config Release --build_wheel --update --build --parallel 17 --cmake_extra_defines\
+    ONNXRUNTIME_VERSION=1.20.1 --skip_tests --use_rocm --rocm_home=/opt/rocm
+RUN mv /code/onnxruntime/build/Linux/Release/dist/*.whl /opt/
+
 FROM builder-${DEVICE} AS builder

 ARG DEVICE
@@ -32,6 +60,9 @@ RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=uv.lock,target=uv.lock \
    --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
    uv sync --frozen --extra ${DEVICE} --no-dev --no-editable --no-install-project --compile-bytecode --no-progress --active --link-mode copy
+RUN if [ "$DEVICE" = "rocm" ]; then \
+    uv pip install /opt/onnxruntime_rocm-*.whl; \
+    fi

 FROM python:3.11-slim-bookworm@sha256:614c8691ab74150465ec9123378cd4dde7a6e57be9e558c3108df40664667a4c AS prod-cpu

@@ -39,10 +70,10 @@ FROM prod-cpu AS prod-openvino

 RUN apt-get update && \
    apt-get install --no-install-recommends -yqq ocl-icd-libopencl1 wget && \
-    wget https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.17384.11/intel-igc-core_1.0.17384.11_amd64.deb && \
-    wget https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.17384.11/intel-igc-opencl_1.0.17384.11_amd64.deb && \
-    wget https://github.com/intel/compute-runtime/releases/download/24.31.30508.7/intel-opencl-icd_24.31.30508.7_amd64.deb && \
-    wget https://github.com/intel/compute-runtime/releases/download/24.31.30508.7/libigdgmm12_22.4.1_amd64.deb && \
+    wget -nv https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.17384.11/intel-igc-core_1.0.17384.11_amd64.deb && \
+    wget -nv https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.17384.11/intel-igc-opencl_1.0.17384.11_amd64.deb && \
+    wget -nv https://github.com/intel/compute-runtime/releases/download/24.31.30508.7/intel-opencl-icd_24.31.30508.7_amd64.deb && \
+    wget -nv https://github.com/intel/compute-runtime/releases/download/24.31.30508.7/libigdgmm12_22.4.1_amd64.deb && \
    dpkg -i *.deb && \
    rm *.deb && \
    apt-get remove wget -yqq && \
@@ -59,6 +90,8 @@ COPY --from=builder-cuda /usr/local/bin/python3 /usr/local/bin/python3
 COPY --from=builder-cuda /usr/local/lib/python3.11 /usr/local/lib/python3.11
 COPY --from=builder-cuda /usr/local/lib/libpython3.11.so /usr/local/lib/libpython3.11.so

+FROM rocm/dev-ubuntu-22.04:6.3.4-complete AS prod-rocm
+
 FROM prod-cpu AS prod-armnn

 ENV LD_LIBRARY_PATH=/opt/armnn
@@ -81,13 +114,12 @@ COPY --from=builder-armnn \

 FROM prod-cpu AS prod-rknn

-ADD --checksum=sha256:73993ed4b440460825f21611731564503cc1d5a0c123746477da6cd574f34885 https://github.com/airockchip/rknn-toolkit2/raw/refs/tags/v2.3.0/rknpu2/runtime/Linux/librknn_api/aarch64/librknnrt.so /usr/lib/
-
 FROM prod-${DEVICE} AS prod
+
 ARG DEVICE

 RUN apt-get update && \
-    apt-get install -y --no-install-recommends tini $(if ! [ "$DEVICE" = "openvino" ]; then echo "libmimalloc2.0"; fi) && \
+    apt-get install -y --no-install-recommends tini $(if ! [ "$DEVICE" = "openvino" ] && ! [ "$DEVICE" = "rocm" ]; then echo "libmimalloc2.0"; fi) && \
    apt-get autoremove -yqq && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
--- a/machine-learning/README.md
+++ b/machine-learning/README.md
@@ -7,7 +7,7 @@

 This project uses [uv](https://docs.astral.sh/uv/getting-started/installation/), so be sure to install it first.
 Running `uv sync --extra cpu` will install everything you need in an isolated virtual environment.
-CUDA and OpenVINO are supported as acceleration APIs. To use them, you can replace `--group cpu` with either of `--group cuda` or `--group openvino`. In the case of CUDA, a [compute capability](https://developer.nvidia.com/cuda-gpus) of 5.2 or higher is required.
+CUDA, ROCM and OpenVINO are supported as acceleration APIs. To use them, you can replace `--extra cpu` with either of `--extra cuda`, `--extra rocm` or `--extra openvino`. In the case of CUDA, a [compute capability](https://developer.nvidia.com/cuda-gpus) of 5.2 or higher is required.

 To add or remove dependencies, you can use the commands `uv add $PACKAGE_NAME` and `uv remove $PACKAGE_NAME`, respectively.
 Be sure to commit the `uv.lock` and `pyproject.toml` files with `uv lock` to reflect any changes in dependencies.
--- a/machine-learning/app/models/constants.py
+++ b/machine-learning/app/models/constants.py
@@ -75,7 +75,12 @@ _INSIGHTFACE_MODELS = {
 }


-SUPPORTED_PROVIDERS = ["CUDAExecutionProvider", "OpenVINOExecutionProvider", "CPUExecutionProvider"]
+SUPPORTED_PROVIDERS = [
+    "CUDAExecutionProvider",
+    "ROCMExecutionProvider",
+    "OpenVINOExecutionProvider",
+    "CPUExecutionProvider",
+]

 RKNN_SUPPORTED_SOCS = ["rk3566", "rk3568", "rk3576", "rk3588"]
 RKNN_COREMASK_SUPPORTED_SOCS = ["rk3576", "rk3588"]
--- a/machine-learning/app/sessions/ort.py
+++ b/machine-learning/app/sessions/ort.py
@@ -88,7 +88,7 @@ class OrtSession:
            match provider:
                case "CPUExecutionProvider":
                    options = {"arena_extend_strategy": "kSameAsRequested"}
-                case "CUDAExecutionProvider":
+                case "CUDAExecutionProvider" | "ROCMExecutionProvider":
                    options = {"arena_extend_strategy": "kSameAsRequested", "device_id": settings.device_id}
                case "OpenVINOExecutionProvider":
                    options = {
--- a/machine-learning/app/test_main.py
+++ b/machine-learning/app/test_main.py
@@ -180,6 +180,7 @@ class TestOrtSession:
    OV_EP = ["OpenVINOExecutionProvider", "CPUExecutionProvider"]
    CUDA_EP_OUT_OF_ORDER = ["CPUExecutionProvider", "CUDAExecutionProvider"]
    TRT_EP = ["TensorrtExecutionProvider", "CUDAExecutionProvider", "CPUExecutionProvider"]
+    ROCM_EP = ["ROCMExecutionProvider", "CPUExecutionProvider"]

    @pytest.mark.providers(CPU_EP)
    def test_sets_cpu_provider(self, providers: list[str]) -> None:
@@ -219,6 +220,12 @@ class TestOrtSession:

        assert session.providers == self.CUDA_EP

+    @pytest.mark.providers(ROCM_EP)
+    def test_uses_rocm(self, providers: list[str]) -> None:
+        session = OrtSession("ViT-B-32__openai")
+
+        assert session.providers == self.ROCM_EP
+
    def test_sets_provider_kwarg(self) -> None:
        providers = ["CUDAExecutionProvider"]
        session = OrtSession("ViT-B-32__openai", providers=providers)
@@ -235,19 +242,33 @@ class TestOrtSession:
            {"arena_extend_strategy": "kSameAsRequested"},
        ]

-    def test_sets_device_id_for_openvino(self) -> None:
+    def test_sets_provider_options_for_openvino(self) -> None:
+        model_path = "/cache/ViT-B-32__openai/textual/model.onnx"
        os.environ["MACHINE_LEARNING_DEVICE_ID"] = "1"

-        session = OrtSession("ViT-B-32__openai", providers=["OpenVINOExecutionProvider"])
+        session = OrtSession(model_path, providers=["OpenVINOExecutionProvider"])

-        assert session.provider_options[0]["device_type"] == "GPU.1"
+        assert session.provider_options == [
+            {
+                "device_type": "GPU.1",
+                "precision": "FP32",
+                "cache_dir": "/cache/ViT-B-32__openai/textual/openvino",
+            }
+        ]

-    def test_sets_device_id_for_cuda(self) -> None:
+    def test_sets_provider_options_for_cuda(self) -> None:
        os.environ["MACHINE_LEARNING_DEVICE_ID"] = "1"

        session = OrtSession("ViT-B-32__openai", providers=["CUDAExecutionProvider"])

-        assert session.provider_options[0]["device_id"] == "1"
+        assert session.provider_options == [{"arena_extend_strategy": "kSameAsRequested", "device_id": "1"}]
+
+    def test_sets_provider_options_for_rocm(self) -> None:
+        os.environ["MACHINE_LEARNING_DEVICE_ID"] = "1"
+
+        session = OrtSession("ViT-B-32__openai", providers=["ROCMExecutionProvider"])
+
+        assert session.provider_options == [{"arena_extend_strategy": "kSameAsRequested", "device_id": "1"}]

    def test_sets_provider_options_kwarg(self) -> None:
        session = OrtSession(
--- a/machine-learning/patches/0001-disable-rocm-conv-algo-caching.patch
+++ b/machine-learning/patches/0001-disable-rocm-conv-algo-caching.patch
@@ -0,0 +1,179 @@
+commit 16839b58d9b3c3162a67ce5d776b36d4d24e801f
+Author: mertalev <101130780+mertalev@users.noreply.github.com>
+Date:   Wed Mar 5 11:25:38 2025 -0500
+
+    disable algo caching (attributed to @dmnieto in https://github.com/microsoft/onnxruntime/pull/19567)
+
+diff --git a/onnxruntime/core/providers/rocm/nn/conv.cc b/onnxruntime/core/providers/rocm/nn/conv.cc
+index d7f47d07a8..4060a2af52 100644
+--- a/onnxruntime/core/providers/rocm/nn/conv.cc
+++ b/onnxruntime/core/providers/rocm/nn/conv.cc
+@@ -127,7 +127,6 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)
+
+     if (w_dims_changed) {
+       s_.last_w_dims = gsl::make_span(w_dims);
+-      s_.cached_benchmark_fwd_results.clear();
+     }
+
+     ORT_RETURN_IF_ERROR(conv_attrs_.ValidateInputShape(X->Shape(), W->Shape(), channels_last, channels_last));
+@@ -277,35 +276,6 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)
+       HIP_CALL_THROW(hipMalloc(&s_.b_zero, malloc_size));
+       HIP_CALL_THROW(hipMemsetAsync(s_.b_zero, 0, malloc_size, Stream(context)));
+     }
+-
+-    if (!s_.cached_benchmark_fwd_results.contains(x_dims_miopen)) {
+-      miopenConvAlgoPerf_t perf;
+-      int algo_count = 1;
+-      const ROCMExecutionProvider* rocm_ep = static_cast<const ROCMExecutionProvider*>(this->Info().GetExecutionProvider());
+-      static constexpr int num_algos = MIOPEN_CONVOLUTION_FWD_ALGO_COUNT;
+-      size_t max_ws_size = rocm_ep->GetMiopenConvUseMaxWorkspace() ? GetMaxWorkspaceSize(GetMiopenHandle(context), s_, kAllAlgos, num_algos, rocm_ep->GetDeviceId())
+-                                                                   : AlgoSearchWorkspaceSize;
+-      IAllocatorUniquePtr<void> algo_search_workspace = GetTransientScratchBuffer<void>(max_ws_size);
+-      MIOPEN_RETURN_IF_ERROR(miopenFindConvolutionForwardAlgorithm(
+-          GetMiopenHandle(context),
+-          s_.x_tensor,
+-          s_.x_data,
+-          s_.w_desc,
+-          s_.w_data,
+-          s_.conv_desc,
+-          s_.y_tensor,
+-          s_.y_data,
+-          1,            // requestedAlgoCount
+-          &algo_count,  // returnedAlgoCount
+-          &perf,
+-          algo_search_workspace.get(),
+-          max_ws_size,
+-          false));  // Do not do exhaustive algo search.
+-      s_.cached_benchmark_fwd_results.insert(x_dims_miopen, {perf.fwd_algo, perf.memory});
+-    }
+-    const auto& perf = s_.cached_benchmark_fwd_results.at(x_dims_miopen);
+-    s_.fwd_algo = perf.fwd_algo;
+-    s_.workspace_bytes = perf.memory;
+   } else {
+     // set Y
+     s_.Y = context->Output(0, TensorShape(s_.y_dims));
+@@ -319,6 +289,31 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)
+       s_.y_data = reinterpret_cast<HipT*>(s_.Y->MutableData<T>());
+     }
+   }
+
+  miopenConvAlgoPerf_t perf;
+  int algo_count = 1;
+  const ROCMExecutionProvider* rocm_ep = static_cast<const ROCMExecutionProvider*>(this->Info().GetExecutionProvider());
+  static constexpr int num_algos = MIOPEN_CONVOLUTION_FWD_ALGO_COUNT;
+  size_t max_ws_size = rocm_ep->GetMiopenConvUseMaxWorkspace() ? GetMaxWorkspaceSize(GetMiopenHandle(context), s_, kAllAlgos, num_algos, rocm_ep->GetDeviceId())
+                                                                : AlgoSearchWorkspaceSize;
+  IAllocatorUniquePtr<void> algo_search_workspace = GetTransientScratchBuffer<void>(max_ws_size);
+  MIOPEN_RETURN_IF_ERROR(miopenFindConvolutionForwardAlgorithm(
+      GetMiopenHandle(context),
+      s_.x_tensor,
+      s_.x_data,
+      s_.w_desc,
+      s_.w_data,
+      s_.conv_desc,
+      s_.y_tensor,
+      s_.y_data,
+      1,            // requestedAlgoCount
+      &algo_count,  // returnedAlgoCount
+      &perf,
+      algo_search_workspace.get(),
+      max_ws_size,
+      false));  // Do not do exhaustive algo search.
+  s_.fwd_algo = perf.fwd_algo;
+  s_.workspace_bytes = perf.memory;
+   return Status::OK();
+ }
+
+diff --git a/onnxruntime/core/providers/rocm/nn/conv.h b/onnxruntime/core/providers/rocm/nn/conv.h
+index bc9846203e..d54218f258 100644
+--- a/onnxruntime/core/providers/rocm/nn/conv.h
+++ b/onnxruntime/core/providers/rocm/nn/conv.h
+@@ -108,9 +108,6 @@ class lru_unordered_map {
+   list_type lru_list_;
+ };
+
+-// cached miopen descriptors
+-constexpr size_t MAX_CACHED_ALGO_PERF_RESULTS = 10000;
+-
+ template <typename AlgoPerfType>
+ struct MiopenConvState {
+   // if x/w dims changed, update algo and miopenTensors
+@@ -148,9 +145,6 @@ struct MiopenConvState {
+     decltype(AlgoPerfType().memory) memory;
+   };
+
+-  lru_unordered_map<TensorShapeVector, PerfFwdResultParams, vector_hash> cached_benchmark_fwd_results{MAX_CACHED_ALGO_PERF_RESULTS};
+-  lru_unordered_map<TensorShapeVector, PerfBwdResultParams, vector_hash> cached_benchmark_bwd_results{MAX_CACHED_ALGO_PERF_RESULTS};
+-
+   // Some properties needed to support asymmetric padded Conv nodes
+   bool post_slicing_required;
+   TensorShapeVector slice_starts;
+diff --git a/onnxruntime/core/providers/rocm/nn/conv_transpose.cc b/onnxruntime/core/providers/rocm/nn/conv_transpose.cc
+index 7447113fdf..a662e35b2e 100644
+--- a/onnxruntime/core/providers/rocm/nn/conv_transpose.cc
+++ b/onnxruntime/core/providers/rocm/nn/conv_transpose.cc
+@@ -76,7 +76,6 @@ Status ConvTranspose<T, NHWC>::DoConvTranspose(OpKernelContext* context, bool dy
+
+       if (w_dims_changed) {
+         s_.last_w_dims = gsl::make_span(w_dims);
+-        s_.cached_benchmark_bwd_results.clear();
+       }
+
+       ConvTransposeAttributes::Prepare p;
+@@ -126,35 +125,29 @@ Status ConvTranspose<T, NHWC>::DoConvTranspose(OpKernelContext* context, bool dy
+       }
+
+       y_data = reinterpret_cast<HipT*>(p.Y->MutableData<T>());
+-
+-      if (!s_.cached_benchmark_bwd_results.contains(x_dims)) {
+-        IAllocatorUniquePtr<void> algo_search_workspace = GetScratchBuffer<void>(AlgoSearchWorkspaceSize, context->GetComputeStream());
+-
+-        miopenConvAlgoPerf_t perf;
+-        int algo_count = 1;
+-        MIOPEN_RETURN_IF_ERROR(miopenFindConvolutionBackwardDataAlgorithm(
+-            GetMiopenHandle(context),
+-            s_.x_tensor,
+-            x_data,
+-            s_.w_desc,
+-            w_data,
+-            s_.conv_desc,
+-            s_.y_tensor,
+-            y_data,
+-            1,
+-            &algo_count,
+-            &perf,
+-            algo_search_workspace.get(),
+-            AlgoSearchWorkspaceSize,
+-            false));
+-        s_.cached_benchmark_bwd_results.insert(x_dims, {perf.bwd_data_algo, perf.memory});
+-      }
+-
+-      const auto& perf = s_.cached_benchmark_bwd_results.at(x_dims);
+-      s_.bwd_data_algo = perf.bwd_data_algo;
+-      s_.workspace_bytes = perf.memory;
+     }
+
+    IAllocatorUniquePtr<void> algo_search_workspace = GetScratchBuffer<void>(AlgoSearchWorkspaceSize, context->GetComputeStream());
+    miopenConvAlgoPerf_t perf;
+    int algo_count = 1;
+    MIOPEN_RETURN_IF_ERROR(miopenFindConvolutionBackwardDataAlgorithm(
+        GetMiopenHandle(context),
+        s_.x_tensor,
+        x_data,
+        s_.w_desc,
+        w_data,
+        s_.conv_desc,
+        s_.y_tensor,
+        y_data,
+        1,
+        &algo_count,
+        &perf,
+        algo_search_workspace.get(),
+        AlgoSearchWorkspaceSize,
+        false));
+    s_.bwd_data_algo = perf.bwd_data_algo;
+    s_.workspace_bytes = perf.memory;
+
+     // The following block will be executed in case there has been no change in the shapes of the
+     // input and the filter compared to the previous run
+     if (!y_data) {
--- a/machine-learning/patches/0002-target-gfx900-gfx1102.patch
+++ b/machine-learning/patches/0002-target-gfx900-gfx1102.patch
@@ -0,0 +1,13 @@
+diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
+index d90a2a355..bb1a7de12 100644
+--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
+@@ -295,7 +295,7 @@ if (onnxruntime_USE_ROCM)
+   endif()
+
+   if (NOT CMAKE_HIP_ARCHITECTURES)
+-    set(CMAKE_HIP_ARCHITECTURES "gfx908;gfx90a;gfx1030;gfx1100;gfx1101;gfx940;gfx941;gfx942;gfx1200;gfx1201")
+    set(CMAKE_HIP_ARCHITECTURES "gfx900;gfx908;gfx90a;gfx1030;gfx1100;gfx1101;gfx1102;gfx940;gfx941;gfx942;gfx1200;gfx1201")
+   endif()
+
+   file(GLOB rocm_cmake_components ${onnxruntime_ROCM_HOME}/lib/cmake/*)
--- a/machine-learning/pyproject.toml
+++ b/machine-learning/pyproject.toml
@@ -52,6 +52,7 @@ cuda = ["onnxruntime-gpu>=1.17.0,<2"]
 openvino = ["onnxruntime-openvino>=1.17.1,<1.19.0"]
 armnn = ["onnxruntime>=1.15.0,<2"]
 rknn = ["onnxruntime>=1.15.0,<2", "rknn-toolkit-lite2>=2.3.0,<3"]
+rocm = []

 [tool.uv]
 compile-bytecode = true
--- a/machine-learning/start.sh
+++ b/machine-learning/start.sh
@@ -2,16 +2,19 @@

 echo "Initializing Immich ML $IMMICH_SOURCE_REF"

-lib_path="/usr/lib/$(arch)-linux-gnu/libmimalloc.so.2"
-# mimalloc seems to increase memory usage dramatically with openvino, need to investigate
 if ! [ "$DEVICE" = "openvino" ]; then
-	export LD_PRELOAD="$lib_path"
-	export LD_BIND_NOW=1
 	: "${MACHINE_LEARNING_WORKER_TIMEOUT:=120}"
 else
 	: "${MACHINE_LEARNING_WORKER_TIMEOUT:=300}"
 fi

+# mimalloc seems to increase memory usage dramatically with openvino, need to investigate
+if ! [ "$DEVICE" = "openvino" ] && ! [ "$DEVICE" = "rocm" ]; then
+	lib_path="/usr/lib/$(arch)-linux-gnu/libmimalloc.so.2"
+	export LD_PRELOAD="$lib_path"
+	export LD_BIND_NOW=1
+fi
+
 : "${IMMICH_HOST:=[::]}"
 : "${IMMICH_PORT:=3003}"
 : "${MACHINE_LEARNING_WORKERS:=1}"
--- a/machine-learning/uv.lock
+++ b/machine-learning/uv.lock
@@ -1180,7 +1180,7 @@ requires-dist = [
    { name = "tokenizers", specifier = ">=0.15.0,<1.0" },
    { name = "uvicorn", extras = ["standard"], specifier = ">=0.22.0,<1.0" },
 ]
-provides-extras = ["cpu", "cuda", "openvino", "armnn", "rknn"]
+provides-extras = ["cpu", "cuda", "openvino", "armnn", "rknn", "rocm"]

 [package.metadata.requires-dev]
 dev = [