feat(ml): rocm (#16613)

* feat(ml): introduce support of onnxruntime-rocm for AMD GPU

* try mutex for algo cache

use OrtMutex

* bump versions, run on mich

use 3.12

use 1.19.2

* acquire lock before any changes can be made

guard algo benchmark results

mark mutex as mutable

re-add /bin/sh (?)

use 3.10

use 6.1.2

* use composite cache key

1.19.2

fix variable name

fix variable reference

aaaaaaaaaaaaaaaaaaaa

* bump deps

* disable algo caching

* fix gha

* try ubuntu runner

* actually fix the gha

* update patch

* skip mimalloc preload for rocm

* increase build threads

* increase timeout for rocm

* Revert "increase timeout for rocm"

This reverts commit 2c4452f5d132198ed381a7b262b4a5cab5114b5f.

* attempt migraphx

* set migraphx_home

* Revert "set migraphx_home"

This reverts commit c121d3e48754b3bce100636f8d666deec58a44b7.

* Revert "attempt migraphx"

This reverts commit 521f9fb72dbe506dc6cb8faeb6494817d87265c6.

* migraphx, take two

* bump rocm

* allow cpu

* try only targeting migraphx

* skip tests

* migraph 

* known issues

* target gfx900 and gfx1102

* mention `HSA_USE_SVM`

* update lock

* set device id for rocm

---------

Co-authored-by: Mehdi GHESH <mehdi.ghesh@hotmail.fr>
This commit is contained in:
Mert
2025-03-17 17:08:19 -04:00
committed by GitHub
parent 6a40aa83b7
commit 2b37caba03
17 changed files with 340 additions and 50 deletions

View File

@@ -17,6 +17,34 @@ RUN mkdir /opt/armnn && \
FROM builder-cpu AS builder-rknn
# Warning: 25GiB+ disk space required to pull this image
# TODO: find a way to reduce the image size
FROM rocm/dev-ubuntu-22.04:6.3.4-complete AS builder-rocm
WORKDIR /code
RUN apt-get update && apt-get install -y --no-install-recommends wget git python3.10-venv
RUN wget -nv https://github.com/Kitware/CMake/releases/download/v3.30.1/cmake-3.30.1-linux-x86_64.sh && \
chmod +x cmake-3.30.1-linux-x86_64.sh && \
mkdir -p /code/cmake-3.30.1-linux-x86_64 && \
./cmake-3.30.1-linux-x86_64.sh --skip-license --prefix=/code/cmake-3.30.1-linux-x86_64 && \
rm cmake-3.30.1-linux-x86_64.sh
ENV PATH=/code/cmake-3.30.1-linux-x86_64/bin:${PATH}
RUN git clone --single-branch --branch v1.20.1 --recursive "https://github.com/Microsoft/onnxruntime" onnxruntime
WORKDIR /code/onnxruntime
# Fix for multi-threading based on comments in https://github.com/microsoft/onnxruntime/pull/19567
# TODO: find a way to fix this without disabling algo caching
COPY ./patches/* /tmp/
RUN git apply /tmp/*.patch
RUN /bin/sh ./dockerfiles/scripts/install_common_deps.sh
# Note: the `parallel` setting uses a substantial amount of RAM
RUN ./build.sh --allow_running_as_root --config Release --build_wheel --update --build --parallel 17 --cmake_extra_defines\
ONNXRUNTIME_VERSION=1.20.1 --skip_tests --use_rocm --rocm_home=/opt/rocm
RUN mv /code/onnxruntime/build/Linux/Release/dist/*.whl /opt/
FROM builder-${DEVICE} AS builder
ARG DEVICE
@@ -32,6 +60,9 @@ RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=uv.lock,target=uv.lock \
--mount=type=bind,source=pyproject.toml,target=pyproject.toml \
uv sync --frozen --extra ${DEVICE} --no-dev --no-editable --no-install-project --compile-bytecode --no-progress --active --link-mode copy
RUN if [ "$DEVICE" = "rocm" ]; then \
uv pip install /opt/onnxruntime_rocm-*.whl; \
fi
FROM python:3.11-slim-bookworm@sha256:614c8691ab74150465ec9123378cd4dde7a6e57be9e558c3108df40664667a4c AS prod-cpu
@@ -39,10 +70,10 @@ FROM prod-cpu AS prod-openvino
RUN apt-get update && \
apt-get install --no-install-recommends -yqq ocl-icd-libopencl1 wget && \
wget https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.17384.11/intel-igc-core_1.0.17384.11_amd64.deb && \
wget https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.17384.11/intel-igc-opencl_1.0.17384.11_amd64.deb && \
wget https://github.com/intel/compute-runtime/releases/download/24.31.30508.7/intel-opencl-icd_24.31.30508.7_amd64.deb && \
wget https://github.com/intel/compute-runtime/releases/download/24.31.30508.7/libigdgmm12_22.4.1_amd64.deb && \
wget -nv https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.17384.11/intel-igc-core_1.0.17384.11_amd64.deb && \
wget -nv https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.17384.11/intel-igc-opencl_1.0.17384.11_amd64.deb && \
wget -nv https://github.com/intel/compute-runtime/releases/download/24.31.30508.7/intel-opencl-icd_24.31.30508.7_amd64.deb && \
wget -nv https://github.com/intel/compute-runtime/releases/download/24.31.30508.7/libigdgmm12_22.4.1_amd64.deb && \
dpkg -i *.deb && \
rm *.deb && \
apt-get remove wget -yqq && \
@@ -59,6 +90,8 @@ COPY --from=builder-cuda /usr/local/bin/python3 /usr/local/bin/python3
COPY --from=builder-cuda /usr/local/lib/python3.11 /usr/local/lib/python3.11
COPY --from=builder-cuda /usr/local/lib/libpython3.11.so /usr/local/lib/libpython3.11.so
FROM rocm/dev-ubuntu-22.04:6.3.4-complete AS prod-rocm
FROM prod-cpu AS prod-armnn
ENV LD_LIBRARY_PATH=/opt/armnn
@@ -81,13 +114,12 @@ COPY --from=builder-armnn \
FROM prod-cpu AS prod-rknn
ADD --checksum=sha256:73993ed4b440460825f21611731564503cc1d5a0c123746477da6cd574f34885 https://github.com/airockchip/rknn-toolkit2/raw/refs/tags/v2.3.0/rknpu2/runtime/Linux/librknn_api/aarch64/librknnrt.so /usr/lib/
FROM prod-${DEVICE} AS prod
ARG DEVICE
RUN apt-get update && \
apt-get install -y --no-install-recommends tini $(if ! [ "$DEVICE" = "openvino" ]; then echo "libmimalloc2.0"; fi) && \
apt-get install -y --no-install-recommends tini $(if ! [ "$DEVICE" = "openvino" ] && ! [ "$DEVICE" = "rocm" ]; then echo "libmimalloc2.0"; fi) && \
apt-get autoremove -yqq && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

View File

@@ -7,7 +7,7 @@
This project uses [uv](https://docs.astral.sh/uv/getting-started/installation/), so be sure to install it first.
Running `uv sync --extra cpu` will install everything you need in an isolated virtual environment.
CUDA and OpenVINO are supported as acceleration APIs. To use them, you can replace `--group cpu` with either of `--group cuda` or `--group openvino`. In the case of CUDA, a [compute capability](https://developer.nvidia.com/cuda-gpus) of 5.2 or higher is required.
CUDA, ROCM and OpenVINO are supported as acceleration APIs. To use them, you can replace `--extra cpu` with either of `--extra cuda`, `--extra rocm` or `--extra openvino`. In the case of CUDA, a [compute capability](https://developer.nvidia.com/cuda-gpus) of 5.2 or higher is required.
To add or remove dependencies, you can use the commands `uv add $PACKAGE_NAME` and `uv remove $PACKAGE_NAME`, respectively.
Be sure to commit the `uv.lock` and `pyproject.toml` files with `uv lock` to reflect any changes in dependencies.

View File

@@ -75,7 +75,12 @@ _INSIGHTFACE_MODELS = {
}
SUPPORTED_PROVIDERS = ["CUDAExecutionProvider", "OpenVINOExecutionProvider", "CPUExecutionProvider"]
SUPPORTED_PROVIDERS = [
"CUDAExecutionProvider",
"ROCMExecutionProvider",
"OpenVINOExecutionProvider",
"CPUExecutionProvider",
]
RKNN_SUPPORTED_SOCS = ["rk3566", "rk3568", "rk3576", "rk3588"]
RKNN_COREMASK_SUPPORTED_SOCS = ["rk3576", "rk3588"]

View File

@@ -88,7 +88,7 @@ class OrtSession:
match provider:
case "CPUExecutionProvider":
options = {"arena_extend_strategy": "kSameAsRequested"}
case "CUDAExecutionProvider":
case "CUDAExecutionProvider" | "ROCMExecutionProvider":
options = {"arena_extend_strategy": "kSameAsRequested", "device_id": settings.device_id}
case "OpenVINOExecutionProvider":
options = {

View File

@@ -180,6 +180,7 @@ class TestOrtSession:
OV_EP = ["OpenVINOExecutionProvider", "CPUExecutionProvider"]
CUDA_EP_OUT_OF_ORDER = ["CPUExecutionProvider", "CUDAExecutionProvider"]
TRT_EP = ["TensorrtExecutionProvider", "CUDAExecutionProvider", "CPUExecutionProvider"]
ROCM_EP = ["ROCMExecutionProvider", "CPUExecutionProvider"]
@pytest.mark.providers(CPU_EP)
def test_sets_cpu_provider(self, providers: list[str]) -> None:
@@ -219,6 +220,12 @@ class TestOrtSession:
assert session.providers == self.CUDA_EP
@pytest.mark.providers(ROCM_EP)
def test_uses_rocm(self, providers: list[str]) -> None:
session = OrtSession("ViT-B-32__openai")
assert session.providers == self.ROCM_EP
def test_sets_provider_kwarg(self) -> None:
providers = ["CUDAExecutionProvider"]
session = OrtSession("ViT-B-32__openai", providers=providers)
@@ -235,19 +242,33 @@ class TestOrtSession:
{"arena_extend_strategy": "kSameAsRequested"},
]
def test_sets_device_id_for_openvino(self) -> None:
def test_sets_provider_options_for_openvino(self) -> None:
model_path = "/cache/ViT-B-32__openai/textual/model.onnx"
os.environ["MACHINE_LEARNING_DEVICE_ID"] = "1"
session = OrtSession("ViT-B-32__openai", providers=["OpenVINOExecutionProvider"])
session = OrtSession(model_path, providers=["OpenVINOExecutionProvider"])
assert session.provider_options[0]["device_type"] == "GPU.1"
assert session.provider_options == [
{
"device_type": "GPU.1",
"precision": "FP32",
"cache_dir": "/cache/ViT-B-32__openai/textual/openvino",
}
]
def test_sets_device_id_for_cuda(self) -> None:
def test_sets_provider_options_for_cuda(self) -> None:
os.environ["MACHINE_LEARNING_DEVICE_ID"] = "1"
session = OrtSession("ViT-B-32__openai", providers=["CUDAExecutionProvider"])
assert session.provider_options[0]["device_id"] == "1"
assert session.provider_options == [{"arena_extend_strategy": "kSameAsRequested", "device_id": "1"}]
def test_sets_provider_options_for_rocm(self) -> None:
os.environ["MACHINE_LEARNING_DEVICE_ID"] = "1"
session = OrtSession("ViT-B-32__openai", providers=["ROCMExecutionProvider"])
assert session.provider_options == [{"arena_extend_strategy": "kSameAsRequested", "device_id": "1"}]
def test_sets_provider_options_kwarg(self) -> None:
session = OrtSession(

View File

@@ -0,0 +1,179 @@
commit 16839b58d9b3c3162a67ce5d776b36d4d24e801f
Author: mertalev <101130780+mertalev@users.noreply.github.com>
Date: Wed Mar 5 11:25:38 2025 -0500
disable algo caching (attributed to @dmnieto in https://github.com/microsoft/onnxruntime/pull/19567)
diff --git a/onnxruntime/core/providers/rocm/nn/conv.cc b/onnxruntime/core/providers/rocm/nn/conv.cc
index d7f47d07a8..4060a2af52 100644
--- a/onnxruntime/core/providers/rocm/nn/conv.cc
+++ b/onnxruntime/core/providers/rocm/nn/conv.cc
@@ -127,7 +127,6 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)
if (w_dims_changed) {
s_.last_w_dims = gsl::make_span(w_dims);
- s_.cached_benchmark_fwd_results.clear();
}
ORT_RETURN_IF_ERROR(conv_attrs_.ValidateInputShape(X->Shape(), W->Shape(), channels_last, channels_last));
@@ -277,35 +276,6 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)
HIP_CALL_THROW(hipMalloc(&s_.b_zero, malloc_size));
HIP_CALL_THROW(hipMemsetAsync(s_.b_zero, 0, malloc_size, Stream(context)));
}
-
- if (!s_.cached_benchmark_fwd_results.contains(x_dims_miopen)) {
- miopenConvAlgoPerf_t perf;
- int algo_count = 1;
- const ROCMExecutionProvider* rocm_ep = static_cast<const ROCMExecutionProvider*>(this->Info().GetExecutionProvider());
- static constexpr int num_algos = MIOPEN_CONVOLUTION_FWD_ALGO_COUNT;
- size_t max_ws_size = rocm_ep->GetMiopenConvUseMaxWorkspace() ? GetMaxWorkspaceSize(GetMiopenHandle(context), s_, kAllAlgos, num_algos, rocm_ep->GetDeviceId())
- : AlgoSearchWorkspaceSize;
- IAllocatorUniquePtr<void> algo_search_workspace = GetTransientScratchBuffer<void>(max_ws_size);
- MIOPEN_RETURN_IF_ERROR(miopenFindConvolutionForwardAlgorithm(
- GetMiopenHandle(context),
- s_.x_tensor,
- s_.x_data,
- s_.w_desc,
- s_.w_data,
- s_.conv_desc,
- s_.y_tensor,
- s_.y_data,
- 1, // requestedAlgoCount
- &algo_count, // returnedAlgoCount
- &perf,
- algo_search_workspace.get(),
- max_ws_size,
- false)); // Do not do exhaustive algo search.
- s_.cached_benchmark_fwd_results.insert(x_dims_miopen, {perf.fwd_algo, perf.memory});
- }
- const auto& perf = s_.cached_benchmark_fwd_results.at(x_dims_miopen);
- s_.fwd_algo = perf.fwd_algo;
- s_.workspace_bytes = perf.memory;
} else {
// set Y
s_.Y = context->Output(0, TensorShape(s_.y_dims));
@@ -319,6 +289,31 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)
s_.y_data = reinterpret_cast<HipT*>(s_.Y->MutableData<T>());
}
}
+
+ miopenConvAlgoPerf_t perf;
+ int algo_count = 1;
+ const ROCMExecutionProvider* rocm_ep = static_cast<const ROCMExecutionProvider*>(this->Info().GetExecutionProvider());
+ static constexpr int num_algos = MIOPEN_CONVOLUTION_FWD_ALGO_COUNT;
+ size_t max_ws_size = rocm_ep->GetMiopenConvUseMaxWorkspace() ? GetMaxWorkspaceSize(GetMiopenHandle(context), s_, kAllAlgos, num_algos, rocm_ep->GetDeviceId())
+ : AlgoSearchWorkspaceSize;
+ IAllocatorUniquePtr<void> algo_search_workspace = GetTransientScratchBuffer<void>(max_ws_size);
+ MIOPEN_RETURN_IF_ERROR(miopenFindConvolutionForwardAlgorithm(
+ GetMiopenHandle(context),
+ s_.x_tensor,
+ s_.x_data,
+ s_.w_desc,
+ s_.w_data,
+ s_.conv_desc,
+ s_.y_tensor,
+ s_.y_data,
+ 1, // requestedAlgoCount
+ &algo_count, // returnedAlgoCount
+ &perf,
+ algo_search_workspace.get(),
+ max_ws_size,
+ false)); // Do not do exhaustive algo search.
+ s_.fwd_algo = perf.fwd_algo;
+ s_.workspace_bytes = perf.memory;
return Status::OK();
}
diff --git a/onnxruntime/core/providers/rocm/nn/conv.h b/onnxruntime/core/providers/rocm/nn/conv.h
index bc9846203e..d54218f258 100644
--- a/onnxruntime/core/providers/rocm/nn/conv.h
+++ b/onnxruntime/core/providers/rocm/nn/conv.h
@@ -108,9 +108,6 @@ class lru_unordered_map {
list_type lru_list_;
};
-// cached miopen descriptors
-constexpr size_t MAX_CACHED_ALGO_PERF_RESULTS = 10000;
-
template <typename AlgoPerfType>
struct MiopenConvState {
// if x/w dims changed, update algo and miopenTensors
@@ -148,9 +145,6 @@ struct MiopenConvState {
decltype(AlgoPerfType().memory) memory;
};
- lru_unordered_map<TensorShapeVector, PerfFwdResultParams, vector_hash> cached_benchmark_fwd_results{MAX_CACHED_ALGO_PERF_RESULTS};
- lru_unordered_map<TensorShapeVector, PerfBwdResultParams, vector_hash> cached_benchmark_bwd_results{MAX_CACHED_ALGO_PERF_RESULTS};
-
// Some properties needed to support asymmetric padded Conv nodes
bool post_slicing_required;
TensorShapeVector slice_starts;
diff --git a/onnxruntime/core/providers/rocm/nn/conv_transpose.cc b/onnxruntime/core/providers/rocm/nn/conv_transpose.cc
index 7447113fdf..a662e35b2e 100644
--- a/onnxruntime/core/providers/rocm/nn/conv_transpose.cc
+++ b/onnxruntime/core/providers/rocm/nn/conv_transpose.cc
@@ -76,7 +76,6 @@ Status ConvTranspose<T, NHWC>::DoConvTranspose(OpKernelContext* context, bool dy
if (w_dims_changed) {
s_.last_w_dims = gsl::make_span(w_dims);
- s_.cached_benchmark_bwd_results.clear();
}
ConvTransposeAttributes::Prepare p;
@@ -126,35 +125,29 @@ Status ConvTranspose<T, NHWC>::DoConvTranspose(OpKernelContext* context, bool dy
}
y_data = reinterpret_cast<HipT*>(p.Y->MutableData<T>());
-
- if (!s_.cached_benchmark_bwd_results.contains(x_dims)) {
- IAllocatorUniquePtr<void> algo_search_workspace = GetScratchBuffer<void>(AlgoSearchWorkspaceSize, context->GetComputeStream());
-
- miopenConvAlgoPerf_t perf;
- int algo_count = 1;
- MIOPEN_RETURN_IF_ERROR(miopenFindConvolutionBackwardDataAlgorithm(
- GetMiopenHandle(context),
- s_.x_tensor,
- x_data,
- s_.w_desc,
- w_data,
- s_.conv_desc,
- s_.y_tensor,
- y_data,
- 1,
- &algo_count,
- &perf,
- algo_search_workspace.get(),
- AlgoSearchWorkspaceSize,
- false));
- s_.cached_benchmark_bwd_results.insert(x_dims, {perf.bwd_data_algo, perf.memory});
- }
-
- const auto& perf = s_.cached_benchmark_bwd_results.at(x_dims);
- s_.bwd_data_algo = perf.bwd_data_algo;
- s_.workspace_bytes = perf.memory;
}
+ IAllocatorUniquePtr<void> algo_search_workspace = GetScratchBuffer<void>(AlgoSearchWorkspaceSize, context->GetComputeStream());
+ miopenConvAlgoPerf_t perf;
+ int algo_count = 1;
+ MIOPEN_RETURN_IF_ERROR(miopenFindConvolutionBackwardDataAlgorithm(
+ GetMiopenHandle(context),
+ s_.x_tensor,
+ x_data,
+ s_.w_desc,
+ w_data,
+ s_.conv_desc,
+ s_.y_tensor,
+ y_data,
+ 1,
+ &algo_count,
+ &perf,
+ algo_search_workspace.get(),
+ AlgoSearchWorkspaceSize,
+ false));
+ s_.bwd_data_algo = perf.bwd_data_algo;
+ s_.workspace_bytes = perf.memory;
+
// The following block will be executed in case there has been no change in the shapes of the
// input and the filter compared to the previous run
if (!y_data) {

View File

@@ -0,0 +1,13 @@
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index d90a2a355..bb1a7de12 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -295,7 +295,7 @@ if (onnxruntime_USE_ROCM)
endif()
if (NOT CMAKE_HIP_ARCHITECTURES)
- set(CMAKE_HIP_ARCHITECTURES "gfx908;gfx90a;gfx1030;gfx1100;gfx1101;gfx940;gfx941;gfx942;gfx1200;gfx1201")
+ set(CMAKE_HIP_ARCHITECTURES "gfx900;gfx908;gfx90a;gfx1030;gfx1100;gfx1101;gfx1102;gfx940;gfx941;gfx942;gfx1200;gfx1201")
endif()
file(GLOB rocm_cmake_components ${onnxruntime_ROCM_HOME}/lib/cmake/*)

View File

@@ -52,6 +52,7 @@ cuda = ["onnxruntime-gpu>=1.17.0,<2"]
openvino = ["onnxruntime-openvino>=1.17.1,<1.19.0"]
armnn = ["onnxruntime>=1.15.0,<2"]
rknn = ["onnxruntime>=1.15.0,<2", "rknn-toolkit-lite2>=2.3.0,<3"]
rocm = []
[tool.uv]
compile-bytecode = true

View File

@@ -2,16 +2,19 @@
echo "Initializing Immich ML $IMMICH_SOURCE_REF"
lib_path="/usr/lib/$(arch)-linux-gnu/libmimalloc.so.2"
# mimalloc seems to increase memory usage dramatically with openvino, need to investigate
if ! [ "$DEVICE" = "openvino" ]; then
export LD_PRELOAD="$lib_path"
export LD_BIND_NOW=1
: "${MACHINE_LEARNING_WORKER_TIMEOUT:=120}"
else
: "${MACHINE_LEARNING_WORKER_TIMEOUT:=300}"
fi
# mimalloc seems to increase memory usage dramatically with openvino, need to investigate
if ! [ "$DEVICE" = "openvino" ] && ! [ "$DEVICE" = "rocm" ]; then
lib_path="/usr/lib/$(arch)-linux-gnu/libmimalloc.so.2"
export LD_PRELOAD="$lib_path"
export LD_BIND_NOW=1
fi
: "${IMMICH_HOST:=[::]}"
: "${IMMICH_PORT:=3003}"
: "${MACHINE_LEARNING_WORKERS:=1}"

View File

@@ -1180,7 +1180,7 @@ requires-dist = [
{ name = "tokenizers", specifier = ">=0.15.0,<1.0" },
{ name = "uvicorn", extras = ["standard"], specifier = ">=0.22.0,<1.0" },
]
provides-extras = ["cpu", "cuda", "openvino", "armnn", "rknn"]
provides-extras = ["cpu", "cuda", "openvino", "armnn", "rknn", "rocm"]
[package.metadata.requires-dev]
dev = [