mirror of
https://github.com/immich-app/immich.git
synced 2026-02-04 08:49:01 +03:00
feat(ml): rocm (#16613)
* feat(ml): introduce support of onnxruntime-rocm for AMD GPU * try mutex for algo cache use OrtMutex * bump versions, run on mich use 3.12 use 1.19.2 * acquire lock before any changes can be made guard algo benchmark results mark mutex as mutable re-add /bin/sh (?) use 3.10 use 6.1.2 * use composite cache key 1.19.2 fix variable name fix variable reference aaaaaaaaaaaaaaaaaaaa * bump deps * disable algo caching * fix gha * try ubuntu runner * actually fix the gha * update patch * skip mimalloc preload for rocm * increase build threads * increase timeout for rocm * Revert "increase timeout for rocm" This reverts commit 2c4452f5d132198ed381a7b262b4a5cab5114b5f. * attempt migraphx * set migraphx_home * Revert "set migraphx_home" This reverts commit c121d3e48754b3bce100636f8d666deec58a44b7. * Revert "attempt migraphx" This reverts commit 521f9fb72dbe506dc6cb8faeb6494817d87265c6. * migraphx, take two * bump rocm * allow cpu * try only targeting migraphx * skip tests * migraph ❌ * known issues * target gfx900 and gfx1102 * mention `HSA_USE_SVM` * update lock * set device id for rocm --------- Co-authored-by: Mehdi GHESH <mehdi.ghesh@hotmail.fr>
This commit is contained in:
@@ -17,6 +17,34 @@ RUN mkdir /opt/armnn && \
|
||||
|
||||
FROM builder-cpu AS builder-rknn
|
||||
|
||||
# Warning: 25GiB+ disk space required to pull this image
|
||||
# TODO: find a way to reduce the image size
|
||||
FROM rocm/dev-ubuntu-22.04:6.3.4-complete AS builder-rocm
|
||||
|
||||
WORKDIR /code
|
||||
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends wget git python3.10-venv
|
||||
RUN wget -nv https://github.com/Kitware/CMake/releases/download/v3.30.1/cmake-3.30.1-linux-x86_64.sh && \
|
||||
chmod +x cmake-3.30.1-linux-x86_64.sh && \
|
||||
mkdir -p /code/cmake-3.30.1-linux-x86_64 && \
|
||||
./cmake-3.30.1-linux-x86_64.sh --skip-license --prefix=/code/cmake-3.30.1-linux-x86_64 && \
|
||||
rm cmake-3.30.1-linux-x86_64.sh
|
||||
|
||||
ENV PATH=/code/cmake-3.30.1-linux-x86_64/bin:${PATH}
|
||||
|
||||
RUN git clone --single-branch --branch v1.20.1 --recursive "https://github.com/Microsoft/onnxruntime" onnxruntime
|
||||
WORKDIR /code/onnxruntime
|
||||
# Fix for multi-threading based on comments in https://github.com/microsoft/onnxruntime/pull/19567
|
||||
# TODO: find a way to fix this without disabling algo caching
|
||||
COPY ./patches/* /tmp/
|
||||
RUN git apply /tmp/*.patch
|
||||
|
||||
RUN /bin/sh ./dockerfiles/scripts/install_common_deps.sh
|
||||
# Note: the `parallel` setting uses a substantial amount of RAM
|
||||
RUN ./build.sh --allow_running_as_root --config Release --build_wheel --update --build --parallel 17 --cmake_extra_defines\
|
||||
ONNXRUNTIME_VERSION=1.20.1 --skip_tests --use_rocm --rocm_home=/opt/rocm
|
||||
RUN mv /code/onnxruntime/build/Linux/Release/dist/*.whl /opt/
|
||||
|
||||
FROM builder-${DEVICE} AS builder
|
||||
|
||||
ARG DEVICE
|
||||
@@ -32,6 +60,9 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=bind,source=uv.lock,target=uv.lock \
|
||||
--mount=type=bind,source=pyproject.toml,target=pyproject.toml \
|
||||
uv sync --frozen --extra ${DEVICE} --no-dev --no-editable --no-install-project --compile-bytecode --no-progress --active --link-mode copy
|
||||
RUN if [ "$DEVICE" = "rocm" ]; then \
|
||||
uv pip install /opt/onnxruntime_rocm-*.whl; \
|
||||
fi
|
||||
|
||||
FROM python:3.11-slim-bookworm@sha256:614c8691ab74150465ec9123378cd4dde7a6e57be9e558c3108df40664667a4c AS prod-cpu
|
||||
|
||||
@@ -39,10 +70,10 @@ FROM prod-cpu AS prod-openvino
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install --no-install-recommends -yqq ocl-icd-libopencl1 wget && \
|
||||
wget https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.17384.11/intel-igc-core_1.0.17384.11_amd64.deb && \
|
||||
wget https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.17384.11/intel-igc-opencl_1.0.17384.11_amd64.deb && \
|
||||
wget https://github.com/intel/compute-runtime/releases/download/24.31.30508.7/intel-opencl-icd_24.31.30508.7_amd64.deb && \
|
||||
wget https://github.com/intel/compute-runtime/releases/download/24.31.30508.7/libigdgmm12_22.4.1_amd64.deb && \
|
||||
wget -nv https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.17384.11/intel-igc-core_1.0.17384.11_amd64.deb && \
|
||||
wget -nv https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.17384.11/intel-igc-opencl_1.0.17384.11_amd64.deb && \
|
||||
wget -nv https://github.com/intel/compute-runtime/releases/download/24.31.30508.7/intel-opencl-icd_24.31.30508.7_amd64.deb && \
|
||||
wget -nv https://github.com/intel/compute-runtime/releases/download/24.31.30508.7/libigdgmm12_22.4.1_amd64.deb && \
|
||||
dpkg -i *.deb && \
|
||||
rm *.deb && \
|
||||
apt-get remove wget -yqq && \
|
||||
@@ -59,6 +90,8 @@ COPY --from=builder-cuda /usr/local/bin/python3 /usr/local/bin/python3
|
||||
COPY --from=builder-cuda /usr/local/lib/python3.11 /usr/local/lib/python3.11
|
||||
COPY --from=builder-cuda /usr/local/lib/libpython3.11.so /usr/local/lib/libpython3.11.so
|
||||
|
||||
FROM rocm/dev-ubuntu-22.04:6.3.4-complete AS prod-rocm
|
||||
|
||||
FROM prod-cpu AS prod-armnn
|
||||
|
||||
ENV LD_LIBRARY_PATH=/opt/armnn
|
||||
@@ -81,13 +114,12 @@ COPY --from=builder-armnn \
|
||||
|
||||
FROM prod-cpu AS prod-rknn
|
||||
|
||||
ADD --checksum=sha256:73993ed4b440460825f21611731564503cc1d5a0c123746477da6cd574f34885 https://github.com/airockchip/rknn-toolkit2/raw/refs/tags/v2.3.0/rknpu2/runtime/Linux/librknn_api/aarch64/librknnrt.so /usr/lib/
|
||||
|
||||
FROM prod-${DEVICE} AS prod
|
||||
|
||||
ARG DEVICE
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --no-install-recommends tini $(if ! [ "$DEVICE" = "openvino" ]; then echo "libmimalloc2.0"; fi) && \
|
||||
apt-get install -y --no-install-recommends tini $(if ! [ "$DEVICE" = "openvino" ] && ! [ "$DEVICE" = "rocm" ]; then echo "libmimalloc2.0"; fi) && \
|
||||
apt-get autoremove -yqq && \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
|
||||
This project uses [uv](https://docs.astral.sh/uv/getting-started/installation/), so be sure to install it first.
|
||||
Running `uv sync --extra cpu` will install everything you need in an isolated virtual environment.
|
||||
CUDA and OpenVINO are supported as acceleration APIs. To use them, you can replace `--group cpu` with either of `--group cuda` or `--group openvino`. In the case of CUDA, a [compute capability](https://developer.nvidia.com/cuda-gpus) of 5.2 or higher is required.
|
||||
CUDA, ROCM and OpenVINO are supported as acceleration APIs. To use them, you can replace `--extra cpu` with either of `--extra cuda`, `--extra rocm` or `--extra openvino`. In the case of CUDA, a [compute capability](https://developer.nvidia.com/cuda-gpus) of 5.2 or higher is required.
|
||||
|
||||
To add or remove dependencies, you can use the commands `uv add $PACKAGE_NAME` and `uv remove $PACKAGE_NAME`, respectively.
|
||||
Be sure to commit the `uv.lock` and `pyproject.toml` files with `uv lock` to reflect any changes in dependencies.
|
||||
|
||||
@@ -75,7 +75,12 @@ _INSIGHTFACE_MODELS = {
|
||||
}
|
||||
|
||||
|
||||
SUPPORTED_PROVIDERS = ["CUDAExecutionProvider", "OpenVINOExecutionProvider", "CPUExecutionProvider"]
|
||||
SUPPORTED_PROVIDERS = [
|
||||
"CUDAExecutionProvider",
|
||||
"ROCMExecutionProvider",
|
||||
"OpenVINOExecutionProvider",
|
||||
"CPUExecutionProvider",
|
||||
]
|
||||
|
||||
RKNN_SUPPORTED_SOCS = ["rk3566", "rk3568", "rk3576", "rk3588"]
|
||||
RKNN_COREMASK_SUPPORTED_SOCS = ["rk3576", "rk3588"]
|
||||
|
||||
@@ -88,7 +88,7 @@ class OrtSession:
|
||||
match provider:
|
||||
case "CPUExecutionProvider":
|
||||
options = {"arena_extend_strategy": "kSameAsRequested"}
|
||||
case "CUDAExecutionProvider":
|
||||
case "CUDAExecutionProvider" | "ROCMExecutionProvider":
|
||||
options = {"arena_extend_strategy": "kSameAsRequested", "device_id": settings.device_id}
|
||||
case "OpenVINOExecutionProvider":
|
||||
options = {
|
||||
|
||||
@@ -180,6 +180,7 @@ class TestOrtSession:
|
||||
OV_EP = ["OpenVINOExecutionProvider", "CPUExecutionProvider"]
|
||||
CUDA_EP_OUT_OF_ORDER = ["CPUExecutionProvider", "CUDAExecutionProvider"]
|
||||
TRT_EP = ["TensorrtExecutionProvider", "CUDAExecutionProvider", "CPUExecutionProvider"]
|
||||
ROCM_EP = ["ROCMExecutionProvider", "CPUExecutionProvider"]
|
||||
|
||||
@pytest.mark.providers(CPU_EP)
|
||||
def test_sets_cpu_provider(self, providers: list[str]) -> None:
|
||||
@@ -219,6 +220,12 @@ class TestOrtSession:
|
||||
|
||||
assert session.providers == self.CUDA_EP
|
||||
|
||||
@pytest.mark.providers(ROCM_EP)
|
||||
def test_uses_rocm(self, providers: list[str]) -> None:
|
||||
session = OrtSession("ViT-B-32__openai")
|
||||
|
||||
assert session.providers == self.ROCM_EP
|
||||
|
||||
def test_sets_provider_kwarg(self) -> None:
|
||||
providers = ["CUDAExecutionProvider"]
|
||||
session = OrtSession("ViT-B-32__openai", providers=providers)
|
||||
@@ -235,19 +242,33 @@ class TestOrtSession:
|
||||
{"arena_extend_strategy": "kSameAsRequested"},
|
||||
]
|
||||
|
||||
def test_sets_device_id_for_openvino(self) -> None:
|
||||
def test_sets_provider_options_for_openvino(self) -> None:
|
||||
model_path = "/cache/ViT-B-32__openai/textual/model.onnx"
|
||||
os.environ["MACHINE_LEARNING_DEVICE_ID"] = "1"
|
||||
|
||||
session = OrtSession("ViT-B-32__openai", providers=["OpenVINOExecutionProvider"])
|
||||
session = OrtSession(model_path, providers=["OpenVINOExecutionProvider"])
|
||||
|
||||
assert session.provider_options[0]["device_type"] == "GPU.1"
|
||||
assert session.provider_options == [
|
||||
{
|
||||
"device_type": "GPU.1",
|
||||
"precision": "FP32",
|
||||
"cache_dir": "/cache/ViT-B-32__openai/textual/openvino",
|
||||
}
|
||||
]
|
||||
|
||||
def test_sets_device_id_for_cuda(self) -> None:
|
||||
def test_sets_provider_options_for_cuda(self) -> None:
|
||||
os.environ["MACHINE_LEARNING_DEVICE_ID"] = "1"
|
||||
|
||||
session = OrtSession("ViT-B-32__openai", providers=["CUDAExecutionProvider"])
|
||||
|
||||
assert session.provider_options[0]["device_id"] == "1"
|
||||
assert session.provider_options == [{"arena_extend_strategy": "kSameAsRequested", "device_id": "1"}]
|
||||
|
||||
def test_sets_provider_options_for_rocm(self) -> None:
|
||||
os.environ["MACHINE_LEARNING_DEVICE_ID"] = "1"
|
||||
|
||||
session = OrtSession("ViT-B-32__openai", providers=["ROCMExecutionProvider"])
|
||||
|
||||
assert session.provider_options == [{"arena_extend_strategy": "kSameAsRequested", "device_id": "1"}]
|
||||
|
||||
def test_sets_provider_options_kwarg(self) -> None:
|
||||
session = OrtSession(
|
||||
|
||||
@@ -0,0 +1,179 @@
|
||||
commit 16839b58d9b3c3162a67ce5d776b36d4d24e801f
|
||||
Author: mertalev <101130780+mertalev@users.noreply.github.com>
|
||||
Date: Wed Mar 5 11:25:38 2025 -0500
|
||||
|
||||
disable algo caching (attributed to @dmnieto in https://github.com/microsoft/onnxruntime/pull/19567)
|
||||
|
||||
diff --git a/onnxruntime/core/providers/rocm/nn/conv.cc b/onnxruntime/core/providers/rocm/nn/conv.cc
|
||||
index d7f47d07a8..4060a2af52 100644
|
||||
--- a/onnxruntime/core/providers/rocm/nn/conv.cc
|
||||
+++ b/onnxruntime/core/providers/rocm/nn/conv.cc
|
||||
@@ -127,7 +127,6 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)
|
||||
|
||||
if (w_dims_changed) {
|
||||
s_.last_w_dims = gsl::make_span(w_dims);
|
||||
- s_.cached_benchmark_fwd_results.clear();
|
||||
}
|
||||
|
||||
ORT_RETURN_IF_ERROR(conv_attrs_.ValidateInputShape(X->Shape(), W->Shape(), channels_last, channels_last));
|
||||
@@ -277,35 +276,6 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)
|
||||
HIP_CALL_THROW(hipMalloc(&s_.b_zero, malloc_size));
|
||||
HIP_CALL_THROW(hipMemsetAsync(s_.b_zero, 0, malloc_size, Stream(context)));
|
||||
}
|
||||
-
|
||||
- if (!s_.cached_benchmark_fwd_results.contains(x_dims_miopen)) {
|
||||
- miopenConvAlgoPerf_t perf;
|
||||
- int algo_count = 1;
|
||||
- const ROCMExecutionProvider* rocm_ep = static_cast<const ROCMExecutionProvider*>(this->Info().GetExecutionProvider());
|
||||
- static constexpr int num_algos = MIOPEN_CONVOLUTION_FWD_ALGO_COUNT;
|
||||
- size_t max_ws_size = rocm_ep->GetMiopenConvUseMaxWorkspace() ? GetMaxWorkspaceSize(GetMiopenHandle(context), s_, kAllAlgos, num_algos, rocm_ep->GetDeviceId())
|
||||
- : AlgoSearchWorkspaceSize;
|
||||
- IAllocatorUniquePtr<void> algo_search_workspace = GetTransientScratchBuffer<void>(max_ws_size);
|
||||
- MIOPEN_RETURN_IF_ERROR(miopenFindConvolutionForwardAlgorithm(
|
||||
- GetMiopenHandle(context),
|
||||
- s_.x_tensor,
|
||||
- s_.x_data,
|
||||
- s_.w_desc,
|
||||
- s_.w_data,
|
||||
- s_.conv_desc,
|
||||
- s_.y_tensor,
|
||||
- s_.y_data,
|
||||
- 1, // requestedAlgoCount
|
||||
- &algo_count, // returnedAlgoCount
|
||||
- &perf,
|
||||
- algo_search_workspace.get(),
|
||||
- max_ws_size,
|
||||
- false)); // Do not do exhaustive algo search.
|
||||
- s_.cached_benchmark_fwd_results.insert(x_dims_miopen, {perf.fwd_algo, perf.memory});
|
||||
- }
|
||||
- const auto& perf = s_.cached_benchmark_fwd_results.at(x_dims_miopen);
|
||||
- s_.fwd_algo = perf.fwd_algo;
|
||||
- s_.workspace_bytes = perf.memory;
|
||||
} else {
|
||||
// set Y
|
||||
s_.Y = context->Output(0, TensorShape(s_.y_dims));
|
||||
@@ -319,6 +289,31 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)
|
||||
s_.y_data = reinterpret_cast<HipT*>(s_.Y->MutableData<T>());
|
||||
}
|
||||
}
|
||||
+
|
||||
+ miopenConvAlgoPerf_t perf;
|
||||
+ int algo_count = 1;
|
||||
+ const ROCMExecutionProvider* rocm_ep = static_cast<const ROCMExecutionProvider*>(this->Info().GetExecutionProvider());
|
||||
+ static constexpr int num_algos = MIOPEN_CONVOLUTION_FWD_ALGO_COUNT;
|
||||
+ size_t max_ws_size = rocm_ep->GetMiopenConvUseMaxWorkspace() ? GetMaxWorkspaceSize(GetMiopenHandle(context), s_, kAllAlgos, num_algos, rocm_ep->GetDeviceId())
|
||||
+ : AlgoSearchWorkspaceSize;
|
||||
+ IAllocatorUniquePtr<void> algo_search_workspace = GetTransientScratchBuffer<void>(max_ws_size);
|
||||
+ MIOPEN_RETURN_IF_ERROR(miopenFindConvolutionForwardAlgorithm(
|
||||
+ GetMiopenHandle(context),
|
||||
+ s_.x_tensor,
|
||||
+ s_.x_data,
|
||||
+ s_.w_desc,
|
||||
+ s_.w_data,
|
||||
+ s_.conv_desc,
|
||||
+ s_.y_tensor,
|
||||
+ s_.y_data,
|
||||
+ 1, // requestedAlgoCount
|
||||
+ &algo_count, // returnedAlgoCount
|
||||
+ &perf,
|
||||
+ algo_search_workspace.get(),
|
||||
+ max_ws_size,
|
||||
+ false)); // Do not do exhaustive algo search.
|
||||
+ s_.fwd_algo = perf.fwd_algo;
|
||||
+ s_.workspace_bytes = perf.memory;
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
diff --git a/onnxruntime/core/providers/rocm/nn/conv.h b/onnxruntime/core/providers/rocm/nn/conv.h
|
||||
index bc9846203e..d54218f258 100644
|
||||
--- a/onnxruntime/core/providers/rocm/nn/conv.h
|
||||
+++ b/onnxruntime/core/providers/rocm/nn/conv.h
|
||||
@@ -108,9 +108,6 @@ class lru_unordered_map {
|
||||
list_type lru_list_;
|
||||
};
|
||||
|
||||
-// cached miopen descriptors
|
||||
-constexpr size_t MAX_CACHED_ALGO_PERF_RESULTS = 10000;
|
||||
-
|
||||
template <typename AlgoPerfType>
|
||||
struct MiopenConvState {
|
||||
// if x/w dims changed, update algo and miopenTensors
|
||||
@@ -148,9 +145,6 @@ struct MiopenConvState {
|
||||
decltype(AlgoPerfType().memory) memory;
|
||||
};
|
||||
|
||||
- lru_unordered_map<TensorShapeVector, PerfFwdResultParams, vector_hash> cached_benchmark_fwd_results{MAX_CACHED_ALGO_PERF_RESULTS};
|
||||
- lru_unordered_map<TensorShapeVector, PerfBwdResultParams, vector_hash> cached_benchmark_bwd_results{MAX_CACHED_ALGO_PERF_RESULTS};
|
||||
-
|
||||
// Some properties needed to support asymmetric padded Conv nodes
|
||||
bool post_slicing_required;
|
||||
TensorShapeVector slice_starts;
|
||||
diff --git a/onnxruntime/core/providers/rocm/nn/conv_transpose.cc b/onnxruntime/core/providers/rocm/nn/conv_transpose.cc
|
||||
index 7447113fdf..a662e35b2e 100644
|
||||
--- a/onnxruntime/core/providers/rocm/nn/conv_transpose.cc
|
||||
+++ b/onnxruntime/core/providers/rocm/nn/conv_transpose.cc
|
||||
@@ -76,7 +76,6 @@ Status ConvTranspose<T, NHWC>::DoConvTranspose(OpKernelContext* context, bool dy
|
||||
|
||||
if (w_dims_changed) {
|
||||
s_.last_w_dims = gsl::make_span(w_dims);
|
||||
- s_.cached_benchmark_bwd_results.clear();
|
||||
}
|
||||
|
||||
ConvTransposeAttributes::Prepare p;
|
||||
@@ -126,35 +125,29 @@ Status ConvTranspose<T, NHWC>::DoConvTranspose(OpKernelContext* context, bool dy
|
||||
}
|
||||
|
||||
y_data = reinterpret_cast<HipT*>(p.Y->MutableData<T>());
|
||||
-
|
||||
- if (!s_.cached_benchmark_bwd_results.contains(x_dims)) {
|
||||
- IAllocatorUniquePtr<void> algo_search_workspace = GetScratchBuffer<void>(AlgoSearchWorkspaceSize, context->GetComputeStream());
|
||||
-
|
||||
- miopenConvAlgoPerf_t perf;
|
||||
- int algo_count = 1;
|
||||
- MIOPEN_RETURN_IF_ERROR(miopenFindConvolutionBackwardDataAlgorithm(
|
||||
- GetMiopenHandle(context),
|
||||
- s_.x_tensor,
|
||||
- x_data,
|
||||
- s_.w_desc,
|
||||
- w_data,
|
||||
- s_.conv_desc,
|
||||
- s_.y_tensor,
|
||||
- y_data,
|
||||
- 1,
|
||||
- &algo_count,
|
||||
- &perf,
|
||||
- algo_search_workspace.get(),
|
||||
- AlgoSearchWorkspaceSize,
|
||||
- false));
|
||||
- s_.cached_benchmark_bwd_results.insert(x_dims, {perf.bwd_data_algo, perf.memory});
|
||||
- }
|
||||
-
|
||||
- const auto& perf = s_.cached_benchmark_bwd_results.at(x_dims);
|
||||
- s_.bwd_data_algo = perf.bwd_data_algo;
|
||||
- s_.workspace_bytes = perf.memory;
|
||||
}
|
||||
|
||||
+ IAllocatorUniquePtr<void> algo_search_workspace = GetScratchBuffer<void>(AlgoSearchWorkspaceSize, context->GetComputeStream());
|
||||
+ miopenConvAlgoPerf_t perf;
|
||||
+ int algo_count = 1;
|
||||
+ MIOPEN_RETURN_IF_ERROR(miopenFindConvolutionBackwardDataAlgorithm(
|
||||
+ GetMiopenHandle(context),
|
||||
+ s_.x_tensor,
|
||||
+ x_data,
|
||||
+ s_.w_desc,
|
||||
+ w_data,
|
||||
+ s_.conv_desc,
|
||||
+ s_.y_tensor,
|
||||
+ y_data,
|
||||
+ 1,
|
||||
+ &algo_count,
|
||||
+ &perf,
|
||||
+ algo_search_workspace.get(),
|
||||
+ AlgoSearchWorkspaceSize,
|
||||
+ false));
|
||||
+ s_.bwd_data_algo = perf.bwd_data_algo;
|
||||
+ s_.workspace_bytes = perf.memory;
|
||||
+
|
||||
// The following block will be executed in case there has been no change in the shapes of the
|
||||
// input and the filter compared to the previous run
|
||||
if (!y_data) {
|
||||
13
machine-learning/patches/0002-target-gfx900-gfx1102.patch
Normal file
13
machine-learning/patches/0002-target-gfx900-gfx1102.patch
Normal file
@@ -0,0 +1,13 @@
|
||||
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
|
||||
index d90a2a355..bb1a7de12 100644
|
||||
--- a/cmake/CMakeLists.txt
|
||||
+++ b/cmake/CMakeLists.txt
|
||||
@@ -295,7 +295,7 @@ if (onnxruntime_USE_ROCM)
|
||||
endif()
|
||||
|
||||
if (NOT CMAKE_HIP_ARCHITECTURES)
|
||||
- set(CMAKE_HIP_ARCHITECTURES "gfx908;gfx90a;gfx1030;gfx1100;gfx1101;gfx940;gfx941;gfx942;gfx1200;gfx1201")
|
||||
+ set(CMAKE_HIP_ARCHITECTURES "gfx900;gfx908;gfx90a;gfx1030;gfx1100;gfx1101;gfx1102;gfx940;gfx941;gfx942;gfx1200;gfx1201")
|
||||
endif()
|
||||
|
||||
file(GLOB rocm_cmake_components ${onnxruntime_ROCM_HOME}/lib/cmake/*)
|
||||
@@ -52,6 +52,7 @@ cuda = ["onnxruntime-gpu>=1.17.0,<2"]
|
||||
openvino = ["onnxruntime-openvino>=1.17.1,<1.19.0"]
|
||||
armnn = ["onnxruntime>=1.15.0,<2"]
|
||||
rknn = ["onnxruntime>=1.15.0,<2", "rknn-toolkit-lite2>=2.3.0,<3"]
|
||||
rocm = []
|
||||
|
||||
[tool.uv]
|
||||
compile-bytecode = true
|
||||
|
||||
@@ -2,16 +2,19 @@
|
||||
|
||||
echo "Initializing Immich ML $IMMICH_SOURCE_REF"
|
||||
|
||||
lib_path="/usr/lib/$(arch)-linux-gnu/libmimalloc.so.2"
|
||||
# mimalloc seems to increase memory usage dramatically with openvino, need to investigate
|
||||
if ! [ "$DEVICE" = "openvino" ]; then
|
||||
export LD_PRELOAD="$lib_path"
|
||||
export LD_BIND_NOW=1
|
||||
: "${MACHINE_LEARNING_WORKER_TIMEOUT:=120}"
|
||||
else
|
||||
: "${MACHINE_LEARNING_WORKER_TIMEOUT:=300}"
|
||||
fi
|
||||
|
||||
# mimalloc seems to increase memory usage dramatically with openvino, need to investigate
|
||||
if ! [ "$DEVICE" = "openvino" ] && ! [ "$DEVICE" = "rocm" ]; then
|
||||
lib_path="/usr/lib/$(arch)-linux-gnu/libmimalloc.so.2"
|
||||
export LD_PRELOAD="$lib_path"
|
||||
export LD_BIND_NOW=1
|
||||
fi
|
||||
|
||||
: "${IMMICH_HOST:=[::]}"
|
||||
: "${IMMICH_PORT:=3003}"
|
||||
: "${MACHINE_LEARNING_WORKERS:=1}"
|
||||
|
||||
2
machine-learning/uv.lock
generated
2
machine-learning/uv.lock
generated
@@ -1180,7 +1180,7 @@ requires-dist = [
|
||||
{ name = "tokenizers", specifier = ">=0.15.0,<1.0" },
|
||||
{ name = "uvicorn", extras = ["standard"], specifier = ">=0.22.0,<1.0" },
|
||||
]
|
||||
provides-extras = ["cpu", "cuda", "openvino", "armnn", "rknn"]
|
||||
provides-extras = ["cpu", "cuda", "openvino", "armnn", "rknn", "rocm"]
|
||||
|
||||
[package.metadata.requires-dev]
|
||||
dev = [
|
||||
|
||||
Reference in New Issue
Block a user