From e32715ec031d627dd90dda5cbc39ca71ae1ef969 Mon Sep 17 00:00:00 2001 From: Sudheendra Gopinath Date: Sat, 10 May 2025 06:42:48 +0000 Subject: [PATCH] Added support for AMD GPUs in "docker run --gpus". Added backend code to support the exact same interface used today for Nvidia GPUs, allowing customers to use the same docker commands for both Nvidia and AMD GPUs. Signed-off-by: Sudheendra Gopinath Reused common functions from nvidia_linux.go. Removed duplicate code in amd_linux.go by reusing the init() and countToDevices() functions in nvidia_linux.go. AMD driver is registered in init(). Signed-off-by: Sudheendra Gopinath Renamed amd-container-runtime constant Signed-off-by: Sudheendra Gopinath Removed empty branch to keep linter happy. Also renamed amd_linux.go to gpu_amd_linux.go. Signed-off-by: Sudheendra Gopinath Renamed nvidia_linux.go to devices_nvidia_linux.go and gpu_amd_linux.go to devices_amd_linux.go. Signed-off-by: Sudheendra Gopinath --- daemon/devices_amd_linux.go | 27 ++++++++++++++ ...vidia_linux.go => devices_nvidia_linux.go} | 35 +++++++++++++------ 2 files changed, 51 insertions(+), 11 deletions(-) create mode 100644 daemon/devices_amd_linux.go rename daemon/{nvidia_linux.go => devices_nvidia_linux.go} (78%) diff --git a/daemon/devices_amd_linux.go b/daemon/devices_amd_linux.go new file mode 100644 index 0000000000..a728c7074e --- /dev/null +++ b/daemon/devices_amd_linux.go @@ -0,0 +1,27 @@ +package daemon + +import ( + "strings" + + "github.com/opencontainers/runtime-spec/specs-go" +) + +func setAMDGPUs(s *specs.Spec, dev *deviceInstance) error { + req := dev.req + if req.Count != 0 && len(req.DeviceIDs) > 0 { + return errConflictCountDeviceIDs + } + + switch { + case len(req.DeviceIDs) > 0: + s.Process.Env = append(s.Process.Env, "AMD_VISIBLE_DEVICES="+strings.Join(req.DeviceIDs, ",")) + case req.Count > 0: + s.Process.Env = append(s.Process.Env, "AMD_VISIBLE_DEVICES="+countToDevices(req.Count)) + case req.Count < 0: + s.Process.Env = append(s.Process.Env, "AMD_VISIBLE_DEVICES=all") + case 
req.Count == 0: + s.Process.Env = append(s.Process.Env, "AMD_VISIBLE_DEVICES=void") + } + + return nil +} diff --git a/daemon/nvidia_linux.go b/daemon/devices_nvidia_linux.go similarity index 78% rename from daemon/nvidia_linux.go rename to daemon/devices_nvidia_linux.go index abc6b4a351..8a30343134 100644 --- a/daemon/nvidia_linux.go +++ b/daemon/devices_nvidia_linux.go @@ -17,7 +17,10 @@ import ( var errConflictCountDeviceIDs = errors.New("cannot set both Count and DeviceIDs on device request") -const nvidiaHook = "nvidia-container-runtime-hook" +const ( + nvidiaHook = "nvidia-container-runtime-hook" + amdContainerRuntimeExecutableName = "amd-container-runtime" +) // These are NVIDIA-specific capabilities stolen from github.com/containerd/containerd/contrib/nvidia.allCaps var allNvidiaCaps = map[nvidia.Capability]struct{}{ @@ -30,19 +33,29 @@ var allNvidiaCaps = map[nvidia.Capability]struct{}{ } func init() { - if _, err := exec.LookPath(nvidiaHook); err != nil { - // do not register Nvidia driver if helper binary is not present. + // Register Nvidia driver if Nvidia helper binary is present. + if _, err := exec.LookPath(nvidiaHook); err == nil { + capset := capabilities.Set{"gpu": struct{}{}, "nvidia": struct{}{}} + for c := range allNvidiaCaps { + capset[string(c)] = struct{}{} + } + registerDeviceDriver("nvidia", &deviceDriver{ + capset: capset, + updateSpec: setNvidiaGPUs, + }) return } - capset := capabilities.Set{"gpu": struct{}{}, "nvidia": struct{}{}} - nvidiaDriver := &deviceDriver{ - capset: capset, - updateSpec: setNvidiaGPUs, + + // Register AMD driver if AMD helper binary is present. 
+ if _, err := exec.LookPath(amdContainerRuntimeExecutableName); err == nil { + registerDeviceDriver("amd", &deviceDriver{ + capset: capabilities.Set{"gpu": struct{}{}, "amd": struct{}{}}, + updateSpec: setAMDGPUs, + }) + return } - for c := range allNvidiaCaps { - nvidiaDriver.capset[string(c)] = struct{}{} - } - registerDeviceDriver("nvidia", nvidiaDriver) + + // No "gpu" capability } func setNvidiaGPUs(s *specs.Spec, dev *deviceInstance) error {