Added support for AMD GPUs in "docker run --gpus".

Added backend code to support the exact same interface
used today for Nvidia GPUs, allowing customers to use
the same docker commands for both Nvidia and AMD GPUs.

Signed-off-by: Sudheendra Gopinath <sudheendra.gopinath@amd.com>

Reused common functions from nvidia_linux.go.

Removed duplicate code in amd_linux.go by reusing
the init() and countToDevices() functions from
nvidia_linux.go. The AMD driver is now registered in that shared init().

Signed-off-by: Sudheendra Gopinath <sudheendra.gopinath@amd.com>

Renamed the amd-container-runtime constant (it appears in the diff as amdContainerRuntimeExecutableName).

Signed-off-by: Sudheendra Gopinath <sudheendra.gopinath@amd.com>

Removed empty branch to keep linter happy.

Also renamed amd_linux.go to gpu_amd_linux.go.

Signed-off-by: Sudheendra Gopinath <sudheendra.gopinath@amd.com>

Renamed nvidia_linux.go and gpu_amd_linux.go.

Signed-off-by: Sudheendra Gopinath <sudheendra.gopinath@amd.com>
This commit is contained in:
Sudheendra Gopinath
2025-05-10 06:42:48 +00:00
parent 325076df0c
commit e32715ec03
2 changed files with 51 additions and 11 deletions

View File

@@ -0,0 +1,27 @@
package daemon
import (
"strings"
"github.com/opencontainers/runtime-spec/specs-go"
)
// setAMDGPUs translates the device request carried by dev into an
// AMD_VISIBLE_DEVICES entry appended to the process environment of s.
//
// The value is chosen as follows:
//   - explicit DeviceIDs: the IDs joined with commas;
//   - positive Count: whatever countToDevices returns for that count;
//   - negative Count: the literal "all";
//   - zero Count and no DeviceIDs: the literal "void".
//
// It returns errConflictCountDeviceIDs when the request sets both a non-zero
// Count and at least one DeviceID; in that case s is left unmodified.
func setAMDGPUs(s *specs.Spec, dev *deviceInstance) error {
	request := dev.req
	hasIDs := len(request.DeviceIDs) > 0

	// Count and DeviceIDs are mutually exclusive ways of selecting devices.
	if hasIDs && request.Count != 0 {
		return errConflictCountDeviceIDs
	}

	// Work out the single environment entry first, then append it once.
	var entry string
	if hasIDs {
		entry = "AMD_VISIBLE_DEVICES=" + strings.Join(request.DeviceIDs, ",")
	} else if request.Count > 0 {
		entry = "AMD_VISIBLE_DEVICES=" + countToDevices(request.Count)
	} else if request.Count < 0 {
		entry = "AMD_VISIBLE_DEVICES=all"
	} else {
		// Count is exactly zero and no device IDs were given.
		entry = "AMD_VISIBLE_DEVICES=void"
	}

	s.Process.Env = append(s.Process.Env, entry)
	return nil
}

View File

@@ -17,7 +17,10 @@ import (
var errConflictCountDeviceIDs = errors.New("cannot set both Count and DeviceIDs on device request")
const nvidiaHook = "nvidia-container-runtime-hook"
const (
nvidiaHook = "nvidia-container-runtime-hook"
amdContainerRuntimeExecutableName = "amd-container-runtime"
)
// These are NVIDIA-specific capabilities stolen from github.com/containerd/containerd/contrib/nvidia.allCaps
var allNvidiaCaps = map[nvidia.Capability]struct{}{
@@ -30,19 +33,29 @@ var allNvidiaCaps = map[nvidia.Capability]struct{}{
}
func init() {
if _, err := exec.LookPath(nvidiaHook); err != nil {
// do not register Nvidia driver if helper binary is not present.
// Register Nvidia driver if Nvidia helper binary is present.
if _, err := exec.LookPath(nvidiaHook); err == nil {
capset := capabilities.Set{"gpu": struct{}{}, "nvidia": struct{}{}}
for c := range allNvidiaCaps {
capset[string(c)] = struct{}{}
}
registerDeviceDriver("nvidia", &deviceDriver{
capset: capset,
updateSpec: setNvidiaGPUs,
})
return
}
capset := capabilities.Set{"gpu": struct{}{}, "nvidia": struct{}{}}
nvidiaDriver := &deviceDriver{
capset: capset,
updateSpec: setNvidiaGPUs,
// Register AMD driver if AMD helper binary is present.
if _, err := exec.LookPath(amdContainerRuntimeExecutableName); err == nil {
registerDeviceDriver("amd", &deviceDriver{
capset: capabilities.Set{"gpu": struct{}{}, "amd": struct{}{}},
updateSpec: setAMDGPUs,
})
return
}
for c := range allNvidiaCaps {
nvidiaDriver.capset[string(c)] = struct{}{}
}
registerDeviceDriver("nvidia", nvidiaDriver)
// No "gpu" capability
}
func setNvidiaGPUs(s *specs.Spec, dev *deviceInstance) error {