Skip to content

Commit b7906ea

Browse files
authored
[LEP-3790] fix(nvidia/fabric-manager): skip V3 fabric API on older drivers to prevent crash (#1183)
nvmlDeviceGetGpuFabricInfoV was introduced in driver 550. Calling it on older drivers (e.g., 535.x) causes a symbol lookup error that crashes the process with exit code 127 at the dynamic linker level.

This fix:
- Adds MinDriverVersionForV3FabricAPI constant (550) with documentation
- Passes driver major version to device creation via WithDriverMajor()
- Checks driver version before calling V3 API in GetFabricState()
- Logs a warning (once) when skipping V3 API due to old driver
- Falls back to V1 API (GetGpuFabricInfo) on older drivers

The V1 API provides basic fabric info but lacks health metrics (HealthMask, HealthSummary), which are set to defaults.

Reference: https://docs.nvidia.com/deploy/nvml-api/change-log.html

Before:

> gpud scan
>
> {"level":"info","ts":"2025-12-18T16:17:35.540+0530","caller":"fabric-manager/component.go:279","msg":"checking nvidia fabric manager"}
> gpud: symbol lookup error: gpud: undefined symbol: nvmlDeviceGetGpuFabricInfoV

After:

> gpud scan
>
> {"level":"info","ts":"2025-12-18T17:30:41.418+0530","caller":"fabric-manager/component.go:279","msg":"checking nvidia fabric manager"}
> {"level":"warn","ts":"2025-12-18T17:30:41.467+0530","caller":"device/fabric_state.go:311","msg":"skipping fabric state V3 API (nvmlDeviceGetGpuFabricInfoV) due to old driver version; V3 API requires driver >= 550","driverMajor":535,"minRequired":550}

health states:

```json
"component": "accelerator-nvidia-fabric-manager",
"states": [
  {
    "time": "2025-12-18T12:01:59Z",
    "component": "accelerator-nvidia-fabric-manager",
    "name": "accelerator-nvidia-fabric-manager",
    "health": "Healthy",
    "reason": "fabric manager found and active",
```

Signed-off-by: Gyuho Lee <[email protected]>
1 parent 98f0311 commit b7906ea

File tree

5 files changed

+411
-17
lines changed

5 files changed

+411
-17
lines changed

pkg/nvidia-query/nvml/device/device.go

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,19 @@ type Device interface {
2020

2121
var _ Device = &nvDevice{}
2222

23+
// MinDriverVersionForV3FabricAPI is the minimum NVIDIA driver major version
24+
// required for nvmlDeviceGetGpuFabricInfoV (V3 fabric state API).
25+
// This API was introduced in driver 550 (see NVML changelog:
26+
// https://docs.nvidia.com/deploy/nvml-api/change-log.html).
27+
// Calling this function on older drivers (e.g., 535.x) causes a symbol lookup
28+
// error that crashes the process.
//
// Only the driver's major version is compared against this value. A device
// whose driver major was never supplied keeps the zero value, which is below
// this threshold, so the V3 API is skipped for it as well.
29+
const MinDriverVersionForV3FabricAPI = 550
30+
2331
// nvDevice is a Device implementation that wraps an underlying device.Device
// handle together with values captured when the device is constructed.
type nvDevice struct {
2432
device.Device
25-
busID string
26-
uuid string
33+
busID string
34+
uuid string
35+
driverMajor int // Driver major version, used to gate V3 fabric API calls
2736
}
2837

2938
func (d *nvDevice) PCIBusID() string {
@@ -45,7 +54,7 @@ func New(dev device.Device, busID string, opts ...OpOption) Device {
4554
}
4655

4756
// Create the base device
48-
baseDevice := &nvDevice{Device: dev, busID: busID, uuid: uuid}
57+
baseDevice := &nvDevice{Device: dev, busID: busID, uuid: uuid, driverMajor: op.DriverMajor}
4958

5059
// If ANY test flags are set, wrap with testDevice
5160
if op.GPULost || op.GPURequiresReset || op.FabricHealthUnhealthy {
@@ -62,6 +71,9 @@ func New(dev device.Device, busID string, opts ...OpOption) Device {
6271

6372
// Op struct holds options for device creation
6473
type Op struct {
74+
// DriverMajor is the major version of the NVIDIA driver.
75+
// Used to gate V3 fabric API calls which require driver >= 550.
76+
DriverMajor int
6577
// GPULost indicates that all device methods should return nvml.ERROR_GPU_IS_LOST
6678
GPULost bool
6779
// GPURequiresReset indicates that all device methods should return nvml.ERROR_RESET_REQUIRED
@@ -100,3 +112,11 @@ func WithFabricHealthUnhealthy() OpOption {
100112
op.FabricHealthUnhealthy = true
101113
}
102114
}
115+
116+
// WithDriverMajor returns an OpOption that sets the driver major version.
117+
// This is used to gate V3 fabric API calls which require driver >= 550.
118+
func WithDriverMajor(major int) OpOption {
119+
return func(op *Op) {
120+
op.DriverMajor = major
121+
}
122+
}
Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
package device
2+
3+
import (
4+
"testing"
5+
6+
"github.com/stretchr/testify/assert"
7+
"github.com/stretchr/testify/require"
8+
)
9+
10+
func TestMinDriverVersionForV3FabricAPI(t *testing.T) {
11+
t.Parallel()
12+
13+
// Verify the constant is set correctly per NVIDIA documentation
14+
// nvmlDeviceGetGpuFabricInfoV was introduced in driver 550
15+
// See: https://docs.nvidia.com/deploy/nvml-api/change-log.html
16+
assert.Equal(t, 550, MinDriverVersionForV3FabricAPI,
17+
"MinDriverVersionForV3FabricAPI should be 550 per NVIDIA NVML changelog")
18+
}
19+
20+
func TestWithDriverMajor(t *testing.T) {
21+
t.Parallel()
22+
23+
tests := []struct {
24+
name string
25+
driverMajor int
26+
expectedMajor int
27+
}{
28+
{
29+
name: "driver version 535 (old driver)",
30+
driverMajor: 535,
31+
expectedMajor: 535,
32+
},
33+
{
34+
name: "driver version 550 (minimum for V3 API)",
35+
driverMajor: 550,
36+
expectedMajor: 550,
37+
},
38+
{
39+
name: "driver version 560 (newer driver)",
40+
driverMajor: 560,
41+
expectedMajor: 560,
42+
},
43+
{
44+
name: "driver version 0 (uninitialized)",
45+
driverMajor: 0,
46+
expectedMajor: 0,
47+
},
48+
}
49+
50+
for _, tt := range tests {
51+
tt := tt
52+
t.Run(tt.name, func(t *testing.T) {
53+
t.Parallel()
54+
55+
op := &Op{}
56+
opt := WithDriverMajor(tt.driverMajor)
57+
opt(op)
58+
59+
assert.Equal(t, tt.expectedMajor, op.DriverMajor)
60+
})
61+
}
62+
}
63+
64+
func TestOpApplyOpts(t *testing.T) {
65+
t.Parallel()
66+
67+
t.Run("multiple options applied correctly", func(t *testing.T) {
68+
t.Parallel()
69+
70+
op := &Op{}
71+
opts := []OpOption{
72+
WithDriverMajor(550),
73+
WithGPULost(),
74+
WithGPURequiresReset(),
75+
WithFabricHealthUnhealthy(),
76+
}
77+
op.applyOpts(opts)
78+
79+
assert.Equal(t, 550, op.DriverMajor)
80+
assert.True(t, op.GPULost)
81+
assert.True(t, op.GPURequiresReset)
82+
assert.True(t, op.FabricHealthUnhealthy)
83+
})
84+
85+
t.Run("driver major can be overwritten", func(t *testing.T) {
86+
t.Parallel()
87+
88+
op := &Op{}
89+
opts := []OpOption{
90+
WithDriverMajor(535),
91+
WithDriverMajor(550), // overwrite
92+
}
93+
op.applyOpts(opts)
94+
95+
assert.Equal(t, 550, op.DriverMajor)
96+
})
97+
}
98+
99+
func TestNvDeviceDriverMajorField(t *testing.T) {
100+
t.Parallel()
101+
102+
// Test that the driverMajor field is correctly set on nvDevice
103+
// We can't directly test nvDevice since it's private, but we can verify
104+
// the Op struct correctly stores the value
105+
106+
tests := []struct {
107+
name string
108+
driverMajor int
109+
expectV3API bool // whether V3 API should be attempted
110+
}{
111+
{
112+
name: "driver 535 should skip V3 API",
113+
driverMajor: 535,
114+
expectV3API: false,
115+
},
116+
{
117+
name: "driver 549 should skip V3 API",
118+
driverMajor: 549,
119+
expectV3API: false,
120+
},
121+
{
122+
name: "driver 550 should use V3 API",
123+
driverMajor: 550,
124+
expectV3API: true,
125+
},
126+
{
127+
name: "driver 560 should use V3 API",
128+
driverMajor: 560,
129+
expectV3API: true,
130+
},
131+
{
132+
name: "driver 0 (uninitialized) should skip V3 API",
133+
driverMajor: 0,
134+
expectV3API: false,
135+
},
136+
}
137+
138+
for _, tt := range tests {
139+
tt := tt
140+
t.Run(tt.name, func(t *testing.T) {
141+
t.Parallel()
142+
143+
// Verify the logic matches expectations
144+
shouldUseV3 := tt.driverMajor >= MinDriverVersionForV3FabricAPI
145+
assert.Equal(t, tt.expectV3API, shouldUseV3,
146+
"driver major %d: expected V3 API usage = %v", tt.driverMajor, tt.expectV3API)
147+
})
148+
}
149+
}
150+
151+
func TestDriverVersionBoundaryConditions(t *testing.T) {
152+
t.Parallel()
153+
154+
// Test boundary conditions around the minimum driver version
155+
tests := []struct {
156+
driverMajor int
157+
expectV3 bool
158+
description string
159+
}{
160+
{driverMajor: 548, expectV3: false, description: "two versions below minimum"},
161+
{driverMajor: 549, expectV3: false, description: "one version below minimum"},
162+
{driverMajor: 550, expectV3: true, description: "exactly at minimum"},
163+
{driverMajor: 551, expectV3: true, description: "one version above minimum"},
164+
{driverMajor: 552, expectV3: true, description: "two versions above minimum"},
165+
}
166+
167+
for _, tt := range tests {
168+
tt := tt
169+
t.Run(tt.description, func(t *testing.T) {
170+
t.Parallel()
171+
172+
shouldUseV3 := tt.driverMajor >= MinDriverVersionForV3FabricAPI
173+
require.Equal(t, tt.expectV3, shouldUseV3,
174+
"driver %d: V3 API should be %s",
175+
tt.driverMajor,
176+
map[bool]string{true: "enabled", false: "disabled"}[tt.expectV3])
177+
})
178+
}
179+
}

pkg/nvidia-query/nvml/device/fabric_state.go

Lines changed: 38 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,20 @@ import (
55
"fmt"
66
"io"
77
"sort"
8+
"sync"
89

910
"github.com/NVIDIA/go-nvml/pkg/nvml"
1011
"github.com/olekukonko/tablewriter"
1112

13+
"github.com/leptonai/gpud/pkg/log"
1214
nvmlerrors "github.com/leptonai/gpud/pkg/nvidia/errors"
1315
)
1416

17+
var (
18+
// logV3SkipOnce ensures we only log the V3 API skip warning once.
// It is package-level and shared by every nvDevice, so the warning is
// emitted once per process rather than once per device; the logged
// driverMajor is that of the first device to reach the skip path.
19+
logV3SkipOnce sync.Once
20+
)
21+
1522
// FabricState represents fabric state information for a GPU device.
1623
// This struct encapsulates all fabric-related data from V3 and V1 APIs.
1724
type FabricState struct {
@@ -274,21 +281,39 @@ func fabricTriStateStatus(val, notSupported, trueValue, falseValue uint32) strin
274281
// GetFabricState retrieves fabric state information from the device.
275282
// It attempts V3 API first for detailed health metrics, falling back to V1 API if needed.
276283
// This method properly handles GPU lost and reset required errors.
284+
//
285+
// IMPORTANT: The V3 API (nvmlDeviceGetGpuFabricInfoV) was introduced in driver 550.
286+
// Calling it on older drivers (e.g., 535.x) causes a symbol lookup error that crashes
287+
// the process. We skip V3 API calls for drivers < MinDriverVersionForV3FabricAPI.
288+
// See: https://docs.nvidia.com/deploy/nvml-api/change-log.html
277289
func (d *nvDevice) GetFabricState() (FabricState, error) {
278-
// Try V3 API first (provides detailed health metrics)
279-
handler := d.GetGpuFabricInfoV()
280-
info, ret := handler.V3()
281-
if ret == nvml.SUCCESS {
282-
return FabricState{
283-
CliqueID: info.CliqueId,
284-
ClusterUUID: formatClusterUUID(info.ClusterUuid),
285-
State: info.State,
286-
Status: nvml.Return(info.Status),
287-
HealthMask: info.HealthMask,
288-
HealthSummary: info.HealthSummary,
289-
}, nil
290+
// Only try V3 API if driver supports it (>= 550).
291+
// Calling GetGpuFabricInfoV() on older drivers (e.g., 535.x) causes:
292+
// "symbol lookup error: undefined symbol: nvmlDeviceGetGpuFabricInfoV"
293+
// which crashes the process (exit code 127) at the dynamic linker level.
294+
if d.driverMajor >= MinDriverVersionForV3FabricAPI {
295+
handler := d.GetGpuFabricInfoV()
296+
info, ret := handler.V3()
297+
if ret == nvml.SUCCESS {
298+
return FabricState{
299+
CliqueID: info.CliqueId,
300+
ClusterUUID: formatClusterUUID(info.ClusterUuid),
301+
State: info.State,
302+
Status: nvml.Return(info.Status),
303+
HealthMask: info.HealthMask,
304+
HealthSummary: info.HealthSummary,
305+
}, nil
306+
}
307+
// V3 failed, fall through to V1
308+
} else {
309+
// Log warning once when skipping V3 API due to old driver
310+
logV3SkipOnce.Do(func() {
311+
log.Logger.Warnw("skipping fabric state V3 API (nvmlDeviceGetGpuFabricInfoV) due to old driver version; V3 API requires driver >= 550",
312+
"driverMajor", d.driverMajor,
313+
"minRequired", MinDriverVersionForV3FabricAPI,
314+
)
315+
})
290316
}
291-
// V3 failed, fall through to V1
292317

293318
// Try V1 API (basic fabric information)
294319
infoV1, ret := d.GetGpuFabricInfo()

0 commit comments

Comments
 (0)