Probable fix for Jetson GPU issue (#895)

This commit is contained in:
henrygd
2025-06-26 22:11:48 -04:00
parent 8c52f30a71
commit 4395520a28
2 changed files with 106 additions and 45 deletions

View File

@@ -243,21 +243,26 @@ func (gm *GPUManager) GetCurrentData() map[string]system.GPUData {
// copy / reset the data // copy / reset the data
gpuData := make(map[string]system.GPUData, len(gm.GpuDataMap)) gpuData := make(map[string]system.GPUData, len(gm.GpuDataMap))
for id, gpu := range gm.GpuDataMap { for id, gpu := range gm.GpuDataMap {
// sum the data var gpuAvg system.GPUData
gpu.Temperature = twoDecimals(gpu.Temperature)
gpu.MemoryUsed = twoDecimals(gpu.MemoryUsed) gpuAvg.Temperature = twoDecimals(gpu.Temperature)
gpu.MemoryTotal = twoDecimals(gpu.MemoryTotal) gpuAvg.MemoryUsed = twoDecimals(gpu.MemoryUsed)
gpu.Usage = twoDecimals(gpu.Usage / gpu.Count) gpuAvg.MemoryTotal = twoDecimals(gpu.MemoryTotal)
gpu.Power = twoDecimals(gpu.Power / gpu.Count)
// reset the count // avoid division by zero
gpu.Count = 1 if gpu.Count > 0 {
// dereference to avoid overwriting anything else gpuAvg.Usage = twoDecimals(gpu.Usage / gpu.Count)
gpuCopy := *gpu gpuAvg.Power = twoDecimals(gpu.Power / gpu.Count)
}
// reset accumulators in the original
gpu.Usage, gpu.Power, gpu.Count = 0, 0, 0
// append id to the name if there are multiple GPUs with the same name // append id to the name if there are multiple GPUs with the same name
if nameCounts[gpu.Name] > 1 { if nameCounts[gpu.Name] > 1 {
gpuCopy.Name = fmt.Sprintf("%s %s", gpu.Name, id) gpuAvg.Name = fmt.Sprintf("%s %s", gpu.Name, id)
} }
gpuData[id] = gpuCopy gpuData[id] = gpuAvg
} }
slog.Debug("GPU", "data", gpuData) slog.Debug("GPU", "data", gpuData)
return gpuData return gpuData

View File

@@ -279,6 +279,19 @@ func TestParseJetsonData(t *testing.T) {
Count: 1, Count: 1,
}, },
}, },
{
name: "orin nano",
input: "06-18-2025 11:25:24 RAM 3452/7620MB (lfb 25x4MB) SWAP 1518/16384MB (cached 174MB) CPU [1%@1420,2%@1420,0%@1420,2%@1420,2%@729,1%@729] GR3D_FREQ 0% cpu@50.031C soc2@49.031C soc0@50C gpu@49.031C tj@50.25C soc1@50.25C VDD_IN 4824mW/4824mW VDD_CPU_GPU_CV 518mW/518mW VDD_SOC 1475mW/1475mW",
wantMetrics: &system.GPUData{
Name: "GPU",
MemoryUsed: 3452.0,
MemoryTotal: 7620.0,
Usage: 0.0,
Temperature: 50.25,
Power: 0.518,
Count: 1,
},
},
{ {
name: "missing temperature", name: "missing temperature",
input: "11-14-2024 22:54:33 RAM 4300/30698MB GR3D_FREQ 45% VDD_GPU_SOC 2171mW", input: "11-14-2024 22:54:33 RAM 4300/30698MB GR3D_FREQ 45% VDD_GPU_SOC 2171mW",
@@ -318,6 +331,7 @@ func TestParseJetsonData(t *testing.T) {
} }
func TestGetCurrentData(t *testing.T) { func TestGetCurrentData(t *testing.T) {
t.Run("calculates averages and resets accumulators", func(t *testing.T) {
gm := &GPUManager{ gm := &GPUManager{
GpuDataMap: map[string]*system.GPUData{ GpuDataMap: map[string]*system.GPUData{
"0": { "0": {
@@ -347,15 +361,45 @@ func TestGetCurrentData(t *testing.T) {
assert.Equal(t, "GPU1 0", result["0"].Name) assert.Equal(t, "GPU1 0", result["0"].Name)
assert.Equal(t, "GPU1 1", result["1"].Name) assert.Equal(t, "GPU1 1", result["1"].Name)
// Check averaged values // Check averaged values in the result
assert.InDelta(t, 50.0, result["0"].Usage, 0.01) assert.InDelta(t, 50.0, result["0"].Usage, 0.01)
assert.InDelta(t, 100.0, result["0"].Power, 0.01) assert.InDelta(t, 100.0, result["0"].Power, 0.01)
assert.InDelta(t, 30.0, result["1"].Usage, 0.01) assert.InDelta(t, 30.0, result["1"].Usage, 0.01)
assert.InDelta(t, 60.0, result["1"].Power, 0.01) assert.InDelta(t, 60.0, result["1"].Power, 0.01)
// Verify reset counts // Verify that accumulators in the original map are reset
assert.Equal(t, float64(1), gm.GpuDataMap["0"].Count) assert.Equal(t, float64(0), gm.GpuDataMap["0"].Count, "GPU 0 Count should be reset")
assert.Equal(t, float64(1), gm.GpuDataMap["1"].Count) assert.Equal(t, float64(0), gm.GpuDataMap["0"].Usage, "GPU 0 Usage should be reset")
assert.Equal(t, float64(0), gm.GpuDataMap["0"].Power, "GPU 0 Power should be reset")
assert.Equal(t, float64(0), gm.GpuDataMap["1"].Count, "GPU 1 Count should be reset")
assert.Equal(t, float64(0), gm.GpuDataMap["1"].Usage, "GPU 1 Usage should be reset")
assert.Equal(t, float64(0), gm.GpuDataMap["1"].Power, "GPU 1 Power should be reset")
})
t.Run("handles zero count without panicking", func(t *testing.T) {
gm := &GPUManager{
GpuDataMap: map[string]*system.GPUData{
"0": {
Name: "TestGPU",
Count: 0,
Usage: 0,
Power: 0,
},
},
}
var result map[string]system.GPUData
assert.NotPanics(t, func() {
result = gm.GetCurrentData()
})
// Check that usage and power are 0
assert.Equal(t, 0.0, result["0"].Usage)
assert.Equal(t, 0.0, result["0"].Power)
// Verify reset count
assert.Equal(t, float64(0), gm.GpuDataMap["0"].Count)
})
} }
func TestDetectGPUs(t *testing.T) { func TestDetectGPUs(t *testing.T) {
@@ -722,6 +766,18 @@ func TestAccumulation(t *testing.T) {
assert.InDelta(t, expected.avgUsage, gpu.Usage, 0.01, "Average usage in GetCurrentData should match") assert.InDelta(t, expected.avgUsage, gpu.Usage, 0.01, "Average usage in GetCurrentData should match")
assert.InDelta(t, expected.avgPower, gpu.Power, 0.01, "Average power in GetCurrentData should match") assert.InDelta(t, expected.avgPower, gpu.Power, 0.01, "Average power in GetCurrentData should match")
} }
// Verify that accumulators in the original map are reset
for id := range tt.expectedValues {
gpu, exists := gm.GpuDataMap[id]
assert.True(t, exists, "GPU with ID %s should still exist after GetCurrentData", id)
if !exists {
continue
}
assert.Equal(t, float64(0), gpu.Count, "Count should be reset for GPU ID %s", id)
assert.Equal(t, float64(0), gpu.Usage, "Usage should be reset for GPU ID %s", id)
assert.Equal(t, float64(0), gpu.Power, "Power should be reset for GPU ID %s", id)
}
}) })
} }
} }