diff --git a/beszel/internal/agent/gpu.go b/beszel/internal/agent/gpu.go index 713a911..de947bf 100644 --- a/beszel/internal/agent/gpu.go +++ b/beszel/internal/agent/gpu.go @@ -125,14 +125,13 @@ func (gm *GPUManager) getJetsonParser() func(output []byte) bool { // TODO: Maybe use VDD_IN for Nano / NX and add a total system power chart powerPattern := regexp.MustCompile(`(GPU_SOC|CPU_GPU_CV) (\d+)mW`) + // jetson devices have only one gpu so we'll just initialize here + gpuData := &system.GPUData{Name: "GPU"} + gm.GpuDataMap["0"] = gpuData + return func(output []byte) bool { gm.Lock() defer gm.Unlock() - // we get gpu name from the intitial run of nvidia-smi, so return if it hasn't been initialized - gpuData, ok := gm.GpuDataMap["0"] - if !ok { - return true - } // Parse RAM usage ramMatches := ramPattern.FindSubmatch(output) if ramMatches != nil { @@ -184,12 +183,6 @@ func (gm *GPUManager) parseNvidiaData(output []byte) bool { if _, ok := gm.GpuDataMap[id]; !ok { name := strings.TrimPrefix(fields[1], "NVIDIA ") gm.GpuDataMap[id] = &system.GPUData{Name: strings.TrimSuffix(name, " Laptop GPU")} - // check if tegrastats is active - if so we will only use nvidia-smi to get gpu name - // - nvidia-smi does not provide metrics for tegra / jetson devices - // this will end the nvidia-smi collector - if gm.tegrastats { - return false - } } // update gpu data gpu := gm.GpuDataMap[id] @@ -283,6 +276,7 @@ func (gm *GPUManager) detectGPUs() error { } if _, err := exec.LookPath(tegraStatsCmd); err == nil { gm.tegrastats = true + gm.nvidiaSmi = false } if gm.nvidiaSmi || gm.rocmSmi || gm.tegrastats { return nil @@ -297,9 +291,11 @@ func (gm *GPUManager) startCollector(command string) { } switch command { case nvidiaSmiCmd: - collector.cmdArgs = []string{"-l", nvidiaSmiInterval, + collector.cmdArgs = []string{ + "-l", nvidiaSmiInterval, "--query-gpu=index,name,temperature.gpu,memory.used,memory.total,utilization.gpu,power.draw", - "--format=csv,noheader,nounits"} + "--format=csv,noheader,nounits", + } collector.parse = gm.parseNvidiaData go collector.start() case tegraStatsCmd: diff --git a/beszel/internal/agent/gpu_test.go b/beszel/internal/agent/gpu_test.go index b490617..a44943d 100644 --- a/beszel/internal/agent/gpu_test.go +++ b/beszel/internal/agent/gpu_test.go @@ -251,14 +251,13 @@ func TestParseJetsonData(t *testing.T) { tests := []struct { name string input string - gm *GPUManager wantMetrics *system.GPUData }{ { name: "valid data", input: "11-14-2024 22:54:33 RAM 4300/30698MB GR3D_FREQ 45% tj@52.468C VDD_GPU_SOC 2171mW", wantMetrics: &system.GPUData{ - Name: "Jetson", + Name: "GPU", MemoryUsed: 4300.0, MemoryTotal: 30698.0, Usage: 45.0, @@ -271,7 +270,7 @@ func TestParseJetsonData(t *testing.T) { name: "more valid data", input: "11-15-2024 08:38:09 RAM 6185/7620MB (lfb 8x2MB) SWAP 851/3810MB (cached 1MB) CPU [15%@729,11%@729,14%@729,13%@729,11%@729,8%@729] EMC_FREQ 43%@2133 GR3D_FREQ 63%@[621] NVDEC off NVJPG off NVJPG1 off VIC off OFA off APE 200 cpu@53.968C soc2@52.437C soc0@50.75C gpu@53.343C tj@53.968C soc1@51.656C VDD_IN 12479mW/12479mW VDD_CPU_GPU_CV 4667mW/4667mW VDD_SOC 2817mW/2817mW", wantMetrics: &system.GPUData{ - Name: "Jetson", + Name: "GPU", MemoryUsed: 6185.0, MemoryTotal: 7620.0, Usage: 63.0, @@ -284,7 +283,7 @@ func TestParseJetsonData(t *testing.T) { name: "missing temperature", input: "11-14-2024 22:54:33 RAM 4300/30698MB GR3D_FREQ 45% VDD_GPU_SOC 2171mW", wantMetrics: &system.GPUData{ - Name: "Jetson", + Name: "GPU", MemoryUsed: 4300.0, MemoryTotal: 30698.0, Usage: 45.0, @@ -292,32 +291,18 @@ func TestParseJetsonData(t *testing.T) { Count: 1, }, }, - { - name: "no gpu defined by nvidia-smi", - input: "11-14-2024 22:54:33 RAM 4300/30698MB GR3D_FREQ 45% VDD_GPU_SOC 2171mW", - gm: &GPUManager{ - GpuDataMap: map[string]*system.GPUData{}, - }, - }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - if tt.gm != nil { - // should return if no gpu set by nvidia-smi - assert.Empty(t, tt.gm.GpuDataMap) - return + gm := &GPUManager{ + GpuDataMap: make(map[string]*system.GPUData), } - tt.gm = &GPUManager{ - GpuDataMap: map[string]*system.GPUData{ - "0": {Name: "Jetson"}, - }, - } - parser := tt.gm.getJetsonParser() + parser := gm.getJetsonParser() valid := parser([]byte(tt.input)) assert.Equal(t, true, valid) - got := tt.gm.GpuDataMap["0"] + got := gm.GpuDataMap["0"] require.NotNil(t, got) assert.Equal(t, tt.wantMetrics.Name, got.Name) assert.InDelta(t, tt.wantMetrics.MemoryUsed, got.MemoryUsed, 0.01) @@ -443,7 +428,7 @@ echo "test"` } return nil }, - wantNvidiaSmi: true, + wantNvidiaSmi: false, wantRocmSmi: true, wantTegrastats: true, wantErr: false,