From 521be05bc1496c9eb22dc817cd99e3d7d0bf8e33 Mon Sep 17 00:00:00 2001 From: henrygd Date: Thu, 13 Mar 2025 21:32:53 -0400 Subject: [PATCH] gpu.go refactoring and jetson fixes - Fixed usage and power values - Added new test cases - Moved some variables to constants --- beszel/internal/agent/gpu.go | 62 +++++--- beszel/internal/agent/gpu_test.go | 225 +++++++++++++++++++++++++++++- 2 files changed, 264 insertions(+), 23 deletions(-) diff --git a/beszel/internal/agent/gpu.go b/beszel/internal/agent/gpu.go index 71e3a66..713a911 100644 --- a/beszel/internal/agent/gpu.go +++ b/beszel/internal/agent/gpu.go @@ -16,6 +16,28 @@ import ( "golang.org/x/exp/slog" ) +const ( + // Commands + nvidiaSmiCmd = "nvidia-smi" + rocmSmiCmd = "rocm-smi" + tegraStatsCmd = "tegrastats" + + // Polling intervals + nvidiaSmiInterval = "4" // in seconds + tegraStatsInterval = "3700" // in milliseconds + rocmSmiInterval = 4300 * time.Millisecond + + // Command retry and timeout constants + retryWaitTime = 5 * time.Second + maxFailureRetries = 5 + + cmdBufferSize = 10 * 1024 + + // Unit Conversions + mebibytesInAMegabyte = 1.024 // nvidia-smi reports memory in MiB + milliwattsInAWatt = 1000.0 // tegrastats reports power in mW +) + // GPUManager manages data collection for GPUs (either Nvidia or AMD) type GPUManager struct { sync.Mutex @@ -57,7 +79,7 @@ func (c *gpuCollector) start() { break } slog.Warn(c.name+" failed, restarting", "err", err) - time.Sleep(time.Second * 5) + time.Sleep(retryWaitTime) continue } } @@ -76,7 +98,7 @@ func (c *gpuCollector) collect() error { scanner := bufio.NewScanner(stdout) if c.buf == nil { - c.buf = make([]byte, 0, 10*1024) + c.buf = make([]byte, 0, cmdBufferSize) } scanner.Buffer(c.buf, bufio.MaxScanTokenSize) @@ -120,7 +142,8 @@ func (gm *GPUManager) getJetsonParser() func(output []byte) bool { // Parse GR3D (GPU) usage gr3dMatches := gr3dPattern.FindSubmatch(output) if gr3dMatches != nil { - gpuData.Usage, _ = strconv.ParseFloat(string(gr3dMatches[1]), 64) + gr3dUsage, _ := strconv.ParseFloat(string(gr3dMatches[1]), 64) + gpuData.Usage += gr3dUsage } // Parse temperature tempMatches := tempPattern.FindSubmatch(output) @@ -131,7 +154,7 @@ func (gm *GPUManager) getJetsonParser() func(output []byte) bool { powerMatches := powerPattern.FindSubmatch(output) if powerMatches != nil { power, _ := strconv.ParseFloat(string(powerMatches[2]), 64) - gpuData.Power = power / 1000 + gpuData.Power += power / milliwattsInAWatt } gpuData.Count++ return true @@ -171,8 +194,8 @@ func (gm *GPUManager) parseNvidiaData(output []byte) bool { // update gpu data gpu := gm.GpuDataMap[id] gpu.Temperature = temp - gpu.MemoryUsed = memoryUsage / 1.024 - gpu.MemoryTotal = totalMemory / 1.024 + gpu.MemoryUsed = memoryUsage / mebibytesInAMegabyte + gpu.MemoryTotal = totalMemory / mebibytesInAMegabyte gpu.Usage += usage gpu.Power += power gpu.Count++ @@ -243,6 +266,7 @@ func (gm *GPUManager) GetCurrentData() map[string]system.GPUData { } gpuData[id] = gpuCopy } + slog.Debug("GPU", "data", gpuData) return gpuData } @@ -251,13 +275,13 @@ func (gm *GPUManager) GetCurrentData() map[string]system.GPUData { // tools are found. If none of the tools are found, it returns an error indicating that no GPU // management tools are available. 
func (gm *GPUManager) detectGPUs() error { - if _, err := exec.LookPath("nvidia-smi"); err == nil { + if _, err := exec.LookPath(nvidiaSmiCmd); err == nil { gm.nvidiaSmi = true } - if _, err := exec.LookPath("rocm-smi"); err == nil { + if _, err := exec.LookPath(rocmSmiCmd); err == nil { gm.rocmSmi = true } - if _, err := exec.LookPath("tegrastats"); err == nil { + if _, err := exec.LookPath(tegraStatsCmd); err == nil { gm.tegrastats = true } if gm.nvidiaSmi || gm.rocmSmi || gm.tegrastats { @@ -272,17 +296,17 @@ func (gm *GPUManager) startCollector(command string) { name: command, } switch command { - case "nvidia-smi": - collector.cmdArgs = []string{"-l", "4", + case nvidiaSmiCmd: + collector.cmdArgs = []string{"-l", nvidiaSmiInterval, "--query-gpu=index,name,temperature.gpu,memory.used,memory.total,utilization.gpu,power.draw", "--format=csv,noheader,nounits"} collector.parse = gm.parseNvidiaData go collector.start() - case "tegrastats": - collector.cmdArgs = []string{"--interval", "3000"} + case tegraStatsCmd: + collector.cmdArgs = []string{"--interval", tegraStatsInterval} collector.parse = gm.getJetsonParser() go collector.start() - case "rocm-smi": + case rocmSmiCmd: collector.cmdArgs = []string{"--showid", "--showtemp", "--showuse", "--showpower", "--showproductname", "--showmeminfo", "vram", "--json"} collector.parse = gm.parseAmdData go func() { @@ -290,12 +314,12 @@ func (gm *GPUManager) startCollector(command string) { for { if err := collector.collect(); err != nil { failures++ - if failures > 5 { + if failures > maxFailureRetries { break } slog.Warn("Error collecting AMD GPU data", "err", err) } - time.Sleep(4300 * time.Millisecond) + time.Sleep(rocmSmiInterval) } }() } @@ -310,13 +334,13 @@ func NewGPUManager() (*GPUManager, error) { gm.GpuDataMap = make(map[string]*system.GPUData) if gm.nvidiaSmi { - gm.startCollector("nvidia-smi") + gm.startCollector(nvidiaSmiCmd) } if gm.rocmSmi { - gm.startCollector("rocm-smi") + gm.startCollector(rocmSmiCmd) } if gm.tegrastats { - gm.startCollector("tegrastats") + gm.startCollector(tegraStatsCmd) } return &gm, nil diff --git a/beszel/internal/agent/gpu_test.go b/beszel/internal/agent/gpu_test.go index d80aa3f..b490617 100644 --- a/beszel/internal/agent/gpu_test.go +++ b/beszel/internal/agent/gpu_test.go @@ -1,3 +1,6 @@ +//go:build testing +// +build testing + package agent import ( @@ -43,6 +46,52 @@ func TestParseNvidiaData(t *testing.T) { }, wantValid: true, }, + { + name: "more valid multi-gpu data", + input: `0, NVIDIA A10, 45, 19676, 23028, 0, 58.98 +1, NVIDIA A10, 45, 19638, 23028, 0, 62.35 +2, NVIDIA A10, 44, 21700, 23028, 0, 59.57 +3, NVIDIA A10, 45, 18222, 23028, 0, 61.76`, + wantData: map[string]system.GPUData{ + "0": { + Name: "A10", + Temperature: 45.0, + MemoryUsed: 19676.0 / 1.024, + MemoryTotal: 23028.0 / 1.024, + Usage: 0.0, + Power: 58.98, + Count: 1, + }, + "1": { + Name: "A10", + Temperature: 45.0, + MemoryUsed: 19638.0 / 1.024, + MemoryTotal: 23028.0 / 1.024, + Usage: 0.0, + Power: 62.35, + Count: 1, + }, + "2": { + Name: "A10", + Temperature: 44.0, + MemoryUsed: 21700.0 / 1.024, + MemoryTotal: 23028.0 / 1.024, + Usage: 0.0, + Power: 59.57, + Count: 1, + }, + "3": { + Name: "A10", + Temperature: 45.0, + MemoryUsed: 18222.0 / 1.024, + MemoryTotal: 23028.0 / 1.024, + Usage: 0.0, + Power: 61.76, + Count: 1, + }, + }, + wantValid: true, + }, { name: "empty input", input: "", @@ -207,7 +256,7 @@ func TestParseJetsonData(t *testing.T) { }{ { name: "valid data", - input: "RAM 4300/30698MB GR3D_FREQ 45% tj@52.468C VDD_GPU_SOC 
2171mW", + input: "11-14-2024 22:54:33 RAM 4300/30698MB GR3D_FREQ 45% tj@52.468C VDD_GPU_SOC 2171mW", wantMetrics: &system.GPUData{ Name: "Jetson", MemoryUsed: 4300.0, @@ -218,9 +267,22 @@ func TestParseJetsonData(t *testing.T) { Count: 1, }, }, + { + name: "more valid data", + input: "11-15-2024 08:38:09 RAM 6185/7620MB (lfb 8x2MB) SWAP 851/3810MB (cached 1MB) CPU [15%@729,11%@729,14%@729,13%@729,11%@729,8%@729] EMC_FREQ 43%@2133 GR3D_FREQ 63%@[621] NVDEC off NVJPG off NVJPG1 off VIC off OFA off APE 200 cpu@53.968C soc2@52.437C soc0@50.75C gpu@53.343C tj@53.968C soc1@51.656C VDD_IN 12479mW/12479mW VDD_CPU_GPU_CV 4667mW/4667mW VDD_SOC 2817mW/2817mW", + wantMetrics: &system.GPUData{ + Name: "Jetson", + MemoryUsed: 6185.0, + MemoryTotal: 7620.0, + Usage: 63.0, + Temperature: 53.968, + Power: 4.667, + Count: 1, + }, + }, { name: "missing temperature", - input: "RAM 4300/30698MB GR3D_FREQ 45% VDD_GPU_SOC 2171mW", + input: "11-14-2024 22:54:33 RAM 4300/30698MB GR3D_FREQ 45% VDD_GPU_SOC 2171mW", wantMetrics: &system.GPUData{ Name: "Jetson", MemoryUsed: 4300.0, @@ -232,7 +294,7 @@ func TestParseJetsonData(t *testing.T) { }, { name: "no gpu defined by nvidia-smi", - input: "RAM 4300/30698MB GR3D_FREQ 45% VDD_GPU_SOC 2171mW", + input: "11-14-2024 22:54:33 RAM 4300/30698MB GR3D_FREQ 45% VDD_GPU_SOC 2171mW", gm: &GPUManager{ GpuDataMap: map[string]*system.GPUData{}, }, @@ -486,7 +548,7 @@ echo '{"card0": {"Temperature (Sensor edge) (C)": "49.0", "Current Socket Graphi setup: func(t *testing.T) error { path := filepath.Join(dir, "tegrastats") script := `#!/bin/sh -echo "RAM 1024/4096MB GR3D_FREQ 80% tj@70C VDD_GPU_SOC 1000mW"` +echo "11-14-2024 22:54:33 RAM 1024/4096MB GR3D_FREQ 80% tj@70C VDD_GPU_SOC 1000mW"` if err := os.WriteFile(path, []byte(script), 0755); err != nil { return err } @@ -523,3 +585,158 @@ echo "RAM 1024/4096MB GR3D_FREQ 80% tj@70C VDD_GPU_SOC 1000mW"` }) } } + +// TestAccumulationTableDriven tests the accumulation behavior for all three GPU types +func TestAccumulation(t *testing.T) { + type expectedGPUValues struct { + temperature float64 + memoryUsed float64 + memoryTotal float64 + usage float64 + power float64 + count float64 + avgUsage float64 + avgPower float64 + } + + tests := []struct { + name string + initialGPUData map[string]*system.GPUData + dataSamples [][]byte + parser func(*GPUManager) func([]byte) bool + expectedValues map[string]expectedGPUValues + }{ + { + name: "Jetson GPU accumulation", + initialGPUData: map[string]*system.GPUData{ + "0": { + Name: "Jetson", + Temperature: 0, + Usage: 0, + Power: 0, + Count: 0, + }, + }, + dataSamples: [][]byte{ + []byte("11-14-2024 22:54:33 RAM 1024/4096MB GR3D_FREQ 30% tj@50.5C VDD_GPU_SOC 1000mW"), + []byte("11-14-2024 22:54:33 RAM 1024/4096MB GR3D_FREQ 40% tj@60.5C VDD_GPU_SOC 1200mW"), + []byte("11-14-2024 22:54:33 RAM 1024/4096MB GR3D_FREQ 50% tj@70.5C VDD_GPU_SOC 1400mW"), + }, + parser: func(gm *GPUManager) func([]byte) bool { + return gm.getJetsonParser() + }, + expectedValues: map[string]expectedGPUValues{ + "0": { + temperature: 70.5, // Last value + memoryUsed: 1024, // Last value + memoryTotal: 4096, // Last value + usage: 120.0, // Accumulated: 30 + 40 + 50 + power: 3.6, // Accumulated: 1.0 + 1.2 + 1.4 + count: 3, + avgUsage: 40.0, // 120 / 3 + avgPower: 1.2, // 3.6 / 3 + }, + }, + }, + { + name: "NVIDIA GPU accumulation", + initialGPUData: map[string]*system.GPUData{ + // NVIDIA parser will create the GPU data entries + }, + dataSamples: [][]byte{ + []byte("0, NVIDIA GeForce RTX 3080, 50, 5000, 10000, 30, 200"), + 
[]byte("0, NVIDIA GeForce RTX 3080, 60, 6000, 10000, 40, 250"), + []byte("0, NVIDIA GeForce RTX 3080, 70, 7000, 10000, 50, 300"), + }, + parser: func(gm *GPUManager) func([]byte) bool { + return gm.parseNvidiaData + }, + expectedValues: map[string]expectedGPUValues{ + "0": { + temperature: 70.0, // Last value + memoryUsed: 7000.0 / 1.024, // Last value + memoryTotal: 10000.0 / 1.024, // Last value + usage: 120.0, // Accumulated: 30 + 40 + 50 + power: 750.0, // Accumulated: 200 + 250 + 300 + count: 3, + avgUsage: 40.0, // 120 / 3 + avgPower: 250.0, // 750 / 3 + }, + }, + }, + { + name: "AMD GPU accumulation", + initialGPUData: map[string]*system.GPUData{ + // AMD parser will create the GPU data entries + }, + dataSamples: [][]byte{ + []byte(`{"card0": {"GUID": "34756", "Temperature (Sensor edge) (C)": "50.0", "Current Socket Graphics Package Power (W)": "100.0", "GPU use (%)": "30", "VRAM Total Memory (B)": "10737418240", "VRAM Total Used Memory (B)": "1073741824", "Card Series": "Radeon RX 6800"}}`), + []byte(`{"card0": {"GUID": "34756", "Temperature (Sensor edge) (C)": "60.0", "Current Socket Graphics Package Power (W)": "150.0", "GPU use (%)": "40", "VRAM Total Memory (B)": "10737418240", "VRAM Total Used Memory (B)": "2147483648", "Card Series": "Radeon RX 6800"}}`), + []byte(`{"card0": {"GUID": "34756", "Temperature (Sensor edge) (C)": "70.0", "Current Socket Graphics Package Power (W)": "200.0", "GPU use (%)": "50", "VRAM Total Memory (B)": "10737418240", "VRAM Total Used Memory (B)": "3221225472", "Card Series": "Radeon RX 6800"}}`), + }, + parser: func(gm *GPUManager) func([]byte) bool { + return gm.parseAmdData + }, + expectedValues: map[string]expectedGPUValues{ + "34756": { + temperature: 70.0, // Last value + memoryUsed: 3221225472.0 / (1024 * 1024), // Last value + memoryTotal: 10737418240.0 / (1024 * 1024), // Last value + usage: 120.0, // Accumulated: 30 + 40 + 50 + power: 450.0, // Accumulated: 100 + 150 + 200 + count: 3, + avgUsage: 40.0, // 120 / 3 + avgPower: 150.0, // 450 / 3 + }, + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Create a new GPUManager for each test + gm := &GPUManager{ + GpuDataMap: tt.initialGPUData, + } + + // Get the parser function + parser := tt.parser(gm) + + // Process each data sample + for i, sample := range tt.dataSamples { + valid := parser(sample) + assert.True(t, valid, "Sample %d should be valid", i) + } + + // Check accumulated values + for id, expected := range tt.expectedValues { + gpu, exists := gm.GpuDataMap[id] + assert.True(t, exists, "GPU with ID %s should exist", id) + if !exists { + continue + } + + assert.InDelta(t, expected.temperature, gpu.Temperature, 0.01, "Temperature should match") + assert.InDelta(t, expected.memoryUsed, gpu.MemoryUsed, 0.01, "Memory used should match") + assert.InDelta(t, expected.memoryTotal, gpu.MemoryTotal, 0.01, "Memory total should match") + assert.InDelta(t, expected.usage, gpu.Usage, 0.01, "Usage should match") + assert.InDelta(t, expected.power, gpu.Power, 0.01, "Power should match") + assert.Equal(t, expected.count, gpu.Count, "Count should match") + } + + // Verify average calculation in GetCurrentData + result := gm.GetCurrentData() + for id, expected := range tt.expectedValues { + gpu, exists := result[id] + assert.True(t, exists, "GPU with ID %s should exist in GetCurrentData result", id) + if !exists { + continue + } + + assert.InDelta(t, expected.temperature, gpu.Temperature, 0.01, "Temperature in GetCurrentData should match") + assert.InDelta(t, 
expected.avgUsage, gpu.Usage, 0.01, "Average usage in GetCurrentData should match") + assert.InDelta(t, expected.avgPower, gpu.Power, 0.01, "Average power in GetCurrentData should match") + } + }) + } +}
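
Illustrative sketch (not part of the patch): the standalone program below replays the Jetson samples from TestAccumulation through a parse-accumulate-average pass to make the fixed usage/power handling concrete — each sample adds to Usage and Power and bumps Count, and the reported values are the sums divided by Count, while Temperature keeps the last reading. The regexes and type/function names here are assumptions for illustration only; they are not copied from gpu.go.

package main

import (
	"fmt"
	"regexp"
	"strconv"
)

// Patterns reconstructed from the tegrastats lines used in the tests; gpu.go's
// actual gr3dPattern/tempPattern/powerPattern may differ.
var (
	gr3dPattern  = regexp.MustCompile(`GR3D_FREQ (\d+)%`)
	tempPattern  = regexp.MustCompile(`tj@(\d+\.?\d*)C`)
	powerPattern = regexp.MustCompile(`(VDD_GPU_SOC|VDD_CPU_GPU_CV) (\d+)mW`)
)

type gpuData struct {
	Usage, Power, Temperature, Count float64
}

// accumulate mirrors the += behavior added in getJetsonParser: usage and power
// grow with every sample, temperature is overwritten with the latest reading.
func (g *gpuData) accumulate(line string) {
	if m := gr3dPattern.FindStringSubmatch(line); m != nil {
		v, _ := strconv.ParseFloat(m[1], 64)
		g.Usage += v
	}
	if m := tempPattern.FindStringSubmatch(line); m != nil {
		g.Temperature, _ = strconv.ParseFloat(m[1], 64)
	}
	if m := powerPattern.FindStringSubmatch(line); m != nil {
		mw, _ := strconv.ParseFloat(m[2], 64)
		g.Power += mw / 1000.0 // milliwattsInAWatt
	}
	g.Count++
}

func main() {
	samples := []string{
		"11-14-2024 22:54:33 RAM 1024/4096MB GR3D_FREQ 30% tj@50.5C VDD_GPU_SOC 1000mW",
		"11-14-2024 22:54:33 RAM 1024/4096MB GR3D_FREQ 40% tj@60.5C VDD_GPU_SOC 1200mW",
		"11-14-2024 22:54:33 RAM 1024/4096MB GR3D_FREQ 50% tj@70.5C VDD_GPU_SOC 1400mW",
	}
	var g gpuData
	for _, s := range samples {
		g.accumulate(s)
	}
	// GetCurrentData reports per-interval averages, as asserted by the test.
	fmt.Printf("usage=%.1f%% power=%.1fW temp=%.1fC\n",
		g.Usage/g.Count, g.Power/g.Count, g.Temperature)
	// Output: usage=40.0% power=1.2W temp=70.5C
}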
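
Because gpu_test.go now carries the //go:build testing constraint, the new tests only compile when that tag is set, for example: go test -tags testing ./internal/agent/... run from the beszel module root (exact path assumed).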