From 6b41a98338ba3c2412a9a2d6b2d7d1d4cc498a6d Mon Sep 17 00:00:00 2001 From: henrygd Date: Fri, 21 Feb 2025 00:56:40 -0500 Subject: [PATCH] gpu: add tests and refactor to support amd on windows --- beszel/internal/agent/gpu.go | 157 +++++----- beszel/internal/agent/gpu_test.go | 476 ++++++++++++++++++++++++++++++ 2 files changed, 557 insertions(+), 76 deletions(-) create mode 100644 beszel/internal/agent/gpu_test.go diff --git a/beszel/internal/agent/gpu.go b/beszel/internal/agent/gpu.go index 7cc3c9e..8f885f9 100644 --- a/beszel/internal/agent/gpu.go +++ b/beszel/internal/agent/gpu.go @@ -17,11 +17,11 @@ import ( // GPUManager manages data collection for GPUs (either Nvidia or AMD) type GPUManager struct { + sync.Mutex nvidiaSmi bool rocmSmi bool tegrastats bool GpuDataMap map[string]*system.GPUData - mutex sync.Mutex } // RocmSmiJson represents the JSON structure of rocm-smi output @@ -38,9 +38,10 @@ type RocmSmiJson struct { // gpuCollector defines a collector for a specific GPU management utility (nvidia-smi or rocm-smi) type gpuCollector struct { - name string - cmd *exec.Cmd - parse func([]byte) bool // returns true if valid data was found + name string + cmdArgs []string + parse func([]byte) bool // returns true if valid data was found + buf []byte } var errNoValidData = fmt.Errorf("no valid GPU data found") // Error for missing data @@ -63,17 +64,20 @@ func (c *gpuCollector) start() { // collect executes the command, parses output with the assigned parser function func (c *gpuCollector) collect() error { - stdout, err := c.cmd.StdoutPipe() + cmd := exec.Command(c.name, c.cmdArgs...) + stdout, err := cmd.StdoutPipe() if err != nil { return err } - if err := c.cmd.Start(); err != nil { + if err := cmd.Start(); err != nil { return err } scanner := bufio.NewScanner(stdout) - buf := make([]byte, 0, 8*1024) - scanner.Buffer(buf, bufio.MaxScanTokenSize) + if c.buf == nil { + c.buf = make([]byte, 0, 4*1024) + } + scanner.Buffer(c.buf, bufio.MaxScanTokenSize) for scanner.Scan() { hasValidData := c.parse(scanner.Bytes()) @@ -85,7 +89,7 @@ func (c *gpuCollector) collect() error { if err := scanner.Err(); err != nil { return fmt.Errorf("scanner error: %w", err) } - return c.cmd.Wait() + return cmd.Wait() } // getJetsonParser returns a function to parse the output of tegrastats and update the GPUData map @@ -99,8 +103,8 @@ func (gm *GPUManager) getJetsonParser() func(output []byte) bool { powerPattern := regexp.MustCompile(`(GPU_SOC|CPU_GPU_CV) (\d+)mW`) return func(output []byte) bool { - gm.mutex.Lock() - defer gm.mutex.Unlock() + gm.Lock() + defer gm.Unlock() // we get gpu name from the intitial run of nvidia-smi, so return if it hasn't been initialized gpuData, ok := gm.GpuDataMap["0"] if !ok { @@ -126,7 +130,7 @@ func (gm *GPUManager) getJetsonParser() func(output []byte) bool { // Parse power usage powerMatches := powerPattern.FindStringSubmatch(data) if powerMatches != nil { - power, _ := strconv.ParseFloat(powerMatches[1], 64) + power, _ := strconv.ParseFloat(powerMatches[2], 64) gpuData.Power = power / 1000 } gpuData.Count++ @@ -136,46 +140,42 @@ func (gm *GPUManager) getJetsonParser() func(output []byte) bool { // parseNvidiaData parses the output of nvidia-smi and updates the GPUData map func (gm *GPUManager) parseNvidiaData(output []byte) bool { - fields := strings.Split(string(output), ", ") - if len(fields) < 7 { - return false - } - gm.mutex.Lock() - defer gm.mutex.Unlock() - lines := strings.Split(string(output), "\n") - for _, line := range lines { - if line != "" { - fields := strings.Split(line, ", ") - if len(fields) >= 7 { - id := fields[0] - temp, _ := strconv.ParseFloat(fields[2], 64) - memoryUsage, _ := strconv.ParseFloat(fields[3], 64) - totalMemory, _ := strconv.ParseFloat(fields[4], 64) - usage, _ := strconv.ParseFloat(fields[5], 64) - power, _ := strconv.ParseFloat(fields[6], 64) - // add gpu if not exists - if _, ok := gm.GpuDataMap[id]; !ok { - name := strings.TrimPrefix(fields[1], "NVIDIA ") - gm.GpuDataMap[id] = &system.GPUData{Name: strings.TrimSuffix(name, " Laptop GPU")} - // check if tegrastats is active - if so we will only use nvidia-smi to get gpu name - // - nvidia-smi does not provide metrics for tegra / jetson devices - // this will end the nvidia-smi collector - if gm.tegrastats { - return false - } - } - // update gpu data - gpu := gm.GpuDataMap[id] - gpu.Temperature = temp - gpu.MemoryUsed = memoryUsage / 1.024 - gpu.MemoryTotal = totalMemory / 1.024 - gpu.Usage += usage - gpu.Power += power - gpu.Count++ + gm.Lock() + defer gm.Unlock() + var valid bool + for line := range strings.Lines(string(output)) { + fields := strings.Split(strings.TrimSpace(line), ", ") + if len(fields) < 7 { + continue + } + valid = true + id := fields[0] + temp, _ := strconv.ParseFloat(fields[2], 64) + memoryUsage, _ := strconv.ParseFloat(fields[3], 64) + totalMemory, _ := strconv.ParseFloat(fields[4], 64) + usage, _ := strconv.ParseFloat(fields[5], 64) + power, _ := strconv.ParseFloat(fields[6], 64) + // add gpu if not exists + if _, ok := gm.GpuDataMap[id]; !ok { + name := strings.TrimPrefix(fields[1], "NVIDIA ") + gm.GpuDataMap[id] = &system.GPUData{Name: strings.TrimSuffix(name, " Laptop GPU")} + // check if tegrastats is active - if so we will only use nvidia-smi to get gpu name + // - nvidia-smi does not provide metrics for tegra / jetson devices + // this will end the nvidia-smi collector + if gm.tegrastats { + return false } } + // update gpu data + gpu := gm.GpuDataMap[id] + gpu.Temperature = temp + gpu.MemoryUsed = memoryUsage / 1.024 + gpu.MemoryTotal = totalMemory / 1.024 + gpu.Usage += usage + gpu.Power += power + gpu.Count++ } - return true + return valid } // parseAmdData parses the output of rocm-smi and updates the GPUData map @@ -184,8 +184,8 @@ func (gm *GPUManager) parseAmdData(output []byte) bool { if err := json.Unmarshal(output, &rocmSmiInfo); err != nil || len(rocmSmiInfo) == 0 { return false } - gm.mutex.Lock() - defer gm.mutex.Unlock() + gm.Lock() + defer gm.Unlock() for _, v := range rocmSmiInfo { var power float64 if v.PowerPackage != "" { @@ -213,8 +213,8 @@ func (gm *GPUManager) parseAmdData(output []byte) bool { // sums and resets the current GPU utilization data since the last update func (gm *GPUManager) GetCurrentData() map[string]system.GPUData { - gm.mutex.Lock() - defer gm.mutex.Unlock() + gm.Lock() + defer gm.Unlock() // check for GPUs with the same name nameCounts := make(map[string]int) @@ -266,31 +266,36 @@ func (gm *GPUManager) detectGPUs() error { // startCollector starts the appropriate GPU data collector based on the command func (gm *GPUManager) startCollector(command string) { + collector := gpuCollector{ + name: command, + } switch command { case "nvidia-smi": - nvidia := gpuCollector{ - name: "nvidia-smi", - cmd: exec.Command("nvidia-smi", "-l", "4", - "--query-gpu=index,name,temperature.gpu,memory.used,memory.total,utilization.gpu,power.draw", - "--format=csv,noheader,nounits"), - parse: gm.parseNvidiaData, - } - go nvidia.start() - case "rocm-smi": - amdCollector := gpuCollector{ - name: "rocm-smi", - cmd: exec.Command("/bin/sh", "-c", - "while true; do rocm-smi --showid --showtemp --showuse --showpower --showproductname --showmeminfo vram --json; sleep 4.3; done"), - parse: gm.parseAmdData, - } - go amdCollector.start() + collector.cmdArgs = []string{"-l", "4", + "--query-gpu=index,name,temperature.gpu,memory.used,memory.total,utilization.gpu,power.draw", + "--format=csv,noheader,nounits"} + collector.parse = gm.parseNvidiaData + go collector.start() case "tegrastats": - jetsonCollector := gpuCollector{ - name: "tegrastats", - cmd: exec.Command("tegrastats", "--interval", "3000"), - parse: gm.getJetsonParser(), - } - go jetsonCollector.start() + collector.cmdArgs = []string{"--interval", "3000"} + collector.parse = gm.getJetsonParser() + go collector.start() + case "rocm-smi": + collector.cmdArgs = []string{"--showid", "--showtemp", "--showuse", "--showpower", "--showproductname", "--showmeminfo", "vram", "--json"} + collector.parse = gm.parseAmdData + go func() { + failures := 0 + for { + if err := collector.collect(); err != nil { + failures++ + if failures > 5 { + break + } + slog.Warn("Error collecting AMD GPU data", "err", err) + } + time.Sleep(4300 * time.Millisecond) + } + }() } } @@ -300,7 +305,7 @@ func NewGPUManager() (*GPUManager, error) { if err := gm.detectGPUs(); err != nil { return nil, err } - gm.GpuDataMap = make(map[string]*system.GPUData, 1) + gm.GpuDataMap = make(map[string]*system.GPUData) if gm.nvidiaSmi { gm.startCollector("nvidia-smi") diff --git a/beszel/internal/agent/gpu_test.go b/beszel/internal/agent/gpu_test.go new file mode 100644 index 0000000..01dfee6 --- /dev/null +++ b/beszel/internal/agent/gpu_test.go @@ -0,0 +1,476 @@ +package agent + +import ( + "beszel/internal/entities/system" + "os" + "path/filepath" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestParseNvidiaData(t *testing.T) { + tests := []struct { + name string + input string + wantData map[string]system.GPUData + wantValid bool + }{ + { + name: "valid multi-gpu data", + input: "0, NVIDIA GeForce RTX 3050 Ti Laptop GPU, 48, 12, 4096, 26.3, 12.73\n1, NVIDIA A100-PCIE-40GB, 38, 74, 40960, [N/A], 36.79", + wantData: map[string]system.GPUData{ + "0": { + Name: "GeForce RTX 3050 Ti", + Temperature: 48.0, + MemoryUsed: 12.0 / 1.024, + MemoryTotal: 4096.0 / 1.024, + Usage: 26.3, + Power: 12.73, + Count: 1, + }, + "1": { + Name: "A100-PCIE-40GB", + Temperature: 38.0, + MemoryUsed: 74.0 / 1.024, + MemoryTotal: 40960.0 / 1.024, + Usage: 0.0, + Power: 36.79, + Count: 1, + }, + }, + wantValid: true, + }, + { + name: "empty input", + input: "", + wantData: map[string]system.GPUData{}, + wantValid: false, + }, + { + name: "malformed data", + input: "bad, data, here", + wantData: map[string]system.GPUData{}, + wantValid: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + gm := &GPUManager{ + GpuDataMap: make(map[string]*system.GPUData), + } + valid := gm.parseNvidiaData([]byte(tt.input)) + assert.Equal(t, tt.wantValid, valid) + + if tt.wantValid { + for id, want := range tt.wantData { + got := gm.GpuDataMap[id] + require.NotNil(t, got) + assert.Equal(t, want.Name, got.Name) + assert.InDelta(t, want.Temperature, got.Temperature, 0.01) + assert.InDelta(t, want.MemoryUsed, got.MemoryUsed, 0.01) + assert.InDelta(t, want.MemoryTotal, got.MemoryTotal, 0.01) + assert.InDelta(t, want.Usage, got.Usage, 0.01) + assert.InDelta(t, want.Power, got.Power, 0.01) + assert.Equal(t, want.Count, got.Count) + } + } + }) + } +} + +func TestParseAmdData(t *testing.T) { + tests := []struct { + name string + input string + wantData map[string]system.GPUData + wantValid bool + }{ + { + name: "valid single gpu data", + input: `{ + "card0": { + "GUID": "34756", + "Temperature (Sensor edge) (C)": "47.0", + "Current Socket Graphics Package Power (W)": "9.215", + "GPU use (%)": "0", + "VRAM Total Memory (B)": "536870912", + "VRAM Total Used Memory (B)": "482263040", + "Card Series": "Rembrandt [Radeon 680M]" + } + }`, + wantData: map[string]system.GPUData{ + "34756": { + Name: "Rembrandt [Radeon 680M]", + Temperature: 47.0, + MemoryUsed: 482263040.0 / (1024 * 1024), + MemoryTotal: 536870912.0 / (1024 * 1024), + Usage: 0.0, + Power: 9.215, + Count: 1, + }, + }, + wantValid: true, + }, + { + name: "invalid json", + input: "{bad json", + wantData: map[string]system.GPUData{}, + wantValid: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + gm := &GPUManager{ + GpuDataMap: make(map[string]*system.GPUData), + } + valid := gm.parseAmdData([]byte(tt.input)) + assert.Equal(t, tt.wantValid, valid) + + if tt.wantValid { + for id, want := range tt.wantData { + got := gm.GpuDataMap[id] + require.NotNil(t, got) + assert.Equal(t, want.Name, got.Name) + assert.InDelta(t, want.Temperature, got.Temperature, 0.01) + assert.InDelta(t, want.MemoryUsed, got.MemoryUsed, 0.01) + assert.InDelta(t, want.MemoryTotal, got.MemoryTotal, 0.01) + assert.InDelta(t, want.Usage, got.Usage, 0.01) + assert.InDelta(t, want.Power, got.Power, 0.01) + assert.Equal(t, want.Count, got.Count) + } + } + }) + } +} + +func TestParseJetsonData(t *testing.T) { + tests := []struct { + name string + input string + gm *GPUManager + wantMetrics *system.GPUData + }{ + { + name: "valid data", + input: "RAM 4300/30698MB GR3D_FREQ 45% tj@52.468C VDD_GPU_SOC 2171mW", + wantMetrics: &system.GPUData{ + Name: "Jetson", + MemoryUsed: 4300.0, + MemoryTotal: 30698.0, + Usage: 45.0, + Temperature: 52.468, + Power: 2.171, + Count: 1, + }, + }, + { + name: "missing temperature", + input: "RAM 4300/30698MB GR3D_FREQ 45% VDD_GPU_SOC 2171mW", + wantMetrics: &system.GPUData{ + Name: "Jetson", + MemoryUsed: 4300.0, + MemoryTotal: 30698.0, + Usage: 45.0, + Power: 2.171, + Count: 1, + }, + }, + { + name: "no gpu defined by nvidia-smi", + input: "RAM 4300/30698MB GR3D_FREQ 45% VDD_GPU_SOC 2171mW", + gm: &GPUManager{ + GpuDataMap: map[string]*system.GPUData{}, + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if tt.gm != nil { + // should return if no gpu set by nvidia-smi + assert.Empty(t, tt.gm.GpuDataMap) + return + } + tt.gm = &GPUManager{ + GpuDataMap: map[string]*system.GPUData{ + "0": {Name: "Jetson"}, + }, + } + parser := tt.gm.getJetsonParser() + valid := parser([]byte(tt.input)) + assert.Equal(t, true, valid) + + got := tt.gm.GpuDataMap["0"] + require.NotNil(t, got) + assert.Equal(t, tt.wantMetrics.Name, got.Name) + assert.InDelta(t, tt.wantMetrics.MemoryUsed, got.MemoryUsed, 0.01) + assert.InDelta(t, tt.wantMetrics.MemoryTotal, got.MemoryTotal, 0.01) + assert.InDelta(t, tt.wantMetrics.Usage, got.Usage, 0.01) + if tt.wantMetrics.Temperature > 0 { + assert.InDelta(t, tt.wantMetrics.Temperature, got.Temperature, 0.01) + } + assert.InDelta(t, tt.wantMetrics.Power, got.Power, 0.01) + assert.Equal(t, tt.wantMetrics.Count, got.Count) + }) + } +} + +func TestGetCurrentData(t *testing.T) { + gm := &GPUManager{ + GpuDataMap: map[string]*system.GPUData{ + "0": { + Name: "GPU1", + Temperature: 50, + MemoryUsed: 2048, + MemoryTotal: 4096, + Usage: 100, // 100 over 2 counts = 50 avg + Power: 200, // 200 over 2 counts = 100 avg + Count: 2, + }, + "1": { + Name: "GPU1", + Temperature: 60, + MemoryUsed: 3072, + MemoryTotal: 8192, + Usage: 30, + Power: 60, + Count: 1, + }, + }, + } + + result := gm.GetCurrentData() + + // Verify name disambiguation + assert.Equal(t, "GPU1 0", result["0"].Name) + assert.Equal(t, "GPU1 1", result["1"].Name) + + // Check averaged values + assert.InDelta(t, 50.0, result["0"].Usage, 0.01) + assert.InDelta(t, 100.0, result["0"].Power, 0.01) + assert.InDelta(t, 30.0, result["1"].Usage, 0.01) + assert.InDelta(t, 60.0, result["1"].Power, 0.01) + + // Verify reset counts + assert.Equal(t, float64(1), gm.GpuDataMap["0"].Count) + assert.Equal(t, float64(1), gm.GpuDataMap["1"].Count) +} + +func TestDetectGPUs(t *testing.T) { + // Save original PATH + origPath := os.Getenv("PATH") + defer os.Setenv("PATH", origPath) + + // Set up temp dir with the commands + tempDir := t.TempDir() + os.Setenv("PATH", tempDir) + + tests := []struct { + name string + setupCommands func() error + wantNvidiaSmi bool + wantRocmSmi bool + wantTegrastats bool + wantErr bool + }{ + { + name: "nvidia-smi not available", + setupCommands: func() error { + return nil + }, + wantNvidiaSmi: false, + wantRocmSmi: false, + wantTegrastats: false, + wantErr: true, + }, + { + name: "nvidia-smi available", + setupCommands: func() error { + path := filepath.Join(tempDir, "nvidia-smi") + script := `#!/bin/sh +echo "test"` + if err := os.WriteFile(path, []byte(script), 0755); err != nil { + return err + } + return nil + }, + wantNvidiaSmi: true, + wantTegrastats: false, + wantRocmSmi: false, + wantErr: false, + }, + { + name: "rocm-smi available", + setupCommands: func() error { + path := filepath.Join(tempDir, "rocm-smi") + script := `#!/bin/sh +echo "test"` + if err := os.WriteFile(path, []byte(script), 0755); err != nil { + return err + } + return nil + }, + wantNvidiaSmi: true, + wantRocmSmi: true, + wantTegrastats: false, + wantErr: false, + }, + { + name: "tegrastats available", + setupCommands: func() error { + path := filepath.Join(tempDir, "tegrastats") + script := `#!/bin/sh +echo "test"` + if err := os.WriteFile(path, []byte(script), 0755); err != nil { + return err + } + return nil + }, + wantNvidiaSmi: true, + wantRocmSmi: true, + wantTegrastats: true, + wantErr: false, + }, + { + name: "no gpu tools available", + setupCommands: func() error { + os.Setenv("PATH", "") + return nil + }, + wantErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + + if err := tt.setupCommands(); err != nil { + t.Fatal(err) + } + + gm := &GPUManager{} + err := gm.detectGPUs() + + t.Logf("nvidiaSmi: %v, rocmSmi: %v, tegrastats: %v", gm.nvidiaSmi, gm.rocmSmi, gm.tegrastats) + + if tt.wantErr { + assert.Error(t, err) + return + } + + assert.NoError(t, err) + assert.Equal(t, tt.wantNvidiaSmi, gm.nvidiaSmi) + assert.Equal(t, tt.wantRocmSmi, gm.rocmSmi) + assert.Equal(t, tt.wantTegrastats, gm.tegrastats) + }) + } +} + +func TestStartCollector(t *testing.T) { + // Save original PATH + origPath := os.Getenv("PATH") + defer os.Setenv("PATH", origPath) + + // Set up temp dir with the commands + dir := t.TempDir() + os.Setenv("PATH", dir) + + tests := []struct { + name string + command string + setup func(t *testing.T) error + validate func(t *testing.T, gm *GPUManager) + gm *GPUManager + }{ + { + name: "nvidia-smi collector", + command: "nvidia-smi", + setup: func(t *testing.T) error { + path := filepath.Join(dir, "nvidia-smi") + script := `#!/bin/sh +echo "0, NVIDIA Test GPU, 50, 1024, 4096, 25, 100"` + if err := os.WriteFile(path, []byte(script), 0755); err != nil { + return err + } + return nil + }, + validate: func(t *testing.T, gm *GPUManager) { + gpu, exists := gm.GpuDataMap["0"] + assert.True(t, exists) + if exists { + assert.Equal(t, "Test GPU", gpu.Name) + assert.Equal(t, 50.0, gpu.Temperature) + + } + }, + }, + { + name: "rocm-smi collector", + command: "rocm-smi", + setup: func(t *testing.T) error { + path := filepath.Join(dir, "rocm-smi") + script := `#!/bin/sh +echo '{"card0": {"Temperature (Sensor edge) (C)": "49.0", "Current Socket Graphics Package Power (W)": "28.159", "GPU use (%)": "0", "VRAM Total Memory (B)": "536870912", "VRAM Total Used Memory (B)": "445550592", "Card Series": "Rembrandt [Radeon 680M]", "Card Model": "0x1681", "Card Vendor": "Advanced Micro Devices, Inc. [AMD/ATI]", "Card SKU": "REMBRANDT", "Subsystem ID": "0x8a22", "Device Rev": "0xc8", "Node ID": "1", "GUID": "34756", "GFX Version": "gfx1035"}}'` + if err := os.WriteFile(path, []byte(script), 0755); err != nil { + return err + } + return nil + }, + validate: func(t *testing.T, gm *GPUManager) { + gpu, exists := gm.GpuDataMap["34756"] + assert.True(t, exists) + if exists { + assert.Equal(t, "Rembrandt [Radeon 680M]", gpu.Name) + assert.InDelta(t, 49.0, gpu.Temperature, 0.01) + assert.InDelta(t, 28.159, gpu.Power, 0.01) + } + }, + }, + { + name: "tegrastats collector", + command: "tegrastats", + setup: func(t *testing.T) error { + path := filepath.Join(dir, "tegrastats") + script := `#!/bin/sh +echo "RAM 1024/4096MB GR3D_FREQ 80% tj@70C VDD_GPU_SOC 1000mW"` + if err := os.WriteFile(path, []byte(script), 0755); err != nil { + return err + } + return nil + }, + validate: func(t *testing.T, gm *GPUManager) { + gpu, exists := gm.GpuDataMap["0"] + assert.True(t, exists) + if exists { + assert.InDelta(t, 70.0, gpu.Temperature, 0.1) + } + }, + gm: &GPUManager{ + GpuDataMap: map[string]*system.GPUData{ + "0": {}, + }, + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if err := tt.setup(t); err != nil { + t.Fatal(err) + } + if tt.gm == nil { + tt.gm = &GPUManager{ + GpuDataMap: make(map[string]*system.GPUData), + } + } + tt.gm.startCollector(tt.command) + time.Sleep(50 * time.Millisecond) // Give collector time to run + tt.validate(t, tt.gm) + }) + } +}