diff --git a/beszel/internal/agent/agent.go b/beszel/internal/agent/agent.go index 6117263..deb4c1a 100644 --- a/beszel/internal/agent/agent.go +++ b/beszel/internal/agent/agent.go @@ -24,6 +24,7 @@ type Agent struct { sensorsContext context.Context // Sensors context to override sys location sensorsWhitelist map[string]struct{} // List of sensors to monitor systemInfo system.Info // Host system info + gpuManager *GPUManager // Manages GPU data } func NewAgent() *Agent { @@ -74,6 +75,13 @@ func (a *Agent) Run(pubKey []byte, addr string) { a.initializeNetIoStats() a.dockerManager = newDockerManager() + // initialize GPU manager + if gm, err := NewGPUManager(); err != nil { + slog.Error("GPU manager", "err", err) + } else { + a.gpuManager = gm + } + // if debugging, print stats if a.debug { slog.Debug("Stats", "data", a.gatherStats()) diff --git a/beszel/internal/agent/gpu.go b/beszel/internal/agent/gpu.go new file mode 100644 index 0000000..ae59272 --- /dev/null +++ b/beszel/internal/agent/gpu.go @@ -0,0 +1,234 @@ +package agent + +import ( + "beszel/internal/entities/system" + "bufio" + "encoding/json" + "fmt" + "os/exec" + "strconv" + "strings" + "sync" + "time" + + "golang.org/x/exp/slog" +) + +type GPUManager struct { + nvidiaSmi bool + rocmSmi bool + GpuDataMap map[string]*system.GPUData + mutex sync.Mutex +} + +type RocmSmiJson struct { + ID string `json:"Device ID"` + Name string `json:"Card series"` + Temperature string `json:"Temperature (Sensor edge) (C)"` + MemoryUsed string `json:"VRAM Total Used Memory (B)"` + MemoryTotal string `json:"VRAM Total Memory (B)"` + Usage string `json:"GPU use (%)"` + Power string `json:"Current Socket Graphics Package Power (W)"` +} + +// startNvidiaCollector oversees collectNvidiaStats and restarts nvidia-smi if it fails +func (gm *GPUManager) startNvidiaCollector() error { + for { + if err := gm.collectNvidiaStats(); err != nil { + slog.Warn("Restarting nvidia-smi", "err", err) + time.Sleep(time.Second) // Wait before retrying + continue + } + } +} + +// collectNvidiaStats runs nvidia-smi in a loop and passes the output to parseNvidiaData +func (gm *GPUManager) collectNvidiaStats() error { + // Set up the command + cmd := exec.Command("nvidia-smi", "-l", "4", "--query-gpu=index,name,temperature.gpu,memory.used,memory.total,utilization.gpu,power.draw", "--format=csv,noheader,nounits") + // Set up a pipe to capture stdout + stdout, err := cmd.StdoutPipe() + if err != nil { + return err + } + // Start the command + if err := cmd.Start(); err != nil { + return err + } + // Use a scanner to read each line of output + scanner := bufio.NewScanner(stdout) + buf := make([]byte, 0, 64*1024) // 64KB buffer + scanner.Buffer(buf, bufio.MaxScanTokenSize) + for scanner.Scan() { + line := scanner.Bytes() + gm.parseNvidiaData(line) // Run your function on each new line + } + // Check for any errors encountered during scanning + if err := scanner.Err(); err != nil { + return err + } + // Wait for the command to complete + return cmd.Wait() +} + +// parseNvidiaData parses the output of nvidia-smi and updates the GPUData map +func (gm *GPUManager) parseNvidiaData(output []byte) { + gm.mutex.Lock() + defer gm.mutex.Unlock() + lines := strings.Split(string(output), "\n") + for _, line := range lines { + if line != "" { + fields := strings.Split(line, ", ") + if len(fields) >= 7 { + id := fields[0] + temp, _ := strconv.ParseFloat(fields[2], 64) + memoryUsage, _ := strconv.ParseFloat(fields[3], 64) + totalMemory, _ := strconv.ParseFloat(fields[4], 64) + usage, _ := strconv.ParseFloat(fields[5], 64) + power, _ := strconv.ParseFloat(fields[6], 64) + // add gpu if not exists + if _, ok := gm.GpuDataMap[id]; !ok { + name := strings.TrimPrefix(fields[1], "NVIDIA ") + gm.GpuDataMap[id] = &system.GPUData{Name: strings.TrimSuffix(name, " Laptop GPU")} + } + // update gpu data + gpu := gm.GpuDataMap[id] + gpu.Temperature += temp + gpu.MemoryUsed += memoryUsage + gpu.MemoryTotal += totalMemory + gpu.Usage += usage + gpu.Power += power + gpu.Count++ + } + } + } +} + +// startAmdCollector oversees collectAmdStats and restarts rocm-smi if it fails +func (gm *GPUManager) startAmdCollector() { + for { + if err := gm.collectAmdStats(); err != nil { + slog.Warn("Restarting rocm-smi", "err", err) + time.Sleep(time.Second) // Wait before retrying + continue + } else { + // break if no error (command runs but no card found) + break + } + } +} + +// collectAmdStats runs rocm-smi in a loop and passes the output to parseAmdData +func (gm *GPUManager) collectAmdStats() error { + cmd := exec.Command("/bin/sh", "-c", "while true; do rocm-smi --showid --showtemp --showuse --showpower --showproductname --showmeminfo vram --json; sleep 4.7; done") + // Set up a pipe to capture stdout + stdout, err := cmd.StdoutPipe() + if err != nil { + return err + } + // Start the command + if err := cmd.Start(); err != nil { + return err + } + // Use a scanner to read each line of output + scanner := bufio.NewScanner(stdout) + buf := make([]byte, 0, 64*1024) // 64KB buffer + scanner.Buffer(buf, bufio.MaxScanTokenSize) + for scanner.Scan() { + var rocmSmiInfo map[string]RocmSmiJson + if err := json.Unmarshal(scanner.Bytes(), &rocmSmiInfo); err != nil { + return err + } + if len(rocmSmiInfo) > 0 { + // slog.Info("rocm-smi", "data", rocmSmiInfo) + gm.parseAmdData(&rocmSmiInfo) + } else { + slog.Warn("rocm-smi returned no GPU") + return nil + } + } + if err := scanner.Err(); err != nil { + return err + } + return cmd.Wait() +} + +// parseAmdData parses the output of rocm-smi and updates the GPUData map +func (gm *GPUManager) parseAmdData(rocmSmiInfo *map[string]RocmSmiJson) { + for _, v := range *rocmSmiInfo { + temp, _ := strconv.ParseFloat(v.Temperature, 64) + memoryUsage, _ := strconv.ParseFloat(v.MemoryUsed, 64) + totalMemory, _ := strconv.ParseFloat(v.MemoryTotal, 64) + usage, _ := strconv.ParseFloat(v.Usage, 64) + power, _ := strconv.ParseFloat(v.Power, 64) + memoryUsage = bytesToMegabytes(memoryUsage) + totalMemory = bytesToMegabytes(totalMemory) + + if _, ok := gm.GpuDataMap[v.ID]; !ok { + gm.GpuDataMap[v.ID] = &system.GPUData{Name: v.Name} + } + gpu := gm.GpuDataMap[v.ID] + gpu.Temperature += temp + gpu.MemoryUsed += memoryUsage + gpu.MemoryTotal += totalMemory + gpu.Usage += usage + gpu.Power += power + gpu.Count++ + } +} + +// sums and resets the current GPU utilization data since the last update +func (gm *GPUManager) GetCurrentData() map[string]system.GPUData { + gm.mutex.Lock() + defer gm.mutex.Unlock() + // copy / reset the data + gpuData := make(map[string]system.GPUData, len(gm.GpuDataMap)) + for id, gpu := range gm.GpuDataMap { + // sum the data + gpu.Temperature = twoDecimals(gpu.Temperature / gpu.Count) + gpu.MemoryUsed = twoDecimals(gpu.MemoryUsed / gpu.Count) + gpu.MemoryTotal = twoDecimals(gpu.MemoryTotal / gpu.Count) + gpu.Usage = twoDecimals(gpu.Usage / gpu.Count) + gpu.Power = twoDecimals(gpu.Power / gpu.Count) + gpuData[id] = *gpu + // reset the data + gpu.Temperature = 0 + gpu.MemoryUsed = 0 + gpu.MemoryTotal = 0 + gpu.Usage = 0 + gpu.Power = 0 + gpu.Count = 0 + } + return gpuData +} + +// detectGPU returns the GPU brand (nvidia or amd) or an error if none is found +// todo: make sure there's actually a GPU, not just if the command exists +func (gm *GPUManager) detectGPU() error { + if err := exec.Command("nvidia-smi").Run(); err == nil { + gm.nvidiaSmi = true + } + if err := exec.Command("rocm-smi").Run(); err == nil { + gm.rocmSmi = true + } + if gm.nvidiaSmi || gm.rocmSmi { + return nil + } + return fmt.Errorf("no GPU found - install nvidia-smi or rocm-smi") +} + +// NewGPUManager returns a new GPUManager +func NewGPUManager() (*GPUManager, error) { + var gm GPUManager + if err := gm.detectGPU(); err != nil { + return nil, err + } + gm.GpuDataMap = make(map[string]*system.GPUData, 1) + if gm.nvidiaSmi { + go gm.startNvidiaCollector() + } + if gm.rocmSmi { + go gm.startAmdCollector() + } + return &gm, nil +} diff --git a/beszel/internal/agent/system.go b/beszel/internal/agent/system.go index c709b35..9f6d65d 100644 --- a/beszel/internal/agent/system.go +++ b/beszel/internal/agent/system.go @@ -206,6 +206,22 @@ func (a *Agent) getSystemStats() system.Stats { } } + // GPU data + if a.gpuManager != nil { + if gpuData := a.gpuManager.GetCurrentData(); len(gpuData) > 0 { + systemStats.GPUData = gpuData + // add temperatures + if systemStats.Temperatures == nil { + systemStats.Temperatures = make(map[string]float64, len(gpuData)) + } + for _, gpu := range gpuData { + if gpu.Temperature > 0 { + systemStats.Temperatures[gpu.Name] = gpu.Temperature + } + } + } + } + // update base system info a.systemInfo.Cpu = systemStats.Cpu a.systemInfo.MemPct = systemStats.MemPct diff --git a/beszel/internal/alerts/alerts.go b/beszel/internal/alerts/alerts.go index 38e0eab..dfd4b2a 100644 --- a/beszel/internal/alerts/alerts.go +++ b/beszel/internal/alerts/alerts.go @@ -345,6 +345,7 @@ func (am *AlertManager) sendSystemAlert(alert SystemAlertData) { } } +// todo: allow x minutes downtime before sending alert func (am *AlertManager) HandleStatusAlerts(newStatus string, oldSystemRecord *models.Record) error { var alertStatus string switch newStatus { diff --git a/beszel/internal/entities/system/system.go b/beszel/internal/entities/system/system.go index 9829212..c6f7363 100644 --- a/beszel/internal/entities/system/system.go +++ b/beszel/internal/entities/system/system.go @@ -28,6 +28,17 @@ type Stats struct { MaxNetworkRecv float64 `json:"nrm,omitempty"` Temperatures map[string]float64 `json:"t,omitempty"` ExtraFs map[string]*FsStats `json:"efs,omitempty"` + GPUData map[string]GPUData `json:"g,omitempty"` +} + +type GPUData struct { + Name string `json:"n"` + Temperature float64 `json:"-"` + MemoryUsed float64 `json:"mu,omitempty"` + MemoryTotal float64 `json:"mt,omitempty"` + Usage float64 `json:"u"` + Power float64 `json:"p,omitempty"` + Count float64 `json:"-"` } type FsStats struct { diff --git a/beszel/internal/records/records.go b/beszel/internal/records/records.go index 83b496f..bfa1d12 100644 --- a/beszel/internal/records/records.go +++ b/beszel/internal/records/records.go @@ -149,11 +149,7 @@ func (rm *RecordManager) CreateLongerRecords(collections []*models.Collection) { // Calculate the average stats of a list of system_stats records without reflect func (rm *RecordManager) AverageSystemStats(records RecordStats) system.Stats { - sum := system.Stats{ - Temperatures: make(map[string]float64), - ExtraFs: make(map[string]*system.FsStats), - } - + sum := system.Stats{} count := float64(len(records)) // use different counter for temps in case some records don't have them tempCount := float64(0) @@ -184,6 +180,9 @@ func (rm *RecordManager) AverageSystemStats(records RecordStats) system.Stats { sum.MaxDiskWritePs = max(sum.MaxDiskWritePs, stats.MaxDiskWritePs, stats.DiskWritePs) // add temps to sum if stats.Temperatures != nil { + if sum.Temperatures == nil { + sum.Temperatures = make(map[string]float64, len(stats.Temperatures)) + } tempCount++ for key, value := range stats.Temperatures { if _, ok := sum.Temperatures[key]; !ok { @@ -194,6 +193,9 @@ func (rm *RecordManager) AverageSystemStats(records RecordStats) system.Stats { } // add extra fs to sum if stats.ExtraFs != nil { + if sum.ExtraFs == nil { + sum.ExtraFs = make(map[string]*system.FsStats, len(stats.ExtraFs)) + } for key, value := range stats.ExtraFs { if _, ok := sum.ExtraFs[key]; !ok { sum.ExtraFs[key] = &system.FsStats{} @@ -207,6 +209,25 @@ func (rm *RecordManager) AverageSystemStats(records RecordStats) system.Stats { sum.ExtraFs[key].MaxDiskWritePS = max(sum.ExtraFs[key].MaxDiskWritePS, value.MaxDiskWritePS, value.DiskWritePs) } } + // add GPU data + if stats.GPUData != nil { + if sum.GPUData == nil { + sum.GPUData = make(map[string]system.GPUData, len(stats.GPUData)) + } + for id, value := range stats.GPUData { + if _, ok := sum.GPUData[id]; !ok { + sum.GPUData[id] = system.GPUData{Name: value.Name} + } + gpu := sum.GPUData[id] + gpu.Temperature += value.Temperature + gpu.MemoryUsed += value.MemoryUsed + gpu.MemoryTotal += value.MemoryTotal + gpu.Usage += value.Usage + gpu.Power += value.Power + gpu.Count += value.Count + sum.GPUData[id] = gpu + } + } } stats = system.Stats{ @@ -232,14 +253,14 @@ func (rm *RecordManager) AverageSystemStats(records RecordStats) system.Stats { MaxNetworkRecv: sum.MaxNetworkRecv, } - if len(sum.Temperatures) != 0 { + if sum.Temperatures != nil { stats.Temperatures = make(map[string]float64, len(sum.Temperatures)) for key, value := range sum.Temperatures { stats.Temperatures[key] = twoDecimals(value / tempCount) } } - if len(sum.ExtraFs) != 0 { + if sum.ExtraFs != nil { stats.ExtraFs = make(map[string]*system.FsStats, len(sum.ExtraFs)) for key, value := range sum.ExtraFs { stats.ExtraFs[key] = &system.FsStats{ @@ -253,6 +274,21 @@ func (rm *RecordManager) AverageSystemStats(records RecordStats) system.Stats { } } + if sum.GPUData != nil { + stats.GPUData = make(map[string]system.GPUData, len(sum.GPUData)) + for id, value := range sum.GPUData { + stats.GPUData[id] = system.GPUData{ + Name: value.Name, + Temperature: twoDecimals(value.Temperature / count), + MemoryUsed: twoDecimals(value.MemoryUsed / count), + MemoryTotal: twoDecimals(value.MemoryTotal / count), + Usage: twoDecimals(value.Usage / count), + Power: twoDecimals(value.Power / count), + Count: twoDecimals(value.Count / count), + } + } + } + return stats }