progress on gpu metrics

This commit is contained in:
Henry Dollman
2024-11-08 16:52:50 -05:00
parent b433437636
commit 8262a9a45b
6 changed files with 313 additions and 7 deletions

View File

@@ -24,6 +24,7 @@ type Agent struct {
sensorsContext context.Context // Sensors context to override sys location sensorsContext context.Context // Sensors context to override sys location
sensorsWhitelist map[string]struct{} // List of sensors to monitor sensorsWhitelist map[string]struct{} // List of sensors to monitor
systemInfo system.Info // Host system info systemInfo system.Info // Host system info
gpuManager *GPUManager // Manages GPU data
} }
func NewAgent() *Agent { func NewAgent() *Agent {
@@ -74,6 +75,13 @@ func (a *Agent) Run(pubKey []byte, addr string) {
a.initializeNetIoStats() a.initializeNetIoStats()
a.dockerManager = newDockerManager() a.dockerManager = newDockerManager()
// initialize GPU manager
if gm, err := NewGPUManager(); err != nil {
slog.Error("GPU manager", "err", err)
} else {
a.gpuManager = gm
}
// if debugging, print stats // if debugging, print stats
if a.debug { if a.debug {
slog.Debug("Stats", "data", a.gatherStats()) slog.Debug("Stats", "data", a.gatherStats())

View File

@@ -0,0 +1,234 @@
package agent
import (
"beszel/internal/entities/system"
"bufio"
"encoding/json"
"fmt"
"os/exec"
"strconv"
"strings"
"sync"
"time"
"golang.org/x/exp/slog"
)
type GPUManager struct {
nvidiaSmi bool
rocmSmi bool
GpuDataMap map[string]*system.GPUData
mutex sync.Mutex
}
type RocmSmiJson struct {
ID string `json:"Device ID"`
Name string `json:"Card series"`
Temperature string `json:"Temperature (Sensor edge) (C)"`
MemoryUsed string `json:"VRAM Total Used Memory (B)"`
MemoryTotal string `json:"VRAM Total Memory (B)"`
Usage string `json:"GPU use (%)"`
Power string `json:"Current Socket Graphics Package Power (W)"`
}
// startNvidiaCollector oversees collectNvidiaStats and restarts nvidia-smi if it fails
func (gm *GPUManager) startNvidiaCollector() error {
for {
if err := gm.collectNvidiaStats(); err != nil {
slog.Warn("Restarting nvidia-smi", "err", err)
time.Sleep(time.Second) // Wait before retrying
continue
}
}
}
// collectNvidiaStats runs nvidia-smi in a loop and passes the output to parseNvidiaData
func (gm *GPUManager) collectNvidiaStats() error {
// Set up the command
cmd := exec.Command("nvidia-smi", "-l", "4", "--query-gpu=index,name,temperature.gpu,memory.used,memory.total,utilization.gpu,power.draw", "--format=csv,noheader,nounits")
// Set up a pipe to capture stdout
stdout, err := cmd.StdoutPipe()
if err != nil {
return err
}
// Start the command
if err := cmd.Start(); err != nil {
return err
}
// Use a scanner to read each line of output
scanner := bufio.NewScanner(stdout)
buf := make([]byte, 0, 64*1024) // 64KB buffer
scanner.Buffer(buf, bufio.MaxScanTokenSize)
for scanner.Scan() {
line := scanner.Bytes()
gm.parseNvidiaData(line) // Run your function on each new line
}
// Check for any errors encountered during scanning
if err := scanner.Err(); err != nil {
return err
}
// Wait for the command to complete
return cmd.Wait()
}
// parseNvidiaData parses the output of nvidia-smi and updates the GPUData map
func (gm *GPUManager) parseNvidiaData(output []byte) {
gm.mutex.Lock()
defer gm.mutex.Unlock()
lines := strings.Split(string(output), "\n")
for _, line := range lines {
if line != "" {
fields := strings.Split(line, ", ")
if len(fields) >= 7 {
id := fields[0]
temp, _ := strconv.ParseFloat(fields[2], 64)
memoryUsage, _ := strconv.ParseFloat(fields[3], 64)
totalMemory, _ := strconv.ParseFloat(fields[4], 64)
usage, _ := strconv.ParseFloat(fields[5], 64)
power, _ := strconv.ParseFloat(fields[6], 64)
// add gpu if not exists
if _, ok := gm.GpuDataMap[id]; !ok {
name := strings.TrimPrefix(fields[1], "NVIDIA ")
gm.GpuDataMap[id] = &system.GPUData{Name: strings.TrimSuffix(name, " Laptop GPU")}
}
// update gpu data
gpu := gm.GpuDataMap[id]
gpu.Temperature += temp
gpu.MemoryUsed += memoryUsage
gpu.MemoryTotal += totalMemory
gpu.Usage += usage
gpu.Power += power
gpu.Count++
}
}
}
}
// startAmdCollector oversees collectAmdStats and restarts rocm-smi if it fails
func (gm *GPUManager) startAmdCollector() {
for {
if err := gm.collectAmdStats(); err != nil {
slog.Warn("Restarting rocm-smi", "err", err)
time.Sleep(time.Second) // Wait before retrying
continue
} else {
// break if no error (command runs but no card found)
break
}
}
}
// collectAmdStats runs rocm-smi in a loop and passes the output to parseAmdData
func (gm *GPUManager) collectAmdStats() error {
cmd := exec.Command("/bin/sh", "-c", "while true; do rocm-smi --showid --showtemp --showuse --showpower --showproductname --showmeminfo vram --json; sleep 4.7; done")
// Set up a pipe to capture stdout
stdout, err := cmd.StdoutPipe()
if err != nil {
return err
}
// Start the command
if err := cmd.Start(); err != nil {
return err
}
// Use a scanner to read each line of output
scanner := bufio.NewScanner(stdout)
buf := make([]byte, 0, 64*1024) // 64KB buffer
scanner.Buffer(buf, bufio.MaxScanTokenSize)
for scanner.Scan() {
var rocmSmiInfo map[string]RocmSmiJson
if err := json.Unmarshal(scanner.Bytes(), &rocmSmiInfo); err != nil {
return err
}
if len(rocmSmiInfo) > 0 {
// slog.Info("rocm-smi", "data", rocmSmiInfo)
gm.parseAmdData(&rocmSmiInfo)
} else {
slog.Warn("rocm-smi returned no GPU")
return nil
}
}
if err := scanner.Err(); err != nil {
return err
}
return cmd.Wait()
}
// parseAmdData parses the output of rocm-smi and updates the GPUData map
func (gm *GPUManager) parseAmdData(rocmSmiInfo *map[string]RocmSmiJson) {
for _, v := range *rocmSmiInfo {
temp, _ := strconv.ParseFloat(v.Temperature, 64)
memoryUsage, _ := strconv.ParseFloat(v.MemoryUsed, 64)
totalMemory, _ := strconv.ParseFloat(v.MemoryTotal, 64)
usage, _ := strconv.ParseFloat(v.Usage, 64)
power, _ := strconv.ParseFloat(v.Power, 64)
memoryUsage = bytesToMegabytes(memoryUsage)
totalMemory = bytesToMegabytes(totalMemory)
if _, ok := gm.GpuDataMap[v.ID]; !ok {
gm.GpuDataMap[v.ID] = &system.GPUData{Name: v.Name}
}
gpu := gm.GpuDataMap[v.ID]
gpu.Temperature += temp
gpu.MemoryUsed += memoryUsage
gpu.MemoryTotal += totalMemory
gpu.Usage += usage
gpu.Power += power
gpu.Count++
}
}
// sums and resets the current GPU utilization data since the last update
func (gm *GPUManager) GetCurrentData() map[string]system.GPUData {
gm.mutex.Lock()
defer gm.mutex.Unlock()
// copy / reset the data
gpuData := make(map[string]system.GPUData, len(gm.GpuDataMap))
for id, gpu := range gm.GpuDataMap {
// sum the data
gpu.Temperature = twoDecimals(gpu.Temperature / gpu.Count)
gpu.MemoryUsed = twoDecimals(gpu.MemoryUsed / gpu.Count)
gpu.MemoryTotal = twoDecimals(gpu.MemoryTotal / gpu.Count)
gpu.Usage = twoDecimals(gpu.Usage / gpu.Count)
gpu.Power = twoDecimals(gpu.Power / gpu.Count)
gpuData[id] = *gpu
// reset the data
gpu.Temperature = 0
gpu.MemoryUsed = 0
gpu.MemoryTotal = 0
gpu.Usage = 0
gpu.Power = 0
gpu.Count = 0
}
return gpuData
}
// detectGPU returns the GPU brand (nvidia or amd) or an error if none is found
// todo: make sure there's actually a GPU, not just if the command exists
func (gm *GPUManager) detectGPU() error {
if err := exec.Command("nvidia-smi").Run(); err == nil {
gm.nvidiaSmi = true
}
if err := exec.Command("rocm-smi").Run(); err == nil {
gm.rocmSmi = true
}
if gm.nvidiaSmi || gm.rocmSmi {
return nil
}
return fmt.Errorf("no GPU found - install nvidia-smi or rocm-smi")
}
// NewGPUManager returns a new GPUManager
func NewGPUManager() (*GPUManager, error) {
var gm GPUManager
if err := gm.detectGPU(); err != nil {
return nil, err
}
gm.GpuDataMap = make(map[string]*system.GPUData, 1)
if gm.nvidiaSmi {
go gm.startNvidiaCollector()
}
if gm.rocmSmi {
go gm.startAmdCollector()
}
return &gm, nil
}

View File

@@ -206,6 +206,22 @@ func (a *Agent) getSystemStats() system.Stats {
} }
} }
// GPU data
if a.gpuManager != nil {
if gpuData := a.gpuManager.GetCurrentData(); len(gpuData) > 0 {
systemStats.GPUData = gpuData
// add temperatures
if systemStats.Temperatures == nil {
systemStats.Temperatures = make(map[string]float64, len(gpuData))
}
for _, gpu := range gpuData {
if gpu.Temperature > 0 {
systemStats.Temperatures[gpu.Name] = gpu.Temperature
}
}
}
}
// update base system info // update base system info
a.systemInfo.Cpu = systemStats.Cpu a.systemInfo.Cpu = systemStats.Cpu
a.systemInfo.MemPct = systemStats.MemPct a.systemInfo.MemPct = systemStats.MemPct

View File

@@ -345,6 +345,7 @@ func (am *AlertManager) sendSystemAlert(alert SystemAlertData) {
} }
} }
// todo: allow x minutes downtime before sending alert
func (am *AlertManager) HandleStatusAlerts(newStatus string, oldSystemRecord *models.Record) error { func (am *AlertManager) HandleStatusAlerts(newStatus string, oldSystemRecord *models.Record) error {
var alertStatus string var alertStatus string
switch newStatus { switch newStatus {

View File

@@ -28,6 +28,17 @@ type Stats struct {
MaxNetworkRecv float64 `json:"nrm,omitempty"` MaxNetworkRecv float64 `json:"nrm,omitempty"`
Temperatures map[string]float64 `json:"t,omitempty"` Temperatures map[string]float64 `json:"t,omitempty"`
ExtraFs map[string]*FsStats `json:"efs,omitempty"` ExtraFs map[string]*FsStats `json:"efs,omitempty"`
GPUData map[string]GPUData `json:"g,omitempty"`
}
type GPUData struct {
Name string `json:"n"`
Temperature float64 `json:"-"`
MemoryUsed float64 `json:"mu,omitempty"`
MemoryTotal float64 `json:"mt,omitempty"`
Usage float64 `json:"u"`
Power float64 `json:"p,omitempty"`
Count float64 `json:"-"`
} }
type FsStats struct { type FsStats struct {

View File

@@ -149,11 +149,7 @@ func (rm *RecordManager) CreateLongerRecords(collections []*models.Collection) {
// Calculate the average stats of a list of system_stats records without reflect // Calculate the average stats of a list of system_stats records without reflect
func (rm *RecordManager) AverageSystemStats(records RecordStats) system.Stats { func (rm *RecordManager) AverageSystemStats(records RecordStats) system.Stats {
sum := system.Stats{ sum := system.Stats{}
Temperatures: make(map[string]float64),
ExtraFs: make(map[string]*system.FsStats),
}
count := float64(len(records)) count := float64(len(records))
// use different counter for temps in case some records don't have them // use different counter for temps in case some records don't have them
tempCount := float64(0) tempCount := float64(0)
@@ -184,6 +180,9 @@ func (rm *RecordManager) AverageSystemStats(records RecordStats) system.Stats {
sum.MaxDiskWritePs = max(sum.MaxDiskWritePs, stats.MaxDiskWritePs, stats.DiskWritePs) sum.MaxDiskWritePs = max(sum.MaxDiskWritePs, stats.MaxDiskWritePs, stats.DiskWritePs)
// add temps to sum // add temps to sum
if stats.Temperatures != nil { if stats.Temperatures != nil {
if sum.Temperatures == nil {
sum.Temperatures = make(map[string]float64, len(stats.Temperatures))
}
tempCount++ tempCount++
for key, value := range stats.Temperatures { for key, value := range stats.Temperatures {
if _, ok := sum.Temperatures[key]; !ok { if _, ok := sum.Temperatures[key]; !ok {
@@ -194,6 +193,9 @@ func (rm *RecordManager) AverageSystemStats(records RecordStats) system.Stats {
} }
// add extra fs to sum // add extra fs to sum
if stats.ExtraFs != nil { if stats.ExtraFs != nil {
if sum.ExtraFs == nil {
sum.ExtraFs = make(map[string]*system.FsStats, len(stats.ExtraFs))
}
for key, value := range stats.ExtraFs { for key, value := range stats.ExtraFs {
if _, ok := sum.ExtraFs[key]; !ok { if _, ok := sum.ExtraFs[key]; !ok {
sum.ExtraFs[key] = &system.FsStats{} sum.ExtraFs[key] = &system.FsStats{}
@@ -207,6 +209,25 @@ func (rm *RecordManager) AverageSystemStats(records RecordStats) system.Stats {
sum.ExtraFs[key].MaxDiskWritePS = max(sum.ExtraFs[key].MaxDiskWritePS, value.MaxDiskWritePS, value.DiskWritePs) sum.ExtraFs[key].MaxDiskWritePS = max(sum.ExtraFs[key].MaxDiskWritePS, value.MaxDiskWritePS, value.DiskWritePs)
} }
} }
// add GPU data
if stats.GPUData != nil {
if sum.GPUData == nil {
sum.GPUData = make(map[string]system.GPUData, len(stats.GPUData))
}
for id, value := range stats.GPUData {
if _, ok := sum.GPUData[id]; !ok {
sum.GPUData[id] = system.GPUData{Name: value.Name}
}
gpu := sum.GPUData[id]
gpu.Temperature += value.Temperature
gpu.MemoryUsed += value.MemoryUsed
gpu.MemoryTotal += value.MemoryTotal
gpu.Usage += value.Usage
gpu.Power += value.Power
gpu.Count += value.Count
sum.GPUData[id] = gpu
}
}
} }
stats = system.Stats{ stats = system.Stats{
@@ -232,14 +253,14 @@ func (rm *RecordManager) AverageSystemStats(records RecordStats) system.Stats {
MaxNetworkRecv: sum.MaxNetworkRecv, MaxNetworkRecv: sum.MaxNetworkRecv,
} }
if len(sum.Temperatures) != 0 { if sum.Temperatures != nil {
stats.Temperatures = make(map[string]float64, len(sum.Temperatures)) stats.Temperatures = make(map[string]float64, len(sum.Temperatures))
for key, value := range sum.Temperatures { for key, value := range sum.Temperatures {
stats.Temperatures[key] = twoDecimals(value / tempCount) stats.Temperatures[key] = twoDecimals(value / tempCount)
} }
} }
if len(sum.ExtraFs) != 0 { if sum.ExtraFs != nil {
stats.ExtraFs = make(map[string]*system.FsStats, len(sum.ExtraFs)) stats.ExtraFs = make(map[string]*system.FsStats, len(sum.ExtraFs))
for key, value := range sum.ExtraFs { for key, value := range sum.ExtraFs {
stats.ExtraFs[key] = &system.FsStats{ stats.ExtraFs[key] = &system.FsStats{
@@ -253,6 +274,21 @@ func (rm *RecordManager) AverageSystemStats(records RecordStats) system.Stats {
} }
} }
if sum.GPUData != nil {
stats.GPUData = make(map[string]system.GPUData, len(sum.GPUData))
for id, value := range sum.GPUData {
stats.GPUData[id] = system.GPUData{
Name: value.Name,
Temperature: twoDecimals(value.Temperature / count),
MemoryUsed: twoDecimals(value.MemoryUsed / count),
MemoryTotal: twoDecimals(value.MemoryTotal / count),
Usage: twoDecimals(value.Usage / count),
Power: twoDecimals(value.Power / count),
Count: twoDecimals(value.Count / count),
}
}
}
return stats return stats
} }