mirror of
https://github.com/fankes/beszel.git
synced 2025-10-20 02:09:28 +08:00
progress on gpu metrics
This commit is contained in:
@@ -24,6 +24,7 @@ type Agent struct {
|
|||||||
sensorsContext context.Context // Sensors context to override sys location
|
sensorsContext context.Context // Sensors context to override sys location
|
||||||
sensorsWhitelist map[string]struct{} // List of sensors to monitor
|
sensorsWhitelist map[string]struct{} // List of sensors to monitor
|
||||||
systemInfo system.Info // Host system info
|
systemInfo system.Info // Host system info
|
||||||
|
gpuManager *GPUManager // Manages GPU data
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewAgent() *Agent {
|
func NewAgent() *Agent {
|
||||||
@@ -74,6 +75,13 @@ func (a *Agent) Run(pubKey []byte, addr string) {
|
|||||||
a.initializeNetIoStats()
|
a.initializeNetIoStats()
|
||||||
a.dockerManager = newDockerManager()
|
a.dockerManager = newDockerManager()
|
||||||
|
|
||||||
|
// initialize GPU manager
|
||||||
|
if gm, err := NewGPUManager(); err != nil {
|
||||||
|
slog.Error("GPU manager", "err", err)
|
||||||
|
} else {
|
||||||
|
a.gpuManager = gm
|
||||||
|
}
|
||||||
|
|
||||||
// if debugging, print stats
|
// if debugging, print stats
|
||||||
if a.debug {
|
if a.debug {
|
||||||
slog.Debug("Stats", "data", a.gatherStats())
|
slog.Debug("Stats", "data", a.gatherStats())
|
||||||
|
234
beszel/internal/agent/gpu.go
Normal file
234
beszel/internal/agent/gpu.go
Normal file
@@ -0,0 +1,234 @@
|
|||||||
|
package agent
|
||||||
|
|
||||||
|
import (
|
||||||
|
"beszel/internal/entities/system"
|
||||||
|
"bufio"
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"os/exec"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"golang.org/x/exp/slog"
|
||||||
|
)
|
||||||
|
|
||||||
|
type GPUManager struct {
|
||||||
|
nvidiaSmi bool
|
||||||
|
rocmSmi bool
|
||||||
|
GpuDataMap map[string]*system.GPUData
|
||||||
|
mutex sync.Mutex
|
||||||
|
}
|
||||||
|
|
||||||
|
type RocmSmiJson struct {
|
||||||
|
ID string `json:"Device ID"`
|
||||||
|
Name string `json:"Card series"`
|
||||||
|
Temperature string `json:"Temperature (Sensor edge) (C)"`
|
||||||
|
MemoryUsed string `json:"VRAM Total Used Memory (B)"`
|
||||||
|
MemoryTotal string `json:"VRAM Total Memory (B)"`
|
||||||
|
Usage string `json:"GPU use (%)"`
|
||||||
|
Power string `json:"Current Socket Graphics Package Power (W)"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// startNvidiaCollector oversees collectNvidiaStats and restarts nvidia-smi if it fails
|
||||||
|
func (gm *GPUManager) startNvidiaCollector() error {
|
||||||
|
for {
|
||||||
|
if err := gm.collectNvidiaStats(); err != nil {
|
||||||
|
slog.Warn("Restarting nvidia-smi", "err", err)
|
||||||
|
time.Sleep(time.Second) // Wait before retrying
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// collectNvidiaStats runs nvidia-smi in a loop and passes the output to parseNvidiaData
|
||||||
|
func (gm *GPUManager) collectNvidiaStats() error {
|
||||||
|
// Set up the command
|
||||||
|
cmd := exec.Command("nvidia-smi", "-l", "4", "--query-gpu=index,name,temperature.gpu,memory.used,memory.total,utilization.gpu,power.draw", "--format=csv,noheader,nounits")
|
||||||
|
// Set up a pipe to capture stdout
|
||||||
|
stdout, err := cmd.StdoutPipe()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
// Start the command
|
||||||
|
if err := cmd.Start(); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
// Use a scanner to read each line of output
|
||||||
|
scanner := bufio.NewScanner(stdout)
|
||||||
|
buf := make([]byte, 0, 64*1024) // 64KB buffer
|
||||||
|
scanner.Buffer(buf, bufio.MaxScanTokenSize)
|
||||||
|
for scanner.Scan() {
|
||||||
|
line := scanner.Bytes()
|
||||||
|
gm.parseNvidiaData(line) // Run your function on each new line
|
||||||
|
}
|
||||||
|
// Check for any errors encountered during scanning
|
||||||
|
if err := scanner.Err(); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
// Wait for the command to complete
|
||||||
|
return cmd.Wait()
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseNvidiaData parses the output of nvidia-smi and updates the GPUData map
|
||||||
|
func (gm *GPUManager) parseNvidiaData(output []byte) {
|
||||||
|
gm.mutex.Lock()
|
||||||
|
defer gm.mutex.Unlock()
|
||||||
|
lines := strings.Split(string(output), "\n")
|
||||||
|
for _, line := range lines {
|
||||||
|
if line != "" {
|
||||||
|
fields := strings.Split(line, ", ")
|
||||||
|
if len(fields) >= 7 {
|
||||||
|
id := fields[0]
|
||||||
|
temp, _ := strconv.ParseFloat(fields[2], 64)
|
||||||
|
memoryUsage, _ := strconv.ParseFloat(fields[3], 64)
|
||||||
|
totalMemory, _ := strconv.ParseFloat(fields[4], 64)
|
||||||
|
usage, _ := strconv.ParseFloat(fields[5], 64)
|
||||||
|
power, _ := strconv.ParseFloat(fields[6], 64)
|
||||||
|
// add gpu if not exists
|
||||||
|
if _, ok := gm.GpuDataMap[id]; !ok {
|
||||||
|
name := strings.TrimPrefix(fields[1], "NVIDIA ")
|
||||||
|
gm.GpuDataMap[id] = &system.GPUData{Name: strings.TrimSuffix(name, " Laptop GPU")}
|
||||||
|
}
|
||||||
|
// update gpu data
|
||||||
|
gpu := gm.GpuDataMap[id]
|
||||||
|
gpu.Temperature += temp
|
||||||
|
gpu.MemoryUsed += memoryUsage
|
||||||
|
gpu.MemoryTotal += totalMemory
|
||||||
|
gpu.Usage += usage
|
||||||
|
gpu.Power += power
|
||||||
|
gpu.Count++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// startAmdCollector oversees collectAmdStats and restarts rocm-smi if it fails
|
||||||
|
func (gm *GPUManager) startAmdCollector() {
|
||||||
|
for {
|
||||||
|
if err := gm.collectAmdStats(); err != nil {
|
||||||
|
slog.Warn("Restarting rocm-smi", "err", err)
|
||||||
|
time.Sleep(time.Second) // Wait before retrying
|
||||||
|
continue
|
||||||
|
} else {
|
||||||
|
// break if no error (command runs but no card found)
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// collectAmdStats runs rocm-smi in a loop and passes the output to parseAmdData
|
||||||
|
func (gm *GPUManager) collectAmdStats() error {
|
||||||
|
cmd := exec.Command("/bin/sh", "-c", "while true; do rocm-smi --showid --showtemp --showuse --showpower --showproductname --showmeminfo vram --json; sleep 4.7; done")
|
||||||
|
// Set up a pipe to capture stdout
|
||||||
|
stdout, err := cmd.StdoutPipe()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
// Start the command
|
||||||
|
if err := cmd.Start(); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
// Use a scanner to read each line of output
|
||||||
|
scanner := bufio.NewScanner(stdout)
|
||||||
|
buf := make([]byte, 0, 64*1024) // 64KB buffer
|
||||||
|
scanner.Buffer(buf, bufio.MaxScanTokenSize)
|
||||||
|
for scanner.Scan() {
|
||||||
|
var rocmSmiInfo map[string]RocmSmiJson
|
||||||
|
if err := json.Unmarshal(scanner.Bytes(), &rocmSmiInfo); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if len(rocmSmiInfo) > 0 {
|
||||||
|
// slog.Info("rocm-smi", "data", rocmSmiInfo)
|
||||||
|
gm.parseAmdData(&rocmSmiInfo)
|
||||||
|
} else {
|
||||||
|
slog.Warn("rocm-smi returned no GPU")
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if err := scanner.Err(); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return cmd.Wait()
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseAmdData parses the output of rocm-smi and updates the GPUData map
|
||||||
|
func (gm *GPUManager) parseAmdData(rocmSmiInfo *map[string]RocmSmiJson) {
|
||||||
|
for _, v := range *rocmSmiInfo {
|
||||||
|
temp, _ := strconv.ParseFloat(v.Temperature, 64)
|
||||||
|
memoryUsage, _ := strconv.ParseFloat(v.MemoryUsed, 64)
|
||||||
|
totalMemory, _ := strconv.ParseFloat(v.MemoryTotal, 64)
|
||||||
|
usage, _ := strconv.ParseFloat(v.Usage, 64)
|
||||||
|
power, _ := strconv.ParseFloat(v.Power, 64)
|
||||||
|
memoryUsage = bytesToMegabytes(memoryUsage)
|
||||||
|
totalMemory = bytesToMegabytes(totalMemory)
|
||||||
|
|
||||||
|
if _, ok := gm.GpuDataMap[v.ID]; !ok {
|
||||||
|
gm.GpuDataMap[v.ID] = &system.GPUData{Name: v.Name}
|
||||||
|
}
|
||||||
|
gpu := gm.GpuDataMap[v.ID]
|
||||||
|
gpu.Temperature += temp
|
||||||
|
gpu.MemoryUsed += memoryUsage
|
||||||
|
gpu.MemoryTotal += totalMemory
|
||||||
|
gpu.Usage += usage
|
||||||
|
gpu.Power += power
|
||||||
|
gpu.Count++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// sums and resets the current GPU utilization data since the last update
|
||||||
|
func (gm *GPUManager) GetCurrentData() map[string]system.GPUData {
|
||||||
|
gm.mutex.Lock()
|
||||||
|
defer gm.mutex.Unlock()
|
||||||
|
// copy / reset the data
|
||||||
|
gpuData := make(map[string]system.GPUData, len(gm.GpuDataMap))
|
||||||
|
for id, gpu := range gm.GpuDataMap {
|
||||||
|
// sum the data
|
||||||
|
gpu.Temperature = twoDecimals(gpu.Temperature / gpu.Count)
|
||||||
|
gpu.MemoryUsed = twoDecimals(gpu.MemoryUsed / gpu.Count)
|
||||||
|
gpu.MemoryTotal = twoDecimals(gpu.MemoryTotal / gpu.Count)
|
||||||
|
gpu.Usage = twoDecimals(gpu.Usage / gpu.Count)
|
||||||
|
gpu.Power = twoDecimals(gpu.Power / gpu.Count)
|
||||||
|
gpuData[id] = *gpu
|
||||||
|
// reset the data
|
||||||
|
gpu.Temperature = 0
|
||||||
|
gpu.MemoryUsed = 0
|
||||||
|
gpu.MemoryTotal = 0
|
||||||
|
gpu.Usage = 0
|
||||||
|
gpu.Power = 0
|
||||||
|
gpu.Count = 0
|
||||||
|
}
|
||||||
|
return gpuData
|
||||||
|
}
|
||||||
|
|
||||||
|
// detectGPU returns the GPU brand (nvidia or amd) or an error if none is found
|
||||||
|
// todo: make sure there's actually a GPU, not just if the command exists
|
||||||
|
func (gm *GPUManager) detectGPU() error {
|
||||||
|
if err := exec.Command("nvidia-smi").Run(); err == nil {
|
||||||
|
gm.nvidiaSmi = true
|
||||||
|
}
|
||||||
|
if err := exec.Command("rocm-smi").Run(); err == nil {
|
||||||
|
gm.rocmSmi = true
|
||||||
|
}
|
||||||
|
if gm.nvidiaSmi || gm.rocmSmi {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return fmt.Errorf("no GPU found - install nvidia-smi or rocm-smi")
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewGPUManager returns a new GPUManager
|
||||||
|
func NewGPUManager() (*GPUManager, error) {
|
||||||
|
var gm GPUManager
|
||||||
|
if err := gm.detectGPU(); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
gm.GpuDataMap = make(map[string]*system.GPUData, 1)
|
||||||
|
if gm.nvidiaSmi {
|
||||||
|
go gm.startNvidiaCollector()
|
||||||
|
}
|
||||||
|
if gm.rocmSmi {
|
||||||
|
go gm.startAmdCollector()
|
||||||
|
}
|
||||||
|
return &gm, nil
|
||||||
|
}
|
@@ -206,6 +206,22 @@ func (a *Agent) getSystemStats() system.Stats {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// GPU data
|
||||||
|
if a.gpuManager != nil {
|
||||||
|
if gpuData := a.gpuManager.GetCurrentData(); len(gpuData) > 0 {
|
||||||
|
systemStats.GPUData = gpuData
|
||||||
|
// add temperatures
|
||||||
|
if systemStats.Temperatures == nil {
|
||||||
|
systemStats.Temperatures = make(map[string]float64, len(gpuData))
|
||||||
|
}
|
||||||
|
for _, gpu := range gpuData {
|
||||||
|
if gpu.Temperature > 0 {
|
||||||
|
systemStats.Temperatures[gpu.Name] = gpu.Temperature
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// update base system info
|
// update base system info
|
||||||
a.systemInfo.Cpu = systemStats.Cpu
|
a.systemInfo.Cpu = systemStats.Cpu
|
||||||
a.systemInfo.MemPct = systemStats.MemPct
|
a.systemInfo.MemPct = systemStats.MemPct
|
||||||
|
@@ -345,6 +345,7 @@ func (am *AlertManager) sendSystemAlert(alert SystemAlertData) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// todo: allow x minutes downtime before sending alert
|
||||||
func (am *AlertManager) HandleStatusAlerts(newStatus string, oldSystemRecord *models.Record) error {
|
func (am *AlertManager) HandleStatusAlerts(newStatus string, oldSystemRecord *models.Record) error {
|
||||||
var alertStatus string
|
var alertStatus string
|
||||||
switch newStatus {
|
switch newStatus {
|
||||||
|
@@ -28,6 +28,17 @@ type Stats struct {
|
|||||||
MaxNetworkRecv float64 `json:"nrm,omitempty"`
|
MaxNetworkRecv float64 `json:"nrm,omitempty"`
|
||||||
Temperatures map[string]float64 `json:"t,omitempty"`
|
Temperatures map[string]float64 `json:"t,omitempty"`
|
||||||
ExtraFs map[string]*FsStats `json:"efs,omitempty"`
|
ExtraFs map[string]*FsStats `json:"efs,omitempty"`
|
||||||
|
GPUData map[string]GPUData `json:"g,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type GPUData struct {
|
||||||
|
Name string `json:"n"`
|
||||||
|
Temperature float64 `json:"-"`
|
||||||
|
MemoryUsed float64 `json:"mu,omitempty"`
|
||||||
|
MemoryTotal float64 `json:"mt,omitempty"`
|
||||||
|
Usage float64 `json:"u"`
|
||||||
|
Power float64 `json:"p,omitempty"`
|
||||||
|
Count float64 `json:"-"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type FsStats struct {
|
type FsStats struct {
|
||||||
|
@@ -149,11 +149,7 @@ func (rm *RecordManager) CreateLongerRecords(collections []*models.Collection) {
|
|||||||
|
|
||||||
// Calculate the average stats of a list of system_stats records without reflect
|
// Calculate the average stats of a list of system_stats records without reflect
|
||||||
func (rm *RecordManager) AverageSystemStats(records RecordStats) system.Stats {
|
func (rm *RecordManager) AverageSystemStats(records RecordStats) system.Stats {
|
||||||
sum := system.Stats{
|
sum := system.Stats{}
|
||||||
Temperatures: make(map[string]float64),
|
|
||||||
ExtraFs: make(map[string]*system.FsStats),
|
|
||||||
}
|
|
||||||
|
|
||||||
count := float64(len(records))
|
count := float64(len(records))
|
||||||
// use different counter for temps in case some records don't have them
|
// use different counter for temps in case some records don't have them
|
||||||
tempCount := float64(0)
|
tempCount := float64(0)
|
||||||
@@ -184,6 +180,9 @@ func (rm *RecordManager) AverageSystemStats(records RecordStats) system.Stats {
|
|||||||
sum.MaxDiskWritePs = max(sum.MaxDiskWritePs, stats.MaxDiskWritePs, stats.DiskWritePs)
|
sum.MaxDiskWritePs = max(sum.MaxDiskWritePs, stats.MaxDiskWritePs, stats.DiskWritePs)
|
||||||
// add temps to sum
|
// add temps to sum
|
||||||
if stats.Temperatures != nil {
|
if stats.Temperatures != nil {
|
||||||
|
if sum.Temperatures == nil {
|
||||||
|
sum.Temperatures = make(map[string]float64, len(stats.Temperatures))
|
||||||
|
}
|
||||||
tempCount++
|
tempCount++
|
||||||
for key, value := range stats.Temperatures {
|
for key, value := range stats.Temperatures {
|
||||||
if _, ok := sum.Temperatures[key]; !ok {
|
if _, ok := sum.Temperatures[key]; !ok {
|
||||||
@@ -194,6 +193,9 @@ func (rm *RecordManager) AverageSystemStats(records RecordStats) system.Stats {
|
|||||||
}
|
}
|
||||||
// add extra fs to sum
|
// add extra fs to sum
|
||||||
if stats.ExtraFs != nil {
|
if stats.ExtraFs != nil {
|
||||||
|
if sum.ExtraFs == nil {
|
||||||
|
sum.ExtraFs = make(map[string]*system.FsStats, len(stats.ExtraFs))
|
||||||
|
}
|
||||||
for key, value := range stats.ExtraFs {
|
for key, value := range stats.ExtraFs {
|
||||||
if _, ok := sum.ExtraFs[key]; !ok {
|
if _, ok := sum.ExtraFs[key]; !ok {
|
||||||
sum.ExtraFs[key] = &system.FsStats{}
|
sum.ExtraFs[key] = &system.FsStats{}
|
||||||
@@ -207,6 +209,25 @@ func (rm *RecordManager) AverageSystemStats(records RecordStats) system.Stats {
|
|||||||
sum.ExtraFs[key].MaxDiskWritePS = max(sum.ExtraFs[key].MaxDiskWritePS, value.MaxDiskWritePS, value.DiskWritePs)
|
sum.ExtraFs[key].MaxDiskWritePS = max(sum.ExtraFs[key].MaxDiskWritePS, value.MaxDiskWritePS, value.DiskWritePs)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// add GPU data
|
||||||
|
if stats.GPUData != nil {
|
||||||
|
if sum.GPUData == nil {
|
||||||
|
sum.GPUData = make(map[string]system.GPUData, len(stats.GPUData))
|
||||||
|
}
|
||||||
|
for id, value := range stats.GPUData {
|
||||||
|
if _, ok := sum.GPUData[id]; !ok {
|
||||||
|
sum.GPUData[id] = system.GPUData{Name: value.Name}
|
||||||
|
}
|
||||||
|
gpu := sum.GPUData[id]
|
||||||
|
gpu.Temperature += value.Temperature
|
||||||
|
gpu.MemoryUsed += value.MemoryUsed
|
||||||
|
gpu.MemoryTotal += value.MemoryTotal
|
||||||
|
gpu.Usage += value.Usage
|
||||||
|
gpu.Power += value.Power
|
||||||
|
gpu.Count += value.Count
|
||||||
|
sum.GPUData[id] = gpu
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
stats = system.Stats{
|
stats = system.Stats{
|
||||||
@@ -232,14 +253,14 @@ func (rm *RecordManager) AverageSystemStats(records RecordStats) system.Stats {
|
|||||||
MaxNetworkRecv: sum.MaxNetworkRecv,
|
MaxNetworkRecv: sum.MaxNetworkRecv,
|
||||||
}
|
}
|
||||||
|
|
||||||
if len(sum.Temperatures) != 0 {
|
if sum.Temperatures != nil {
|
||||||
stats.Temperatures = make(map[string]float64, len(sum.Temperatures))
|
stats.Temperatures = make(map[string]float64, len(sum.Temperatures))
|
||||||
for key, value := range sum.Temperatures {
|
for key, value := range sum.Temperatures {
|
||||||
stats.Temperatures[key] = twoDecimals(value / tempCount)
|
stats.Temperatures[key] = twoDecimals(value / tempCount)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if len(sum.ExtraFs) != 0 {
|
if sum.ExtraFs != nil {
|
||||||
stats.ExtraFs = make(map[string]*system.FsStats, len(sum.ExtraFs))
|
stats.ExtraFs = make(map[string]*system.FsStats, len(sum.ExtraFs))
|
||||||
for key, value := range sum.ExtraFs {
|
for key, value := range sum.ExtraFs {
|
||||||
stats.ExtraFs[key] = &system.FsStats{
|
stats.ExtraFs[key] = &system.FsStats{
|
||||||
@@ -253,6 +274,21 @@ func (rm *RecordManager) AverageSystemStats(records RecordStats) system.Stats {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if sum.GPUData != nil {
|
||||||
|
stats.GPUData = make(map[string]system.GPUData, len(sum.GPUData))
|
||||||
|
for id, value := range sum.GPUData {
|
||||||
|
stats.GPUData[id] = system.GPUData{
|
||||||
|
Name: value.Name,
|
||||||
|
Temperature: twoDecimals(value.Temperature / count),
|
||||||
|
MemoryUsed: twoDecimals(value.MemoryUsed / count),
|
||||||
|
MemoryTotal: twoDecimals(value.MemoryTotal / count),
|
||||||
|
Usage: twoDecimals(value.Usage / count),
|
||||||
|
Power: twoDecimals(value.Power / count),
|
||||||
|
Count: twoDecimals(value.Count / count),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return stats
|
return stats
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user