mirror of
https://github.com/fankes/beszel.git
synced 2025-10-18 17:29:28 +08:00
update project structure
- move agent to /agent - change /beszel to /src - update workflows and docker builds
This commit is contained in:
349
agent/gpu.go
Normal file
349
agent/gpu.go
Normal file
@@ -0,0 +1,349 @@
|
||||
package agent
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os/exec"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/henrygd/beszel/src/entities/system"
|
||||
|
||||
"golang.org/x/exp/slog"
|
||||
)
|
||||
|
||||
const (
|
||||
// Commands
|
||||
nvidiaSmiCmd string = "nvidia-smi"
|
||||
rocmSmiCmd string = "rocm-smi"
|
||||
tegraStatsCmd string = "tegrastats"
|
||||
|
||||
// Polling intervals
|
||||
nvidiaSmiInterval string = "4" // in seconds
|
||||
tegraStatsInterval string = "3700" // in milliseconds
|
||||
rocmSmiInterval time.Duration = 4300 * time.Millisecond
|
||||
|
||||
// Command retry and timeout constants
|
||||
retryWaitTime time.Duration = 5 * time.Second
|
||||
maxFailureRetries int = 5
|
||||
|
||||
cmdBufferSize uint16 = 10 * 1024
|
||||
|
||||
// Unit Conversions
|
||||
mebibytesInAMegabyte float64 = 1.024 // nvidia-smi reports memory in MiB
|
||||
milliwattsInAWatt float64 = 1000.0 // tegrastats reports power in mW
|
||||
)
|
||||
|
||||
// GPUManager manages data collection for GPUs (either Nvidia or AMD)
|
||||
type GPUManager struct {
|
||||
sync.Mutex
|
||||
nvidiaSmi bool
|
||||
rocmSmi bool
|
||||
tegrastats bool
|
||||
GpuDataMap map[string]*system.GPUData
|
||||
}
|
||||
|
||||
// RocmSmiJson represents the JSON structure of rocm-smi output
|
||||
type RocmSmiJson struct {
|
||||
ID string `json:"GUID"`
|
||||
Name string `json:"Card series"`
|
||||
Temperature string `json:"Temperature (Sensor edge) (C)"`
|
||||
MemoryUsed string `json:"VRAM Total Used Memory (B)"`
|
||||
MemoryTotal string `json:"VRAM Total Memory (B)"`
|
||||
Usage string `json:"GPU use (%)"`
|
||||
PowerPackage string `json:"Average Graphics Package Power (W)"`
|
||||
PowerSocket string `json:"Current Socket Graphics Package Power (W)"`
|
||||
}
|
||||
|
||||
// gpuCollector defines a collector for a specific GPU management utility (nvidia-smi or rocm-smi)
|
||||
type gpuCollector struct {
|
||||
name string
|
||||
cmdArgs []string
|
||||
parse func([]byte) bool // returns true if valid data was found
|
||||
buf []byte
|
||||
}
|
||||
|
||||
var errNoValidData = fmt.Errorf("no valid GPU data found") // Error for missing data
|
||||
|
||||
// starts and manages the ongoing collection of GPU data for the specified GPU management utility
|
||||
func (c *gpuCollector) start() {
|
||||
for {
|
||||
err := c.collect()
|
||||
if err != nil {
|
||||
if err == errNoValidData {
|
||||
slog.Warn(c.name + " found no valid GPU data, stopping")
|
||||
break
|
||||
}
|
||||
slog.Warn(c.name+" failed, restarting", "err", err)
|
||||
time.Sleep(retryWaitTime)
|
||||
continue
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// collect executes the command, parses output with the assigned parser function
|
||||
func (c *gpuCollector) collect() error {
|
||||
cmd := exec.Command(c.name, c.cmdArgs...)
|
||||
stdout, err := cmd.StdoutPipe()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if err := cmd.Start(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
scanner := bufio.NewScanner(stdout)
|
||||
if c.buf == nil {
|
||||
c.buf = make([]byte, 0, cmdBufferSize)
|
||||
}
|
||||
scanner.Buffer(c.buf, bufio.MaxScanTokenSize)
|
||||
|
||||
for scanner.Scan() {
|
||||
hasValidData := c.parse(scanner.Bytes())
|
||||
if !hasValidData {
|
||||
return errNoValidData
|
||||
}
|
||||
}
|
||||
|
||||
if err := scanner.Err(); err != nil {
|
||||
return fmt.Errorf("scanner error: %w", err)
|
||||
}
|
||||
return cmd.Wait()
|
||||
}
|
||||
|
||||
// getJetsonParser returns a function to parse the output of tegrastats and update the GPUData map
|
||||
func (gm *GPUManager) getJetsonParser() func(output []byte) bool {
|
||||
// use closure to avoid recompiling the regex
|
||||
ramPattern := regexp.MustCompile(`RAM (\d+)/(\d+)MB`)
|
||||
gr3dPattern := regexp.MustCompile(`GR3D_FREQ (\d+)%`)
|
||||
tempPattern := regexp.MustCompile(`tj@(\d+\.?\d*)C`)
|
||||
// Orin Nano / NX do not have GPU specific power monitor
|
||||
// TODO: Maybe use VDD_IN for Nano / NX and add a total system power chart
|
||||
powerPattern := regexp.MustCompile(`(GPU_SOC|CPU_GPU_CV) (\d+)mW`)
|
||||
|
||||
// jetson devices have only one gpu so we'll just initialize here
|
||||
gpuData := &system.GPUData{Name: "GPU"}
|
||||
gm.GpuDataMap["0"] = gpuData
|
||||
|
||||
return func(output []byte) bool {
|
||||
gm.Lock()
|
||||
defer gm.Unlock()
|
||||
// Parse RAM usage
|
||||
ramMatches := ramPattern.FindSubmatch(output)
|
||||
if ramMatches != nil {
|
||||
gpuData.MemoryUsed, _ = strconv.ParseFloat(string(ramMatches[1]), 64)
|
||||
gpuData.MemoryTotal, _ = strconv.ParseFloat(string(ramMatches[2]), 64)
|
||||
}
|
||||
// Parse GR3D (GPU) usage
|
||||
gr3dMatches := gr3dPattern.FindSubmatch(output)
|
||||
if gr3dMatches != nil {
|
||||
gr3dUsage, _ := strconv.ParseFloat(string(gr3dMatches[1]), 64)
|
||||
gpuData.Usage += gr3dUsage
|
||||
}
|
||||
// Parse temperature
|
||||
tempMatches := tempPattern.FindSubmatch(output)
|
||||
if tempMatches != nil {
|
||||
gpuData.Temperature, _ = strconv.ParseFloat(string(tempMatches[1]), 64)
|
||||
}
|
||||
// Parse power usage
|
||||
powerMatches := powerPattern.FindSubmatch(output)
|
||||
if powerMatches != nil {
|
||||
power, _ := strconv.ParseFloat(string(powerMatches[2]), 64)
|
||||
gpuData.Power += power / milliwattsInAWatt
|
||||
}
|
||||
gpuData.Count++
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
// parseNvidiaData parses the output of nvidia-smi and updates the GPUData map
|
||||
func (gm *GPUManager) parseNvidiaData(output []byte) bool {
|
||||
gm.Lock()
|
||||
defer gm.Unlock()
|
||||
scanner := bufio.NewScanner(bytes.NewReader(output))
|
||||
var valid bool
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text() // Or use scanner.Bytes() for []byte
|
||||
fields := strings.Split(strings.TrimSpace(line), ", ")
|
||||
if len(fields) < 7 {
|
||||
continue
|
||||
}
|
||||
valid = true
|
||||
id := fields[0]
|
||||
temp, _ := strconv.ParseFloat(fields[2], 64)
|
||||
memoryUsage, _ := strconv.ParseFloat(fields[3], 64)
|
||||
totalMemory, _ := strconv.ParseFloat(fields[4], 64)
|
||||
usage, _ := strconv.ParseFloat(fields[5], 64)
|
||||
power, _ := strconv.ParseFloat(fields[6], 64)
|
||||
// add gpu if not exists
|
||||
if _, ok := gm.GpuDataMap[id]; !ok {
|
||||
name := strings.TrimPrefix(fields[1], "NVIDIA ")
|
||||
gm.GpuDataMap[id] = &system.GPUData{Name: strings.TrimSuffix(name, " Laptop GPU")}
|
||||
}
|
||||
// update gpu data
|
||||
gpu := gm.GpuDataMap[id]
|
||||
gpu.Temperature = temp
|
||||
gpu.MemoryUsed = memoryUsage / mebibytesInAMegabyte
|
||||
gpu.MemoryTotal = totalMemory / mebibytesInAMegabyte
|
||||
gpu.Usage += usage
|
||||
gpu.Power += power
|
||||
gpu.Count++
|
||||
}
|
||||
return valid
|
||||
}
|
||||
|
||||
// parseAmdData parses the output of rocm-smi and updates the GPUData map
|
||||
func (gm *GPUManager) parseAmdData(output []byte) bool {
|
||||
var rocmSmiInfo map[string]RocmSmiJson
|
||||
if err := json.Unmarshal(output, &rocmSmiInfo); err != nil || len(rocmSmiInfo) == 0 {
|
||||
return false
|
||||
}
|
||||
gm.Lock()
|
||||
defer gm.Unlock()
|
||||
for _, v := range rocmSmiInfo {
|
||||
var power float64
|
||||
if v.PowerPackage != "" {
|
||||
power, _ = strconv.ParseFloat(v.PowerPackage, 64)
|
||||
} else {
|
||||
power, _ = strconv.ParseFloat(v.PowerSocket, 64)
|
||||
}
|
||||
memoryUsage, _ := strconv.ParseFloat(v.MemoryUsed, 64)
|
||||
totalMemory, _ := strconv.ParseFloat(v.MemoryTotal, 64)
|
||||
usage, _ := strconv.ParseFloat(v.Usage, 64)
|
||||
|
||||
if _, ok := gm.GpuDataMap[v.ID]; !ok {
|
||||
gm.GpuDataMap[v.ID] = &system.GPUData{Name: v.Name}
|
||||
}
|
||||
gpu := gm.GpuDataMap[v.ID]
|
||||
gpu.Temperature, _ = strconv.ParseFloat(v.Temperature, 64)
|
||||
gpu.MemoryUsed = bytesToMegabytes(memoryUsage)
|
||||
gpu.MemoryTotal = bytesToMegabytes(totalMemory)
|
||||
gpu.Usage += usage
|
||||
gpu.Power += power
|
||||
gpu.Count++
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// sums and resets the current GPU utilization data since the last update
|
||||
func (gm *GPUManager) GetCurrentData() map[string]system.GPUData {
|
||||
gm.Lock()
|
||||
defer gm.Unlock()
|
||||
|
||||
// check for GPUs with the same name
|
||||
nameCounts := make(map[string]int)
|
||||
for _, gpu := range gm.GpuDataMap {
|
||||
nameCounts[gpu.Name]++
|
||||
}
|
||||
|
||||
// copy / reset the data
|
||||
gpuData := make(map[string]system.GPUData, len(gm.GpuDataMap))
|
||||
for id, gpu := range gm.GpuDataMap {
|
||||
gpuAvg := *gpu
|
||||
|
||||
gpuAvg.Temperature = twoDecimals(gpu.Temperature)
|
||||
gpuAvg.MemoryUsed = twoDecimals(gpu.MemoryUsed)
|
||||
gpuAvg.MemoryTotal = twoDecimals(gpu.MemoryTotal)
|
||||
|
||||
// avoid division by zero
|
||||
if gpu.Count > 0 {
|
||||
gpuAvg.Usage = twoDecimals(gpu.Usage / gpu.Count)
|
||||
gpuAvg.Power = twoDecimals(gpu.Power / gpu.Count)
|
||||
}
|
||||
|
||||
// reset accumulators in the original
|
||||
gpu.Usage, gpu.Power, gpu.Count = 0, 0, 0
|
||||
|
||||
// append id to the name if there are multiple GPUs with the same name
|
||||
if nameCounts[gpu.Name] > 1 {
|
||||
gpuAvg.Name = fmt.Sprintf("%s %s", gpu.Name, id)
|
||||
}
|
||||
gpuData[id] = gpuAvg
|
||||
}
|
||||
slog.Debug("GPU", "data", gpuData)
|
||||
return gpuData
|
||||
}
|
||||
|
||||
// detectGPUs checks for the presence of GPU management tools (nvidia-smi, rocm-smi, tegrastats)
|
||||
// in the system path. It sets the corresponding flags in the GPUManager struct if any of these
|
||||
// tools are found. If none of the tools are found, it returns an error indicating that no GPU
|
||||
// management tools are available.
|
||||
func (gm *GPUManager) detectGPUs() error {
|
||||
if _, err := exec.LookPath(nvidiaSmiCmd); err == nil {
|
||||
gm.nvidiaSmi = true
|
||||
}
|
||||
if _, err := exec.LookPath(rocmSmiCmd); err == nil {
|
||||
gm.rocmSmi = true
|
||||
}
|
||||
if _, err := exec.LookPath(tegraStatsCmd); err == nil {
|
||||
gm.tegrastats = true
|
||||
gm.nvidiaSmi = false
|
||||
}
|
||||
if gm.nvidiaSmi || gm.rocmSmi || gm.tegrastats {
|
||||
return nil
|
||||
}
|
||||
return fmt.Errorf("no GPU found - install nvidia-smi, rocm-smi, or tegrastats")
|
||||
}
|
||||
|
||||
// startCollector starts the appropriate GPU data collector based on the command
|
||||
func (gm *GPUManager) startCollector(command string) {
|
||||
collector := gpuCollector{
|
||||
name: command,
|
||||
}
|
||||
switch command {
|
||||
case nvidiaSmiCmd:
|
||||
collector.cmdArgs = []string{
|
||||
"-l", nvidiaSmiInterval,
|
||||
"--query-gpu=index,name,temperature.gpu,memory.used,memory.total,utilization.gpu,power.draw",
|
||||
"--format=csv,noheader,nounits",
|
||||
}
|
||||
collector.parse = gm.parseNvidiaData
|
||||
go collector.start()
|
||||
case tegraStatsCmd:
|
||||
collector.cmdArgs = []string{"--interval", tegraStatsInterval}
|
||||
collector.parse = gm.getJetsonParser()
|
||||
go collector.start()
|
||||
case rocmSmiCmd:
|
||||
collector.cmdArgs = []string{"--showid", "--showtemp", "--showuse", "--showpower", "--showproductname", "--showmeminfo", "vram", "--json"}
|
||||
collector.parse = gm.parseAmdData
|
||||
go func() {
|
||||
failures := 0
|
||||
for {
|
||||
if err := collector.collect(); err != nil {
|
||||
failures++
|
||||
if failures > maxFailureRetries {
|
||||
break
|
||||
}
|
||||
slog.Warn("Error collecting AMD GPU data", "err", err)
|
||||
}
|
||||
time.Sleep(rocmSmiInterval)
|
||||
}
|
||||
}()
|
||||
}
|
||||
}
|
||||
|
||||
// NewGPUManager creates and initializes a new GPUManager
|
||||
func NewGPUManager() (*GPUManager, error) {
|
||||
var gm GPUManager
|
||||
if err := gm.detectGPUs(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
gm.GpuDataMap = make(map[string]*system.GPUData)
|
||||
|
||||
if gm.nvidiaSmi {
|
||||
gm.startCollector(nvidiaSmiCmd)
|
||||
}
|
||||
if gm.rocmSmi {
|
||||
gm.startCollector(rocmSmiCmd)
|
||||
}
|
||||
if gm.tegrastats {
|
||||
gm.startCollector(tegraStatsCmd)
|
||||
}
|
||||
|
||||
return &gm, nil
|
||||
}
|
Reference in New Issue
Block a user