Files
komari-agent/monitoring/unit/gpu_detailed_linux.go
kdwycz 5304a68d5d 新增Linux GPU监控功能
参考了nezha-agent。实现了Linux服务器下NVIDIA显卡的监控。
通过 --gpu 参数启用显卡监控功能。
支持多显卡,显卡使用率,显存使用率监控
实现了AMD显卡的监控,但是未经过测试
2025-09-12 16:09:55 +08:00

213 lines
4.0 KiB
Go

//go:build linux
package monitoring
import (
"errors"
)
// GPU vendor identifiers. Zero is deliberately skipped so that it never
// names a vendor.
const (
	_ = iota
	// vendorAMD selects the ROCm (rocm-smi) code paths.
	vendorAMD
	// vendorNVIDIA selects the nvidia-smi code paths.
	vendorNVIDIA
)
// vendorType holds the GPU vendor chosen by getDetailedVendor. Because it is
// a package-level variable with an initializer, the probe runs once, during
// package initialisation, and the result is reused by every later call.
var vendorType = getDetailedVendor()
// DetailedGPUInfo describes a single GPU: its model name plus the memory,
// utilization and temperature readings reported for it.
type DetailedGPUInfo struct {
Name string `json:"name"` // GPU model name
MemoryTotal uint64 `json:"memory_total"` // total video memory (bytes)
MemoryUsed uint64 `json:"memory_used"` // used video memory (bytes)
Utilization float64 `json:"utilization"` // GPU utilization (0-100)
Temperature uint64 `json:"temperature"` // temperature (degrees Celsius)
}
// getDetailedVendor decides which vendor backend to use by running the
// NVIDIA usage query once. A successful query yields vendorNVIDIA; any
// error yields vendorAMD.
//
// NOTE(review): no AMD probe is performed here, so a host on which the
// NVIDIA query fails for any reason is classified as AMD. Confirm that this
// fallback is intended.
func getDetailedVendor() uint8 {
	if _, err := getNvidiaDetailedStat(); err != nil {
		return vendorAMD
	}
	return vendorNVIDIA
}
// getNvidiaDetailedStat creates an NvidiaSMI bound to /usr/bin/nvidia-smi,
// starts it and returns whatever GatherUsage reports. If either step fails
// the error is returned together with a nil slice.
func getNvidiaDetailedStat() ([]float64, error) {
	nv := &NvidiaSMI{BinPath: "/usr/bin/nvidia-smi"}
	if err := nv.Start(); err != nil {
		return nil, err
	}

	usage, err := nv.GatherUsage()
	if err != nil {
		return nil, err
	}
	return usage, nil
}
// getAMDDetailedStat creates a ROCmSMI bound to /opt/rocm/bin/rocm-smi,
// starts it and returns whatever GatherUsage reports. If either step fails
// the error is returned together with a nil slice.
func getAMDDetailedStat() ([]float64, error) {
	rocm := &ROCmSMI{BinPath: "/opt/rocm/bin/rocm-smi"}
	if err := rocm.Start(); err != nil {
		return nil, err
	}

	usage, err := rocm.GatherUsage()
	if err != nil {
		return nil, err
	}
	return usage, nil
}
// getNvidiaDetailedHost creates an NvidiaSMI bound to /usr/bin/nvidia-smi,
// starts it and returns the strings produced by GatherModel. If either step
// fails the error is returned together with a nil slice.
func getNvidiaDetailedHost() ([]string, error) {
	nv := &NvidiaSMI{BinPath: "/usr/bin/nvidia-smi"}
	if err := nv.Start(); err != nil {
		return nil, err
	}

	models, err := nv.GatherModel()
	if err != nil {
		return nil, err
	}
	return models, nil
}
// getAMDDetailedHost creates a ROCmSMI bound to /opt/rocm/bin/rocm-smi,
// starts it and returns the strings produced by GatherModel. If either step
// fails the error is returned together with a nil slice.
func getAMDDetailedHost() ([]string, error) {
	rocm := &ROCmSMI{BinPath: "/opt/rocm/bin/rocm-smi"}
	if err := rocm.Start(); err != nil {
		return nil, err
	}

	models, err := rocm.GatherModel()
	if err != nil {
		return nil, err
	}
	return models, nil
}
// GetDetailedGPUHost returns the GPU model information for the vendor stored
// in vendorType. It returns an "invalid vendor" error when vendorType is
// neither vendorAMD nor vendorNVIDIA, and a nil slice whenever the
// vendor-specific query fails.
func GetDetailedGPUHost() ([]string, error) {
	// Pick the vendor-specific query first, then run it in one place.
	var query func() ([]string, error)
	switch vendorType {
	case vendorAMD:
		query = getAMDDetailedHost
	case vendorNVIDIA:
		query = getNvidiaDetailedHost
	default:
		return nil, errors.New("invalid vendor")
	}

	models, err := query()
	if err != nil {
		return nil, err
	}
	return models, nil
}
// GetDetailedGPUState returns the GPU usage figures for the vendor stored in
// vendorType. It returns an "invalid vendor" error when vendorType is
// neither vendorAMD nor vendorNVIDIA, and a nil slice whenever the
// vendor-specific query fails.
func GetDetailedGPUState() ([]float64, error) {
	var (
		usage []float64
		err   error
	)

	switch vendorType {
	case vendorNVIDIA:
		usage, err = getNvidiaDetailedStat()
	case vendorAMD:
		usage, err = getAMDDetailedStat()
	default:
		return nil, errors.New("invalid vendor")
	}

	if err != nil {
		return nil, err
	}
	return usage, nil
}
// GetDetailedGPUInfo returns one DetailedGPUInfo per GPU for the vendor
// stored in vendorType. It returns an "invalid vendor" error when vendorType
// is neither vendorAMD nor vendorNVIDIA, and a nil slice whenever the
// vendor-specific query fails.
func GetDetailedGPUInfo() ([]DetailedGPUInfo, error) {
	// Pick the vendor-specific collector first, then run it in one place.
	var collect func() ([]DetailedGPUInfo, error)
	switch vendorType {
	case vendorAMD:
		collect = getAMDDetailedInfo
	case vendorNVIDIA:
		collect = getNvidiaDetailedInfo
	default:
		return nil, errors.New("invalid vendor")
	}

	infos, err := collect()
	if err != nil {
		return nil, err
	}
	return infos, nil
}
// getNvidiaDetailedInfo creates an NvidiaSMI bound to /usr/bin/nvidia-smi,
// starts it, calls GatherDetailedInfo and copies each returned record into a
// DetailedGPUInfo. If starting or gathering fails the error is returned
// together with a nil slice.
func getNvidiaDetailedInfo() ([]DetailedGPUInfo, error) {
	nv := &NvidiaSMI{BinPath: "/usr/bin/nvidia-smi"}
	if err := nv.Start(); err != nil {
		return nil, err
	}

	records, err := nv.GatherDetailedInfo()
	if err != nil {
		return nil, err
	}

	// Declared without make so that an empty result remains a nil slice.
	var infos []DetailedGPUInfo
	for _, rec := range records {
		infos = append(infos, DetailedGPUInfo{
			Name:        rec.Name,
			MemoryTotal: rec.MemoryTotal,
			MemoryUsed:  rec.MemoryUsed,
			Utilization: rec.Utilization,
			Temperature: rec.Temperature,
		})
	}
	return infos, nil
}
// getAMDDetailedInfo creates a ROCmSMI bound to /opt/rocm/bin/rocm-smi,
// starts it, calls GatherDetailedInfo and copies each returned record into a
// DetailedGPUInfo. If starting or gathering fails the error is returned
// together with a nil slice.
func getAMDDetailedInfo() ([]DetailedGPUInfo, error) {
	rocm := &ROCmSMI{BinPath: "/opt/rocm/bin/rocm-smi"}
	if err := rocm.Start(); err != nil {
		return nil, err
	}

	records, err := rocm.GatherDetailedInfo()
	if err != nil {
		return nil, err
	}

	// Declared without make so that an empty result remains a nil slice.
	var infos []DetailedGPUInfo
	for _, rec := range records {
		infos = append(infos, DetailedGPUInfo{
			Name:        rec.Name,
			MemoryTotal: rec.MemoryTotal,
			MemoryUsed:  rec.MemoryUsed,
			Utilization: rec.Utilization,
			Temperature: rec.Temperature,
		})
	}
	return infos, nil
}