Compare commits

..

6 Commits

Author SHA1 Message Date
Akizon
9da02f615f Merge pull request #34 from fankes/main
feat: 添加自定义 DNS 解析器
2025-09-17 16:52:58 +08:00
Akizon
6bdf718dc0 Merge branch 'main' into main 2025-09-17 16:52:51 +08:00
Akizon
518f782185 Merge pull request #35 from kdwycz/dev
新增Linux GPU监控功能
2025-09-17 16:50:12 +08:00
Akizon77
396fe5cfc2 fix: 排除hugetlbfs 2025-09-16 00:28:58 +08:00
kdwycz
ad3c02c22c Merge branch 'main' into dev 2025-09-15 20:23:56 +08:00
kdwycz
5304a68d5d 新增Linux GPU监控功能
参考了nezha-agent。实现了Linux服务器下NVIDIA显卡的监控。
通过 --gpu 参数启用显卡监控功能。
支持多显卡,显卡使用率,显存使用率监控
实现了AMD显卡的监控,但是未经过测试
2025-09-12 16:09:55 +08:00
10 changed files with 827 additions and 2 deletions

View File

@@ -20,4 +20,5 @@ var (
CFAccessClientSecret string
MemoryIncludeCache bool
CustomDNS string
EnableGPU bool // 启用详细GPU监控
)

View File

@@ -114,5 +114,6 @@ func init() {
RootCmd.PersistentFlags().StringVar(&flags.CFAccessClientSecret, "cf-access-client-secret", "", "Cloudflare Access Client Secret")
RootCmd.PersistentFlags().BoolVar(&flags.MemoryIncludeCache, "memory-include-cache", false, "Include cache/buffer in memory usage")
RootCmd.PersistentFlags().StringVar(&flags.CustomDNS, "custom-dns", "", "Custom DNS server to use (e.g. 8.8.8.8, 114.114.114.114). By default, the program will use multiple built-in DNS servers with failover support.")
RootCmd.PersistentFlags().BoolVar(&flags.EnableGPU, "gpu", false, "Enable detailed GPU monitoring (usage, memory, multi-GPU support)")
RootCmd.PersistentFlags().ParseErrorsWhitelist.UnknownFlags = true
}

View File

@@ -5,6 +5,7 @@ import (
"fmt"
"log"
"github.com/komari-monitor/komari-agent/cmd/flags"
monitoring "github.com/komari-monitor/komari-agent/monitoring/unit"
)
@@ -74,6 +75,46 @@ func GenerateReport() []byte {
processcount := monitoring.ProcessCount()
data["process"] = processcount
// GPU监控 - 根据标志决定详细程度
if flags.EnableGPU {
// 详细GPU监控模式
gpuInfo, err := monitoring.GetDetailedGPUInfo()
if err != nil {
message += fmt.Sprintf("failed to get detailed GPU info: %v\n", err)
// 降级到基础GPU信息
gpuNames, nameErr := monitoring.GetDetailedGPUHost()
if nameErr == nil && len(gpuNames) > 0 {
data["gpu"] = map[string]interface{}{
"models": gpuNames,
}
}
} else {
// 成功获取详细信息
gpuData := make([]map[string]interface{}, len(gpuInfo))
totalGPUUsage := 0.0
for i, info := range gpuInfo {
gpuData[i] = map[string]interface{}{
"name": info.Name,
"memory_total": info.MemoryTotal,
"memory_used": info.MemoryUsed,
"utilization": info.Utilization,
"temperature": info.Temperature,
}
totalGPUUsage += info.Utilization
}
avgGPUUsage := totalGPUUsage / float64(len(gpuInfo))
data["gpu"] = map[string]interface{}{
"count": len(gpuInfo),
"average_usage": avgGPUUsage,
"detailed_info": gpuData,
}
}
}
// 基础模式下GPU信息已在basicInfo中处理
data["message"] = message
s, err := json.Marshal(data)

View File

@@ -77,6 +77,7 @@ func isPhysicalDisk(part disk.PartitionStat) bool {
"/dev/mqueue",
"/etc/resolv.conf",
"/etc/host", // /etc/hosts,/etc/hostname
"/dev/hugepages",
}
for _, mp := range mountpointsToExclude {
if mountpoint == mp || strings.HasPrefix(mountpoint, mp) {
@@ -100,6 +101,7 @@ func isPhysicalDisk(part disk.PartitionStat) bool {
"sysfs",
"cgroup",
"mqueue",
"hugetlbfs",
}
for _, fs := range fstypeToExclude {
if fstype == fs || strings.HasPrefix(fstype, fs) {

View File

@@ -0,0 +1,246 @@
package monitoring
// Modified from https://github.com/influxdata/telegraf/blob/master/plugins/inputs/amd_rocm_smi/amd_rocm_smi.go
// Original License: MIT
import (
	"encoding/json"
	"errors"
	"os"
	"os/exec"
	"sort"
	"strconv"
	"strings"
)
// ROCmSMI runs the rocm-smi command line tool and keeps its raw output so the
// gather methods can parse it.
type ROCmSMI struct {
BinPath string // path of the rocm-smi binary; Start looks the tool up in PATH when this path does not exist
data []byte // raw output captured by the last call to Start
}
// AMDGPUInfo holds the detailed readings collected for a single AMD GPU.
type AMDGPUInfo struct {
Name string // GPU model name
MemoryTotal uint64 // total video memory (bytes)
MemoryUsed uint64 // used video memory (bytes)
Utilization float64 // GPU utilization (0-100)
Temperature uint64 // temperature (degrees Celsius)
}
// ROCmResponse models the rocm-smi JSON response as a map from top-level key
// to the per-card fields.
// NOTE(review): the gather methods visible here decode into a generic map and
// do not reference this type — confirm whether it is still needed.
type ROCmResponse map[string]ROCmGPUInfo
// ROCmGPUInfo lists the per-card rocm-smi JSON fields of interest. Every value
// is kept as the raw string found in the JSON document.
type ROCmGPUInfo struct {
CardSeries string `json:"Card series"` // GPU model name
GPUUsage string `json:"GPU use (%)"` // GPU utilization in percent
VRAMTotalMemory string `json:"VRAM Total Memory (B)"` // total video memory in bytes
VRAMTotalUsedMemory string `json:"VRAM Total Used Memory (B)"` // used video memory in bytes
TemperatureJunction string `json:"Temperature (Sensor junction) (C)"` // junction sensor temperature in degrees Celsius
}
// GatherModel returns the model names parsed from the output captured by
// Start. It is the exported wrapper around gatherModel.
func (rsmi *ROCmSMI) GatherModel() ([]string, error) {
return rsmi.gatherModel()
}
// GatherUsage returns the per-card utilization percentages parsed from the
// output captured by Start. It is the exported wrapper around gatherUsage.
func (rsmi *ROCmSMI) GatherUsage() ([]float64, error) {
return rsmi.gatherUsage()
}
// GatherDetailedInfo returns the detailed per-card information parsed from the
// output captured by Start. It is the exported wrapper around
// gatherDetailedInfo.
func (rsmi *ROCmSMI) GatherDetailedInfo() ([]AMDGPUInfo, error) {
return rsmi.gatherDetailedInfo()
}
// Start locates the rocm-smi binary, runs it once and caches its output for
// the gather methods.
//
// When BinPath does not exist the tool is looked up in PATH and BinPath is
// updated with the result. An error is returned when the tool cannot be found
// or when running it produced no output, so callers learn about a failed
// execution here instead of through a later parse error.
func (rsmi *ROCmSMI) Start() error {
	if _, err := os.Stat(rsmi.BinPath); os.IsNotExist(err) {
		binPath, err := exec.LookPath("rocm-smi")
		if err != nil {
			return errors.New("rocm-smi tool not found")
		}
		rsmi.BinPath = binPath
	}

	rsmi.data = rsmi.pollROCmSMI()
	if len(rsmi.data) == 0 {
		// pollROCmSMI returns nil when the command fails.
		return errors.New("rocm-smi returned no data")
	}
	return nil
}
// pollROCmSMI runs "rocm-smi --showallinfo --json" and returns what the tool
// wrote to standard output, or nil when the command fails.
//
// Only stdout is captured: anything the tool prints on stderr would otherwise
// be mixed into the JSON document and break parsing.
func (rsmi *ROCmSMI) pollROCmSMI() []byte {
	cmd := exec.Command(rsmi.BinPath, "--showallinfo", "--json")
	output, err := cmd.Output()
	if err != nil {
		return nil
	}
	return output
}
// gatherModel extracts the "Card series" name of every card found in the
// cached rocm-smi JSON output.
//
// Only top-level keys starting with "card" are considered, and entries whose
// "Card series" value is missing, not a string or empty are skipped. Cards are
// visited in ascending key order (card0, card1, ..., card10) so the result is
// stable between calls; ranging over the decoded map directly would produce a
// different order each time.
//
// The json.Unmarshal error is returned when the cached output is not valid
// JSON.
func (rsmi *ROCmSMI) gatherModel() ([]string, error) {
	var data map[string]interface{}
	if err := json.Unmarshal(rsmi.data, &data); err != nil {
		return nil, err
	}

	cards := make([]string, 0, len(data))
	for key := range data {
		if strings.HasPrefix(key, "card") {
			cards = append(cards, key)
		}
	}
	// Shorter keys first so that "card2" is ordered before "card10".
	sort.Slice(cards, func(i, j int) bool {
		if len(cards[i]) != len(cards[j]) {
			return len(cards[i]) < len(cards[j])
		}
		return cards[i] < cards[j]
	})

	var models []string
	for _, key := range cards {
		cardData, ok := data[key].(map[string]interface{})
		if !ok {
			continue
		}
		if name, ok := cardData["Card series"].(string); ok && name != "" {
			models = append(models, name)
		}
	}
	return models, nil
}
// gatherUsage extracts the "GPU use (%)" value of every card found in the
// cached rocm-smi JSON output.
//
// Every top-level "card*" object contributes exactly one entry; a missing or
// unparsable utilization is reported as 0 rather than dropping the card. Cards
// are visited in ascending key order (card0, card1, ..., card10) so that an
// index refers to the same card on every call; ranging over the decoded map
// directly would shuffle the entries.
//
// The json.Unmarshal error is returned when the cached output is not valid
// JSON.
func (rsmi *ROCmSMI) gatherUsage() ([]float64, error) {
	var data map[string]interface{}
	if err := json.Unmarshal(rsmi.data, &data); err != nil {
		return nil, err
	}

	cards := make([]string, 0, len(data))
	for key := range data {
		if strings.HasPrefix(key, "card") {
			cards = append(cards, key)
		}
	}
	// Shorter keys first so that "card2" is ordered before "card10".
	sort.Slice(cards, func(i, j int) bool {
		if len(cards[i]) != len(cards[j]) {
			return len(cards[i]) < len(cards[j])
		}
		return cards[i] < cards[j]
	})

	var usageList []float64
	for _, key := range cards {
		cardData, ok := data[key].(map[string]interface{})
		if !ok {
			continue
		}
		usage := 0.0
		if raw, ok := cardData["GPU use (%)"].(string); ok {
			if parsed, err := parseAMDPercentage(raw); err == nil {
				usage = parsed
			}
		}
		usageList = append(usageList, usage)
	}
	return usageList, nil
}
// gatherDetailedInfo builds one AMDGPUInfo per card found in the cached
// rocm-smi JSON output.
//
// Every top-level "card*" object contributes one entry. Each field is read
// best-effort: a value that is missing, not a string or unparsable leaves the
// corresponding field at its zero value. Cards are visited in ascending key
// order (card0, card1, ..., card10) so that an index refers to the same card
// on every call; ranging over the decoded map directly would shuffle the
// entries.
//
// An error is returned when no output has been cached or when it is not valid
// JSON.
func (rsmi *ROCmSMI) gatherDetailedInfo() ([]AMDGPUInfo, error) {
	if rsmi.data == nil {
		return nil, errors.New("no data available")
	}

	var data map[string]interface{}
	if err := json.Unmarshal(rsmi.data, &data); err != nil {
		return nil, err
	}

	cards := make([]string, 0, len(data))
	for key := range data {
		if strings.HasPrefix(key, "card") {
			cards = append(cards, key)
		}
	}
	// Shorter keys first so that "card2" is ordered before "card10".
	sort.Slice(cards, func(i, j int) bool {
		if len(cards[i]) != len(cards[j]) {
			return len(cards[i]) < len(cards[j])
		}
		return cards[i] < cards[j]
	})

	var gpuInfos []AMDGPUInfo
	for _, key := range cards {
		cardData, ok := data[key].(map[string]interface{})
		if !ok {
			continue
		}

		gpuInfo := AMDGPUInfo{}

		// GPU model name.
		if name, ok := cardData["Card series"].(string); ok {
			gpuInfo.Name = name
		}

		// Utilization.
		if raw, ok := cardData["GPU use (%)"].(string); ok {
			if usage, err := parseAMDPercentage(raw); err == nil {
				gpuInfo.Utilization = usage
			}
		}

		// Video memory.
		if raw, ok := cardData["VRAM Total Used Memory (B)"].(string); ok {
			if memUsed, err := parseAMDMemoryBytes(raw); err == nil {
				gpuInfo.MemoryUsed = memUsed
			}
		}
		if raw, ok := cardData["VRAM Total Memory (B)"].(string); ok {
			if memTotal, err := parseAMDMemoryBytes(raw); err == nil {
				gpuInfo.MemoryTotal = memTotal
			}
		}

		// Temperature.
		if raw, ok := cardData["Temperature (Sensor junction) (C)"].(string); ok {
			if temp, err := parseAMDTemperature(raw); err == nil {
				gpuInfo.Temperature = temp
			}
		}

		gpuInfos = append(gpuInfos, gpuInfo)
	}
	return gpuInfos, nil
}
// parseAMDPercentage converts an AMD percentage string into a float
// (for example "25" -> 25.0). Surrounding whitespace and one trailing "%" are
// ignored; an empty value yields 0 without an error.
func parseAMDPercentage(value string) (float64, error) {
	text := strings.TrimSpace(strings.TrimSuffix(strings.TrimSpace(value), "%"))
	if len(text) == 0 {
		return 0.0, nil
	}

	percent, err := strconv.ParseFloat(text, 64)
	if err == nil {
		return percent, nil
	}
	return 0.0, err
}
// parseAMDMemoryBytes converts an AMD memory string that is already expressed
// in bytes into a number (for example "1073741824" -> 1073741824 bytes).
// Surrounding whitespace is ignored; an empty value yields 0 without an error.
func parseAMDMemoryBytes(value string) (uint64, error) {
	text := strings.TrimSpace(value)
	if len(text) == 0 {
		return 0, nil
	}

	// The value is already a byte count, so no unit conversion is applied.
	size, err := strconv.ParseUint(text, 10, 64)
	if err == nil {
		return size, nil
	}
	return 0, err
}
// parseAMDTemperature converts an AMD temperature string into whole degrees
// Celsius (for example "65" -> 65, "36.0" -> 36).
//
// Surrounding whitespace and one trailing "C" are ignored, and an empty value
// yields 0 without an error. Fractional readings are rounded to the nearest
// degree; rocm-smi may report values such as "36.0", which an integer-only
// parse would reject. Values that are not numbers, are negative, or are
// implausibly large produce an error.
func parseAMDTemperature(value string) (uint64, error) {
	cleaned := strings.TrimSpace(value)
	cleaned = strings.TrimSuffix(cleaned, "C")
	cleaned = strings.TrimSpace(cleaned)
	if cleaned == "" {
		return 0, nil
	}

	// Fast path: plain integer readings.
	if result, err := strconv.ParseUint(cleaned, 10, 64); err == nil {
		return result, nil
	}

	reading, err := strconv.ParseFloat(cleaned, 64)
	if err != nil {
		return 0, err
	}
	// The negated comparison also rejects NaN, which ParseFloat accepts.
	if !(reading >= 0 && reading <= 1e6) {
		return 0, errors.New("temperature out of range")
	}
	return uint64(reading + 0.5), nil
}

View File

@@ -0,0 +1,31 @@
//go:build !linux
package monitoring
import (
"errors"
)
// DetailedGPUInfo describes the detailed state of a single GPU.
type DetailedGPUInfo struct {
Name string `json:"name"` // GPU model name
MemoryTotal uint64 `json:"memory_total"` // total video memory (bytes)
MemoryUsed uint64 `json:"memory_used"` // used video memory (bytes)
Utilization float64 `json:"utilization"` // GPU utilization (0-100)
Temperature uint64 `json:"temperature"` // temperature (degrees Celsius)
}
// GetDetailedGPUHost is the fallback implementation of the GPU model lookup
// for builds where detailed GPU monitoring is unavailable. It always returns a
// nil slice together with an error.
func GetDetailedGPUHost() ([]string, error) {
	var models []string
	unsupported := errors.New("detailed GPU monitoring not supported on this platform")
	return models, unsupported
}
// GetDetailedGPUState is the fallback implementation of the GPU utilization
// lookup for builds where detailed GPU monitoring is unavailable. It always
// returns a nil slice together with an error.
func GetDetailedGPUState() ([]float64, error) {
	var usage []float64
	unsupported := errors.New("detailed GPU monitoring not supported on this platform")
	return usage, unsupported
}
// GetDetailedGPUInfo is the fallback implementation of the detailed GPU
// information lookup for builds where detailed GPU monitoring is unavailable.
// It always returns a nil slice together with an error.
func GetDetailedGPUInfo() ([]DetailedGPUInfo, error) {
	var infos []DetailedGPUInfo
	unsupported := errors.New("detailed GPU monitoring not supported on this platform")
	return infos, unsupported
}

View File

@@ -0,0 +1,213 @@
//go:build linux
package monitoring
import (
"errors"
)
// GPU vendor identifiers used to choose between the rocm-smi and nvidia-smi
// backed collectors. Values start at 1, so 0 matches neither case and falls
// through to the "invalid vendor" default in the vendor switches.
const (
vendorAMD = iota + 1
vendorNVIDIA
)
// vendorType caches the vendor chosen by getDetailedVendor. The initializer
// runs once, during package initialization.
// NOTE(review): the probe therefore appears to run even when GPU monitoring is
// not enabled, and its result is never refreshed — consider detecting lazily.
var vendorType = getDetailedVendor()
// DetailedGPUInfo describes the detailed state of a single GPU.
type DetailedGPUInfo struct {
Name string `json:"name"` // GPU model name
MemoryTotal uint64 `json:"memory_total"` // total video memory (bytes)
MemoryUsed uint64 `json:"memory_used"` // used video memory (bytes)
Utilization float64 `json:"utilization"` // GPU utilization (0-100)
Temperature uint64 `json:"temperature"` // temperature (degrees Celsius)
}
// getDetailedVendor decides which vendor backend to use by attempting to
// collect a usage sample through nvidia-smi. Success selects vendorNVIDIA; any
// failure selects vendorAMD without probing rocm-smi.
func getDetailedVendor() uint8 {
	if _, probeErr := getNvidiaDetailedStat(); probeErr == nil {
		return vendorNVIDIA
	}
	return vendorAMD
}
// getNvidiaDetailedStat runs nvidia-smi, preferring /usr/bin/nvidia-smi, and
// returns the per-GPU utilization values. A nil slice and the error are
// returned when the tool cannot be started or its output cannot be parsed.
func getNvidiaDetailedStat() ([]float64, error) {
	var smi NvidiaSMI
	smi.BinPath = "/usr/bin/nvidia-smi"

	if startErr := smi.Start(); startErr != nil {
		return nil, startErr
	}

	usage, gatherErr := smi.GatherUsage()
	if gatherErr != nil {
		return nil, gatherErr
	}
	return usage, nil
}
// getAMDDetailedStat runs rocm-smi, preferring /opt/rocm/bin/rocm-smi, and
// returns the per-GPU utilization values. A nil slice and the error are
// returned when the tool cannot be started or its output cannot be parsed.
func getAMDDetailedStat() ([]float64, error) {
	var rsmi ROCmSMI
	rsmi.BinPath = "/opt/rocm/bin/rocm-smi"

	if startErr := rsmi.Start(); startErr != nil {
		return nil, startErr
	}

	usage, gatherErr := rsmi.GatherUsage()
	if gatherErr != nil {
		return nil, gatherErr
	}
	return usage, nil
}
// getNvidiaDetailedHost runs nvidia-smi, preferring /usr/bin/nvidia-smi, and
// returns the GPU model names. A nil slice and the error are returned when the
// tool cannot be started or its output cannot be parsed.
func getNvidiaDetailedHost() ([]string, error) {
	var smi NvidiaSMI
	smi.BinPath = "/usr/bin/nvidia-smi"

	if startErr := smi.Start(); startErr != nil {
		return nil, startErr
	}

	models, gatherErr := smi.GatherModel()
	if gatherErr != nil {
		return nil, gatherErr
	}
	return models, nil
}
// getAMDDetailedHost runs rocm-smi, preferring /opt/rocm/bin/rocm-smi, and
// returns the GPU model names. A nil slice and the error are returned when the
// tool cannot be started or its output cannot be parsed.
func getAMDDetailedHost() ([]string, error) {
	var rsmi ROCmSMI
	rsmi.BinPath = "/opt/rocm/bin/rocm-smi"

	if startErr := rsmi.Start(); startErr != nil {
		return nil, startErr
	}

	models, gatherErr := rsmi.GatherModel()
	if gatherErr != nil {
		return nil, gatherErr
	}
	return models, nil
}
// GetDetailedGPUHost returns the GPU model names reported by the backend that
// matches the detected vendor. An unknown vendor yields an "invalid vendor"
// error; a backend failure yields a nil slice and that error.
func GetDetailedGPUHost() ([]string, error) {
	var collect func() ([]string, error)

	switch vendorType {
	case vendorAMD:
		collect = getAMDDetailedHost
	case vendorNVIDIA:
		collect = getNvidiaDetailedHost
	default:
		return nil, errors.New("invalid vendor")
	}

	models, err := collect()
	if err != nil {
		return nil, err
	}
	return models, nil
}
// GetDetailedGPUState returns the per-GPU utilization reported by the backend
// that matches the detected vendor. An unknown vendor yields an "invalid
// vendor" error; a backend failure yields a nil slice and that error.
func GetDetailedGPUState() ([]float64, error) {
	var collect func() ([]float64, error)

	switch vendorType {
	case vendorAMD:
		collect = getAMDDetailedStat
	case vendorNVIDIA:
		collect = getNvidiaDetailedStat
	default:
		return nil, errors.New("invalid vendor")
	}

	usage, err := collect()
	if err != nil {
		return nil, err
	}
	return usage, nil
}
// GetDetailedGPUInfo returns the detailed per-GPU information reported by the
// backend that matches the detected vendor. An unknown vendor yields an
// "invalid vendor" error; a backend failure yields a nil slice and that error.
func GetDetailedGPUInfo() ([]DetailedGPUInfo, error) {
	var collect func() ([]DetailedGPUInfo, error)

	switch vendorType {
	case vendorAMD:
		collect = getAMDDetailedInfo
	case vendorNVIDIA:
		collect = getNvidiaDetailedInfo
	default:
		return nil, errors.New("invalid vendor")
	}

	infos, err := collect()
	if err != nil {
		return nil, err
	}
	return infos, nil
}
// getNvidiaDetailedInfo runs nvidia-smi, preferring /usr/bin/nvidia-smi, and
// converts every reported GPU into the vendor-neutral DetailedGPUInfo. A nil
// slice and the error are returned when the tool cannot be started or its
// output cannot be parsed.
func getNvidiaDetailedInfo() ([]DetailedGPUInfo, error) {
	var smi NvidiaSMI
	smi.BinPath = "/usr/bin/nvidia-smi"

	if startErr := smi.Start(); startErr != nil {
		return nil, startErr
	}

	cards, gatherErr := smi.GatherDetailedInfo()
	if gatherErr != nil {
		return nil, gatherErr
	}

	// The result stays nil when no GPU was reported.
	var converted []DetailedGPUInfo
	for i := range cards {
		converted = append(converted, DetailedGPUInfo{
			Name:        cards[i].Name,
			MemoryTotal: cards[i].MemoryTotal,
			MemoryUsed:  cards[i].MemoryUsed,
			Utilization: cards[i].Utilization,
			Temperature: cards[i].Temperature,
		})
	}
	return converted, nil
}
// getAMDDetailedInfo runs rocm-smi, preferring /opt/rocm/bin/rocm-smi, and
// converts every reported GPU into the vendor-neutral DetailedGPUInfo. A nil
// slice and the error are returned when the tool cannot be started or its
// output cannot be parsed.
func getAMDDetailedInfo() ([]DetailedGPUInfo, error) {
	var rsmi ROCmSMI
	rsmi.BinPath = "/opt/rocm/bin/rocm-smi"

	if startErr := rsmi.Start(); startErr != nil {
		return nil, startErr
	}

	cards, gatherErr := rsmi.GatherDetailedInfo()
	if gatherErr != nil {
		return nil, gatherErr
	}

	// The result stays nil when no GPU was reported.
	var converted []DetailedGPUInfo
	for i := range cards {
		converted = append(converted, DetailedGPUInfo{
			Name:        cards[i].Name,
			MemoryTotal: cards[i].MemoryTotal,
			MemoryUsed:  cards[i].MemoryUsed,
			Utilization: cards[i].Utilization,
			Temperature: cards[i].Temperature,
		})
	}
	return converted, nil
}

View File

@@ -0,0 +1,67 @@
package monitoring
import (
"testing"
)
// TestDetailedGPUDetection is a smoke test that logs whatever the detailed GPU
// collectors report on the current machine. Collection failures are logged
// rather than treated as test failures, because the machine running the test
// may have no supported GPU.
func TestDetailedGPUDetection(t *testing.T) {
	// Memory values are reported in bytes; convert them for the "MB" log labels.
	const bytesPerMB = 1024 * 1024

	models, err := GetDetailedGPUHost()
	if err != nil {
		t.Logf("Detailed GPU detection failed (may be normal on non-Linux or non-GPU systems): %v", err)
		return
	}
	t.Logf("Detected GPUs: %v", models)

	if len(models) == 0 {
		return
	}

	usage, err := GetDetailedGPUState()
	if err != nil {
		t.Logf("GPU state collection failed: %v", err)
	} else {
		t.Logf("GPU usage: %v", usage)
	}

	// Exercise the detailed information collector as well.
	detailedInfo, err := GetDetailedGPUInfo()
	if err != nil {
		t.Logf("GPU detailed info collection failed: %v", err)
		return
	}
	for i, info := range detailedInfo {
		t.Logf("GPU %d: %s - Memory: %dMB/%dMB, Usage: %.1f%%, Temp: %d°C",
			i, info.Name, info.MemoryUsed/bytesPerMB, info.MemoryTotal/bytesPerMB, info.Utilization, info.Temperature)
	}
}
// TestDetailedGPUInfo logs the detailed information of every detected GPU and
// checks that the reported values are plausible. A collection failure or an
// empty result is logged and ends the test without failing it, because the
// machine running the test may have no supported GPU.
func TestDetailedGPUInfo(t *testing.T) {
	// Memory values are reported in bytes; convert them for the "MB" log labels.
	const bytesPerMB = 1024 * 1024

	detailedInfo, err := GetDetailedGPUInfo()
	if err != nil {
		t.Logf("GPU detailed info test failed (may be normal): %v", err)
		return
	}
	if len(detailedInfo) == 0 {
		t.Log("No detailed GPU info available")
		return
	}

	for i, info := range detailedInfo {
		// DetailedGPUInfo has no free-memory field, so derive it here and
		// guard the unsigned subtraction against used > total.
		var memoryFree uint64
		if info.MemoryUsed <= info.MemoryTotal {
			memoryFree = info.MemoryTotal - info.MemoryUsed
		}

		t.Logf("GPU %d Details:", i)
		t.Logf(" Name: %s", info.Name)
		t.Logf(" Memory Total: %d MB", info.MemoryTotal/bytesPerMB)
		t.Logf(" Memory Used: %d MB", info.MemoryUsed/bytesPerMB)
		t.Logf(" Memory Free: %d MB", memoryFree/bytesPerMB)
		t.Logf(" Utilization: %.1f%%", info.Utilization)
		t.Logf(" Temperature: %d°C", info.Temperature)

		// Sanity checks on the reported values.
		if info.MemoryTotal > 0 && info.MemoryUsed > info.MemoryTotal {
			t.Logf("Warning: Memory usage calculation may be inconsistent for %s", info.Name)
		}
		if info.Utilization < 0 || info.Utilization > 100 {
			t.Errorf("Invalid utilization value for %s: %.1f%%", info.Name, info.Utilization)
		}
	}
}

View File

@@ -9,13 +9,26 @@ import (
)
func GpuName() string {
accept := []string{"vga", "nvidia", "amd", "radeon", "render"}
// 调整优先级:专用显卡厂商优先,避免只识别集成显卡
accept := []string{"nvidia", "amd", "radeon", "vga", "3d"}
out, err := exec.Command("lspci").Output()
if err == nil {
lines := strings.Split(string(out), "\n")
// 首先尝试找专用显卡
for _, line := range lines {
lower := strings.ToLower(line)
// 跳过集成显卡和管理控制器
if strings.Contains(lower, "aspeed") ||
strings.Contains(lower, "matrox") ||
strings.Contains(lower, "management") {
continue
}
// 优先匹配专用显卡厂商
for _, a := range accept {
if strings.Contains(strings.ToLower(line), a) {
if strings.Contains(lower, a) {
parts := strings.SplitN(line, ":", 4)
if len(parts) >= 4 {
return strings.TrimSpace(parts[3])
@@ -27,6 +40,16 @@ func GpuName() string {
}
}
}
// 如果没有找到专用显卡返回第一个VGA设备作为兜底
for _, line := range lines {
if strings.Contains(strings.ToLower(line), "vga") {
parts := strings.SplitN(line, ":", 4)
if len(parts) >= 3 {
return strings.TrimSpace(parts[2])
}
}
}
}
return "None"
}

View File

@@ -0,0 +1,200 @@
package monitoring
// Modified from https://github.com/influxdata/telegraf/blob/master/plugins/inputs/nvidia_smi/nvidia_smi.go
// Original License: MIT
import (
"encoding/xml"
"errors"
"os"
"os/exec"
"strconv"
"strings"
)
// NvidiaSMI runs the nvidia-smi command line tool and keeps its raw output so
// the gather methods can parse it.
type NvidiaSMI struct {
BinPath string // path of the nvidia-smi binary; Start looks the tool up in PATH when this path does not exist
data []byte // raw output captured by the last call to Start
}
// NVIDIAGPUInfo holds the detailed readings collected for a single NVIDIA GPU.
type NVIDIAGPUInfo struct {
Name string // GPU model name
MemoryTotal uint64 // total video memory (bytes)
MemoryUsed uint64 // used video memory (bytes)
Utilization float64 // GPU utilization (0-100)
Temperature uint64 // temperature (degrees Celsius)
}
// GatherModel returns the model names parsed from the output captured by
// Start. It is the exported wrapper around gatherModel.
func (smi *NvidiaSMI) GatherModel() ([]string, error) {
return smi.gatherModel()
}
// GatherUsage returns the per-GPU utilization percentages parsed from the
// output captured by Start. It is the exported wrapper around gatherUsage.
func (smi *NvidiaSMI) GatherUsage() ([]float64, error) {
return smi.gatherUsage()
}
// GatherDetailedInfo returns the detailed per-GPU information parsed from the
// output captured by Start. It is the exported wrapper around
// gatherDetailedInfo.
func (smi *NvidiaSMI) GatherDetailedInfo() ([]NVIDIAGPUInfo, error) {
return smi.gatherDetailedInfo()
}
// Start locates the nvidia-smi binary, runs it once and caches its output for
// the gather methods.
//
// When BinPath does not exist the tool is looked up in PATH and BinPath is
// updated with the result. An error is returned when the tool cannot be found
// or when running it produced no output, so callers learn about a failed
// execution here instead of through a later parse error.
func (smi *NvidiaSMI) Start() error {
	if _, err := os.Stat(smi.BinPath); os.IsNotExist(err) {
		binPath, err := exec.LookPath("nvidia-smi")
		if err != nil {
			return errors.New("nvidia-smi tool not found")
		}
		smi.BinPath = binPath
	}

	smi.data = smi.pollNvidiaSMI()
	if len(smi.data) == 0 {
		// pollNvidiaSMI returns nil when the command fails.
		return errors.New("nvidia-smi returned no data")
	}
	return nil
}
// pollNvidiaSMI runs "nvidia-smi -q -x" and returns what the tool wrote to
// standard output, or nil when the command fails.
//
// Only stdout is captured: anything the tool prints on stderr would otherwise
// be mixed into the XML document and break parsing.
func (smi *NvidiaSMI) pollNvidiaSMI() []byte {
	cmd := exec.Command(smi.BinPath, "-q", "-x")
	output, err := cmd.Output()
	if err != nil {
		return nil
	}
	return output
}
// gatherModel decodes the cached nvidia-smi XML and returns the product name
// of every GPU, skipping GPUs whose name is empty. The xml.Unmarshal error is
// returned when the cached output cannot be decoded.
func (smi *NvidiaSMI) gatherModel() ([]string, error) {
	parsed := nvidiaSMIXMLResult{}
	if err := xml.Unmarshal(smi.data, &parsed); err != nil {
		return nil, err
	}

	var names []string
	for i := range parsed.GPUs {
		name := parsed.GPUs[i].ProductName
		if name == "" {
			continue
		}
		names = append(names, name)
	}
	return names, nil
}
// gatherUsage decodes the cached nvidia-smi XML and returns one utilization
// percentage per GPU. A utilization value that cannot be parsed is reported as
// 0 so that the remaining GPUs are still processed. The xml.Unmarshal error is
// returned when the cached output cannot be decoded.
func (smi *NvidiaSMI) gatherUsage() ([]float64, error) {
	parsed := nvidiaSMIXMLResult{}
	if err := xml.Unmarshal(smi.data, &parsed); err != nil {
		return nil, err
	}

	var percentages []float64
	for i := range parsed.GPUs {
		percent, parseErr := parsePercentageValue(parsed.GPUs[i].Utilization.GPUUtil)
		if parseErr != nil {
			// Fall back to 0 instead of aborting the whole collection.
			percent = 0.0
		}
		percentages = append(percentages, percent)
	}
	return percentages, nil
}
// gatherDetailedInfo decodes the cached nvidia-smi XML and builds one
// NVIDIAGPUInfo per GPU. Parse errors of individual values are ignored on
// purpose: the affected field simply keeps the value returned alongside the
// error by its parser. The xml.Unmarshal error is returned when the cached
// output cannot be decoded.
func (smi *NvidiaSMI) gatherDetailedInfo() ([]NVIDIAGPUInfo, error) {
	parsed := nvidiaSMIXMLResult{}
	if err := xml.Unmarshal(smi.data, &parsed); err != nil {
		return nil, err
	}

	var infos []NVIDIAGPUInfo
	for i := range parsed.GPUs {
		card := parsed.GPUs[i]

		var info NVIDIAGPUInfo
		info.Name = card.ProductName
		// Best effort: a value that fails to parse does not drop the GPU.
		info.Utilization, _ = parsePercentageValue(card.Utilization.GPUUtil)
		info.MemoryTotal, _ = parseMemoryValue(card.FrameBufferMemoryUsage.Total)
		info.MemoryUsed, _ = parseMemoryValue(card.FrameBufferMemoryUsage.Used)
		info.Temperature, _ = parseTemperatureValue(card.Temperature.GPUTemp)

		infos = append(infos, info)
	}
	return infos, nil
}
// parsePercentageValue converts a percentage string into a float
// (for example "25 %" -> 25.0). Surrounding whitespace and one trailing "%"
// are ignored; an empty value yields 0 without an error.
func parsePercentageValue(value string) (float64, error) {
	text := strings.TrimSpace(strings.TrimSuffix(strings.TrimSpace(value), "%"))
	if len(text) == 0 {
		return 0.0, nil
	}

	percent, err := strconv.ParseFloat(text, 64)
	if err == nil {
		return percent, nil
	}
	return 0.0, err
}
// parseMemoryValue converts a memory string expressed in MiB into bytes
// (for example "1024 MiB" -> 1073741824 bytes). Surrounding whitespace and one
// trailing "MiB" are ignored; an empty value yields 0 without an error.
func parseMemoryValue(value string) (uint64, error) {
	text := strings.TrimSpace(strings.TrimSuffix(strings.TrimSpace(value), "MiB"))
	if len(text) == 0 {
		return 0, nil
	}

	mebibytes, err := strconv.ParseUint(text, 10, 64)
	if err != nil {
		return 0, err
	}

	// 1 MiB = 2^20 bytes.
	return mebibytes << 20, nil
}
// parseTemperatureValue converts a temperature string into whole degrees
// Celsius (for example "65 C" -> 65). Surrounding whitespace and one trailing
// "C" are ignored; an empty value yields 0 without an error.
func parseTemperatureValue(value string) (uint64, error) {
	text := strings.TrimSpace(strings.TrimSuffix(strings.TrimSpace(value), "C"))
	if len(text) == 0 {
		return 0, nil
	}

	degrees, err := strconv.ParseUint(text, 10, 64)
	if err == nil {
		return degrees, nil
	}
	return 0, err
}
// nvidiaSMIXMLResult is the decoding target for the nvidia-smi XML output; it
// collects every "gpu" element of the document.
type nvidiaSMIXMLResult struct {
GPUs []nvidiaSMIGPU `xml:"gpu"`
}
// nvidiaSMIGPU maps the parts of one "gpu" XML element that the gather methods
// read. All values are kept as the raw strings found in the document and are
// converted by the parse helpers.
type nvidiaSMIGPU struct {
ProductName string `xml:"product_name"` // GPU model name
// Utilization holds the GPU utilization string (parsed as e.g. "25 %").
Utilization struct {
GPUUtil string `xml:"gpu_util"`
} `xml:"utilization"`
// FrameBufferMemoryUsage holds the frame buffer memory strings (parsed as e.g. "1024 MiB").
FrameBufferMemoryUsage struct {
Total string `xml:"total"`
Used string `xml:"used"`
Free string `xml:"free"` // decoded, but not read by the gather methods in this file
} `xml:"fb_memory_usage"`
// Temperature holds the GPU temperature string (parsed as e.g. "65 C").
Temperature struct {
GPUTemp string `xml:"gpu_temp"`
} `xml:"temperature"`
}