diff --git a/cmd/flags/flag.go b/cmd/flags/flag.go
index a415bbf..fe57c68 100644
--- a/cmd/flags/flag.go
+++ b/cmd/flags/flag.go
@@ -19,4 +19,5 @@ var (
 	CFAccessClientID     string
 	CFAccessClientSecret string
 	MemoryIncludeCache   bool
+	EnableGPU            bool // Enable detailed GPU monitoring
 )
diff --git a/cmd/root.go b/cmd/root.go
index d71d721..0ea230e 100644
--- a/cmd/root.go
+++ b/cmd/root.go
@@ -100,5 +100,6 @@ func init() {
 	RootCmd.PersistentFlags().StringVar(&flags.CFAccessClientID, "cf-access-client-id", "", "Cloudflare Access Client ID")
 	RootCmd.PersistentFlags().StringVar(&flags.CFAccessClientSecret, "cf-access-client-secret", "", "Cloudflare Access Client Secret")
 	RootCmd.PersistentFlags().BoolVar(&flags.MemoryIncludeCache, "memory-include-cache", false, "Include cache/buffer in memory usage")
+	RootCmd.PersistentFlags().BoolVar(&flags.EnableGPU, "gpu", false, "Enable detailed GPU monitoring (usage, memory, multi-GPU support)")
 	RootCmd.PersistentFlags().ParseErrorsWhitelist.UnknownFlags = true
 }
diff --git a/monitoring/monitoring.go b/monitoring/monitoring.go
index 5926c3b..48f21e5 100644
--- a/monitoring/monitoring.go
+++ b/monitoring/monitoring.go
@@ -5,6 +5,7 @@ import (
 	"fmt"
 	"log"
 
+	"github.com/komari-monitor/komari-agent/cmd/flags"
 	monitoring "github.com/komari-monitor/komari-agent/monitoring/unit"
 )
 
@@ -74,6 +75,51 @@ func GenerateReport() []byte {
 	processcount := monitoring.ProcessCount()
 	data["process"] = processcount
 
+	// GPU monitoring - the level of detail depends on the --gpu flag.
+	if flags.EnableGPU {
+		// Detailed GPU monitoring mode.
+		gpuInfo, err := monitoring.GetDetailedGPUInfo()
+		if err != nil {
+			message += fmt.Sprintf("failed to get detailed GPU info: %v\n", err)
+			// Fall back to basic GPU information (model names only).
+			gpuNames, nameErr := monitoring.GetDetailedGPUHost()
+			if nameErr == nil && len(gpuNames) > 0 {
+				data["gpu"] = map[string]interface{}{
+					"models": gpuNames,
+				}
+			}
+		} else {
+			// Detailed information retrieved successfully.
+			gpuData := make([]map[string]interface{}, len(gpuInfo))
+			totalGPUUsage := 0.0
+
+			for i, info := range gpuInfo {
+				gpuData[i] = map[string]interface{}{
+					"name":         info.Name,
+					"memory_total": info.MemoryTotal,
+					"memory_used":  info.MemoryUsed,
+					"utilization":  info.Utilization,
+					"temperature":  info.Temperature,
+				}
+				totalGPUUsage += info.Utilization
+			}
+
+			// With zero GPUs the division would be 0/0 = NaN, which
+			// json.Marshal rejects, failing the whole report.
+			avgGPUUsage := 0.0
+			if len(gpuInfo) > 0 {
+				avgGPUUsage = totalGPUUsage / float64(len(gpuInfo))
+			}
+
+			data["gpu"] = map[string]interface{}{
+				"count":         len(gpuInfo),
+				"average_usage": avgGPUUsage,
+				"detailed_info": gpuData,
+			}
+		}
+	}
+	// In basic mode GPU information is already handled in basicInfo.
+
 	data["message"] = message
 
 	s, err := json.Marshal(data)
diff --git a/monitoring/unit/gpu_amd_rocm_smi.go b/monitoring/unit/gpu_amd_rocm_smi.go
new file mode 100644
index 0000000..03a06c8
--- /dev/null
+++ b/monitoring/unit/gpu_amd_rocm_smi.go
@@ -0,0 +1,275 @@
+package monitoring
+
+// Modified from https://github.com/influxdata/telegraf/blob/master/plugins/inputs/amd_rocm_smi/amd_rocm_smi.go
+// Original License: MIT
+
+import (
+	"encoding/json"
+	"errors"
+	"math"
+	"os"
+	"os/exec"
+	"sort"
+	"strconv"
+	"strings"
+)
+
+// ROCmSMI runs the rocm-smi tool and parses its JSON report.
+type ROCmSMI struct {
+	BinPath string
+	data    []byte
+}
+
+// AMDGPUInfo holds detailed information about one AMD GPU.
+type AMDGPUInfo struct {
+	Name        string  // GPU model
+	MemoryTotal uint64  // total VRAM (bytes)
+	MemoryUsed  uint64  // used VRAM (bytes)
+	Utilization float64 // GPU utilization (0-100)
+	Temperature uint64  // temperature (degrees Celsius)
+}
+
+// ROCmResponse is the rocm-smi JSON response structure.
+type ROCmResponse map[string]ROCmGPUInfo
+
+// ROCmGPUInfo names the per-card rocm-smi JSON fields read by this file.
+type ROCmGPUInfo struct {
+	CardSeries          string `json:"Card series"`
+	GPUUsage            string `json:"GPU use (%)"`
+	VRAMTotalMemory     string `json:"VRAM Total Memory (B)"`
+	VRAMTotalUsedMemory string `json:"VRAM Total Used Memory (B)"`
+	TemperatureJunction string `json:"Temperature (Sensor junction) (C)"`
+}
+
+// GatherModel returns the model name of each GPU in the last poll.
+func (rsmi *ROCmSMI) GatherModel() ([]string, error) {
+	return rsmi.gatherModel()
+}
+
+// GatherUsage returns the utilization of each GPU in the last poll.
+func (rsmi *ROCmSMI) GatherUsage() ([]float64, error) {
+	return rsmi.gatherUsage()
+}
+
+// GatherDetailedInfo returns detailed information for each GPU in the last poll.
+func (rsmi *ROCmSMI) GatherDetailedInfo() ([]AMDGPUInfo, error) {
+	return rsmi.gatherDetailedInfo()
+}
+
+// Start resolves the rocm-smi binary (falling back to PATH when BinPath does
+// not exist), runs it once and stores the output for the Gather methods.
+func (rsmi *ROCmSMI) Start() error {
+	if _, err := os.Stat(rsmi.BinPath); os.IsNotExist(err) {
+		binPath, err := exec.LookPath("rocm-smi")
+		if err != nil {
+			return errors.New("rocm-smi tool not found")
+		}
+		rsmi.BinPath = binPath
+	}
+
+	rsmi.data = rsmi.pollROCmSMI()
+	if len(rsmi.data) == 0 {
+		// Report the failed poll here rather than as an opaque JSON
+		// decoding error in the Gather methods.
+		return errors.New("rocm-smi returned no data")
+	}
+	return nil
+}
+
+// pollROCmSMI runs rocm-smi and returns its standard output, or nil on failure.
+func (rsmi *ROCmSMI) pollROCmSMI() []byte {
+	cmd := exec.Command(rsmi.BinPath, "--showallinfo", "--json")
+	// Capture stdout only: anything the tool prints on stderr would
+	// otherwise be mixed into the JSON document and break decoding.
+	output, err := cmd.Output()
+	if err != nil {
+		return nil
+	}
+	return output
+}
+
+// rocmCards decodes a rocm-smi JSON report and returns the objects stored
+// under keys starting with "card", ordered by card number. Go map iteration
+// order is random, so without sorting the GPU order would differ between
+// calls and results of separate Gather calls could not be matched by index.
+func rocmCards(raw []byte) ([]map[string]interface{}, error) {
+	var data map[string]interface{}
+	if err := json.Unmarshal(raw, &data); err != nil {
+		return nil, err
+	}
+
+	keys := make([]string, 0, len(data))
+	for key, value := range data {
+		if !strings.HasPrefix(key, "card") {
+			continue
+		}
+		if _, ok := value.(map[string]interface{}); ok {
+			keys = append(keys, key)
+		}
+	}
+	sort.Slice(keys, func(i, j int) bool {
+		ni, errI := strconv.Atoi(strings.TrimPrefix(keys[i], "card"))
+		nj, errJ := strconv.Atoi(strings.TrimPrefix(keys[j], "card"))
+		switch {
+		case errI == nil && errJ == nil:
+			return ni < nj
+		case errI == nil:
+			return true // numbered cards sort before unnumbered keys
+		case errJ == nil:
+			return false
+		default:
+			return keys[i] < keys[j]
+		}
+	})
+
+	cards := make([]map[string]interface{}, 0, len(keys))
+	for _, key := range keys {
+		cards = append(cards, data[key].(map[string]interface{}))
+	}
+	return cards, nil
+}
+
+// cardField returns the string stored under field; the boolean is false when
+// the field is missing or is not a string.
+func cardField(card map[string]interface{}, field string) (string, bool) {
+	value, ok := card[field].(string)
+	return value, ok
+}
+
+func (rsmi *ROCmSMI) gatherModel() ([]string, error) {
+	cards, err := rocmCards(rsmi.data)
+	if err != nil {
+		return nil, err
+	}
+
+	var models []string
+	for _, card := range cards {
+		if name, ok := cardField(card, "Card series"); ok && name != "" {
+			models = append(models, name)
+		}
+	}
+	return models, nil
+}
+
+func (rsmi *ROCmSMI) gatherUsage() ([]float64, error) {
+	cards, err := rocmCards(rsmi.data)
+	if err != nil {
+		return nil, err
+	}
+
+	var usageList []float64
+	for _, card := range cards {
+		// A missing or unparsable value is reported as 0 so that one
+		// bad card does not hide the others.
+		usage := 0.0
+		if raw, ok := cardField(card, "GPU use (%)"); ok {
+			if parsed, err := parseAMDPercentage(raw); err == nil {
+				usage = parsed
+			}
+		}
+		usageList = append(usageList, usage)
+	}
+	return usageList, nil
+}
+
+func (rsmi *ROCmSMI) gatherDetailedInfo() ([]AMDGPUInfo, error) {
+	if rsmi.data == nil {
+		return nil, errors.New("no data available")
+	}
+
+	cards, err := rocmCards(rsmi.data)
+	if err != nil {
+		return nil, err
+	}
+
+	var gpuInfos []AMDGPUInfo
+	for _, card := range cards {
+		// Fields that are missing or fail to parse keep their zero value.
+		gpuInfo := AMDGPUInfo{}
+
+		if name, ok := cardField(card, "Card series"); ok {
+			gpuInfo.Name = name
+		}
+		if raw, ok := cardField(card, "GPU use (%)"); ok {
+			if usage, err := parseAMDPercentage(raw); err == nil {
+				gpuInfo.Utilization = usage
+			}
+		}
+		if raw, ok := cardField(card, "VRAM Total Used Memory (B)"); ok {
+			if memUsed, err := parseAMDMemoryBytes(raw); err == nil {
+				gpuInfo.MemoryUsed = memUsed
+			}
+		}
+		if raw, ok := cardField(card, "VRAM Total Memory (B)"); ok {
+			if memTotal, err := parseAMDMemoryBytes(raw); err == nil {
+				gpuInfo.MemoryTotal = memTotal
+			}
+		}
+		if raw, ok := cardField(card, "Temperature (Sensor junction) (C)"); ok {
+			if temp, err := parseAMDTemperature(raw); err == nil {
+				gpuInfo.Temperature = temp
+			}
+		}
+
+		gpuInfos = append(gpuInfos, gpuInfo)
+	}
+	return gpuInfos, nil
+}
+
+// parseAMDPercentage parses a percentage value (e.g. "25" -> 25.0).
+func parseAMDPercentage(value string) (float64, error) {
+	cleaned := strings.TrimSpace(value)
+	cleaned = strings.TrimSuffix(cleaned, "%")
+	cleaned = strings.TrimSpace(cleaned)
+
+	if cleaned == "" {
+		return 0.0, nil
+	}
+
+	result, err := strconv.ParseFloat(cleaned, 64)
+	if err != nil {
+		return 0.0, err
+	}
+
+	return result, nil
+}
+
+// parseAMDMemoryBytes parses a byte count (e.g. "1073741824" -> 1073741824).
+func parseAMDMemoryBytes(value string) (uint64, error) {
+	cleaned := strings.TrimSpace(value)
+
+	if cleaned == "" {
+		return 0, nil
+	}
+
+	bytes, err := strconv.ParseUint(cleaned, 10, 64)
+	if err != nil {
+		return 0, err
+	}
+
+	// The value is already in bytes; return it unchanged.
+	return bytes, nil
+}
+
+// parseAMDTemperature parses a temperature in degrees Celsius. rocm-smi can
+// report a decimal value such as "65.0", so the text is parsed as a float
+// and rounded to the nearest whole degree (e.g. "65.0" -> 65).
+func parseAMDTemperature(value string) (uint64, error) {
+	cleaned := strings.TrimSpace(value)
+	cleaned = strings.TrimSuffix(cleaned, "C")
+	cleaned = strings.TrimSpace(cleaned)
+
+	if cleaned == "" {
+		return 0, nil
+	}
+
+	result, err := strconv.ParseFloat(cleaned, 64)
+	if err != nil {
+		return 0, err
+	}
+	if math.IsNaN(result) || math.IsInf(result, 0) || result < 0 {
+		return 0, errors.New("invalid temperature value")
+	}
+
+	return uint64(math.Round(result)), nil
+}
diff --git a/monitoring/unit/gpu_detailed_fallback.go b/monitoring/unit/gpu_detailed_fallback.go
new file mode 100644
index 0000000..84954a1
--- /dev/null
+++ b/monitoring/unit/gpu_detailed_fallback.go
@@ -0,0 +1,31 @@
+//go:build !linux
+
+package monitoring
+
+import (
+	"errors"
+)
+
+// DetailedGPUInfo holds detailed information about one GPU.
+type DetailedGPUInfo struct {
+	Name        string  `json:"name"`         // GPU model
+	MemoryTotal uint64  `json:"memory_total"` // total VRAM (bytes)
+	MemoryUsed  uint64  `json:"memory_used"`  // used VRAM (bytes)
+	Utilization float64 `json:"utilization"`  // GPU utilization (0-100)
+	Temperature uint64  `json:"temperature"`  // temperature (degrees Celsius)
+}
+
+// GetDetailedGPUHost returns GPU model names - fallback implementation.
+func GetDetailedGPUHost() ([]string, error) {
+	return nil, errors.New("detailed GPU monitoring not supported on this platform")
+}
+
+// GetDetailedGPUState returns GPU utilization - fallback implementation.
+func GetDetailedGPUState() ([]float64, error) {
+	return nil, errors.New("detailed GPU monitoring not supported on this platform")
+}
+
+// GetDetailedGPUInfo returns detailed GPU information - fallback implementation.
+func GetDetailedGPUInfo() ([]DetailedGPUInfo, error) {
+	return nil, errors.New("detailed GPU monitoring not supported on this platform")
+}
diff --git a/monitoring/unit/gpu_detailed_linux.go b/monitoring/unit/gpu_detailed_linux.go
new file mode 100644
index 0000000..087f07c
--- /dev/null
+++ b/monitoring/unit/gpu_detailed_linux.go
@@ -0,0 +1,170 @@
+//go:build linux
+
+package monitoring
+
+import (
+	"errors"
+	"sync"
+)
+
+const (
+	vendorAMD = iota + 1
+	vendorNVIDIA
+)
+
+// Default tool locations; Start falls back to PATH when these do not exist.
+const (
+	nvidiaSMIPath = "/usr/bin/nvidia-smi"
+	rocmSMIPath   = "/opt/rocm/bin/rocm-smi"
+)
+
+var (
+	vendorOnce sync.Once
+	vendorType uint8
+)
+
+// DetailedGPUInfo holds detailed information about one GPU.
+type DetailedGPUInfo struct {
+	Name        string  `json:"name"`         // GPU model
+	MemoryTotal uint64  `json:"memory_total"` // total VRAM (bytes)
+	MemoryUsed  uint64  `json:"memory_used"`  // used VRAM (bytes)
+	Utilization float64 `json:"utilization"`  // GPU utilization (0-100)
+	Temperature uint64  `json:"temperature"`  // temperature (degrees Celsius)
+}
+
+// detectedVendor returns the GPU vendor, probing the system on first use.
+// Detection is lazy so that linking this package does not run nvidia-smi at
+// program start when detailed GPU monitoring is never requested.
+func detectedVendor() uint8 {
+	vendorOnce.Do(func() {
+		vendorType = getDetailedVendor()
+	})
+	return vendorType
+}
+
+// getDetailedVendor reports vendorNVIDIA when nvidia-smi can be queried and
+// vendorAMD otherwise.
+func getDetailedVendor() uint8 {
+	if _, err := getNvidiaDetailedStat(); err != nil {
+		return vendorAMD
+	}
+	return vendorNVIDIA
+}
+
+func getNvidiaDetailedStat() ([]float64, error) {
+	smi := &NvidiaSMI{BinPath: nvidiaSMIPath}
+	if err := smi.Start(); err != nil {
+		return nil, err
+	}
+	return smi.GatherUsage()
+}
+
+func getAMDDetailedStat() ([]float64, error) {
+	rsmi := &ROCmSMI{BinPath: rocmSMIPath}
+	if err := rsmi.Start(); err != nil {
+		return nil, err
+	}
+	return rsmi.GatherUsage()
+}
+
+func getNvidiaDetailedHost() ([]string, error) {
+	smi := &NvidiaSMI{BinPath: nvidiaSMIPath}
+	if err := smi.Start(); err != nil {
+		return nil, err
+	}
+	return smi.GatherModel()
+}
+
+func getAMDDetailedHost() ([]string, error) {
+	rsmi := &ROCmSMI{BinPath: rocmSMIPath}
+	if err := rsmi.Start(); err != nil {
+		return nil, err
+	}
+	return rsmi.GatherModel()
+}
+
+// GetDetailedGPUHost returns the model names of the detected GPUs.
+func GetDetailedGPUHost() ([]string, error) {
+	switch detectedVendor() {
+	case vendorAMD:
+		return getAMDDetailedHost()
+	case vendorNVIDIA:
+		return getNvidiaDetailedHost()
+	default:
+		return nil, errors.New("invalid vendor")
+	}
+}
+
+// GetDetailedGPUState returns the utilization of each detected GPU.
+func GetDetailedGPUState() ([]float64, error) {
+	switch detectedVendor() {
+	case vendorAMD:
+		return getAMDDetailedStat()
+	case vendorNVIDIA:
+		return getNvidiaDetailedStat()
+	default:
+		return nil, errors.New("invalid vendor")
+	}
+}
+
+// GetDetailedGPUInfo returns detailed information for each detected GPU.
+func GetDetailedGPUInfo() ([]DetailedGPUInfo, error) {
+	switch detectedVendor() {
+	case vendorAMD:
+		return getAMDDetailedInfo()
+	case vendorNVIDIA:
+		return getNvidiaDetailedInfo()
+	default:
+		return nil, errors.New("invalid vendor")
+	}
+}
+
+func getNvidiaDetailedInfo() ([]DetailedGPUInfo, error) {
+	smi := &NvidiaSMI{BinPath: nvidiaSMIPath}
+	if err := smi.Start(); err != nil {
+		return nil, err
+	}
+
+	data, err := smi.GatherDetailedInfo()
+	if err != nil {
+		return nil, err
+	}
+
+	var gpuInfos []DetailedGPUInfo
+	for _, nvidiaInfo := range data {
+		gpuInfos = append(gpuInfos, DetailedGPUInfo{
+			Name:        nvidiaInfo.Name,
+			MemoryTotal: nvidiaInfo.MemoryTotal,
+			MemoryUsed:  nvidiaInfo.MemoryUsed,
+			Utilization: nvidiaInfo.Utilization,
+			Temperature: nvidiaInfo.Temperature,
+		})
+	}
+
+	return gpuInfos, nil
+}
+
+func getAMDDetailedInfo() ([]DetailedGPUInfo, error) {
+	rsmi := &ROCmSMI{BinPath: rocmSMIPath}
+	if err := rsmi.Start(); err != nil {
+		return nil, err
+	}
+
+	data, err := rsmi.GatherDetailedInfo()
+	if err != nil {
+		return nil, err
+	}
+
+	var gpuInfos []DetailedGPUInfo
+	for _, amdInfo := range data {
+		gpuInfos = append(gpuInfos, DetailedGPUInfo{
+			Name:        amdInfo.Name,
+			MemoryTotal: amdInfo.MemoryTotal,
+			MemoryUsed:  amdInfo.MemoryUsed,
+			Utilization: amdInfo.Utilization,
+			Temperature: amdInfo.Temperature,
+		})
+	}
+
+	return gpuInfos, nil
+}
diff --git a/monitoring/unit/gpu_detailed_test.go b/monitoring/unit/gpu_detailed_test.go
new file mode 100644
index
0000000..d625840
--- /dev/null
+++ b/monitoring/unit/gpu_detailed_test.go
@@ -0,0 +1,76 @@
+package monitoring
+
+import (
+	"testing"
+)
+
+// testBytesPerMiB converts the byte counts in DetailedGPUInfo to MiB for logging.
+const testBytesPerMiB = 1024 * 1024
+
+// TestDetailedGPUDetection exercises the detailed GPU getters. Failures are
+// only logged because the host may have no supported GPU or tooling.
+func TestDetailedGPUDetection(t *testing.T) {
+	models, err := GetDetailedGPUHost()
+	if err != nil {
+		t.Logf("Detailed GPU detection failed (may be normal on non-Linux or non-GPU systems): %v", err)
+		return
+	}
+
+	t.Logf("Detected GPUs: %v", models)
+
+	if len(models) == 0 {
+		return
+	}
+
+	usage, err := GetDetailedGPUState()
+	if err != nil {
+		t.Logf("GPU state collection failed: %v", err)
+	} else {
+		t.Logf("GPU usage: %v", usage)
+	}
+
+	// Exercise the detailed information getter.
+	detailedInfo, err := GetDetailedGPUInfo()
+	if err != nil {
+		t.Logf("GPU detailed info collection failed: %v", err)
+		return
+	}
+	for i, info := range detailedInfo {
+		t.Logf("GPU %d: %s - Memory: %dMiB/%dMiB, Usage: %.1f%%, Temp: %d°C",
+			i, info.Name, info.MemoryUsed/testBytesPerMiB, info.MemoryTotal/testBytesPerMiB, info.Utilization, info.Temperature)
+	}
+}
+
+// TestDetailedGPUInfo logs every field of the detailed GPU information and
+// checks that the reported values are plausible.
+func TestDetailedGPUInfo(t *testing.T) {
+	detailedInfo, err := GetDetailedGPUInfo()
+	if err != nil {
+		t.Logf("GPU detailed info test failed (may be normal): %v", err)
+		return
+	}
+
+	if len(detailedInfo) == 0 {
+		t.Log("No detailed GPU info available")
+		return
+	}
+
+	for i, info := range detailedInfo {
+		t.Logf("GPU %d Details:", i)
+		t.Logf("  Name: %s", info.Name)
+		t.Logf("  Memory Total: %d MiB", info.MemoryTotal/testBytesPerMiB)
+		t.Logf("  Memory Used: %d MiB", info.MemoryUsed/testBytesPerMiB)
+		t.Logf("  Utilization: %.1f%%", info.Utilization)
+		t.Logf("  Temperature: %d°C", info.Temperature)
+
+		// Sanity-check the values. DetailedGPUInfo has no free-memory field,
+		// so consistency is checked as used <= total.
+		if info.MemoryTotal > 0 && info.MemoryUsed > info.MemoryTotal {
+			t.Logf("Warning: Memory usage calculation may be inconsistent for %s", info.Name)
+		}
+
+		if info.Utilization < 0 || info.Utilization > 100 {
+			t.Errorf("Invalid utilization value for %s: %.1f%%", info.Name, info.Utilization)
+		}
+	}
+}
diff --git a/monitoring/unit/gpu_linux.go b/monitoring/unit/gpu_linux.go
index 889eb19..6a2c2d5 100644
---
a/monitoring/unit/gpu_linux.go
+++ b/monitoring/unit/gpu_linux.go
@@ -9,13 +9,26 @@ import (
 )
 
 func GpuName() string {
-	accept := []string{"vga", "nvidia", "amd", "radeon", "render"}
+	// Dedicated GPU vendors listed first. NOTE(review): lines are scanned in lspci order, so list order only matters within one line - confirm intent.
+	accept := []string{"nvidia", "amd", "radeon", "vga", "3d"}
 	out, err := exec.Command("lspci").Output()
 	if err == nil {
 		lines := strings.Split(string(out), "\n")
+
+		// First pass: try to find a dedicated GPU.
 		for _, line := range lines {
+			lower := strings.ToLower(line)
+
+			// Skip integrated graphics and management controllers.
+			if strings.Contains(lower, "aspeed") ||
+				strings.Contains(lower, "matrox") ||
+				strings.Contains(lower, "management") {
+				continue
+			}
+
+			// Match the line against the GPU keywords.
 			for _, a := range accept {
-				if strings.Contains(strings.ToLower(line), a) {
+				if strings.Contains(lower, a) {
 					parts := strings.SplitN(line, ":", 4)
 					if len(parts) >= 4 {
 						return strings.TrimSpace(parts[3])
@@ -27,6 +40,17 @@ func GpuName() string {
 				}
 			}
 		}
+
+		// No dedicated GPU found: fall back to the first VGA device.
+		for _, line := range lines {
+			if strings.Contains(strings.ToLower(line), "vga") {
+				// lspci prints "<slot> <class>: <device>"; cut at the first ": "
+				// so the name is found whether or not the slot has a domain prefix.
+				if idx := strings.Index(line, ": "); idx >= 0 {
+					return strings.TrimSpace(line[idx+2:])
+				}
+			}
+		}
 	}
 	return "None"
 }
diff --git a/monitoring/unit/gpu_nvidia_smi.go b/monitoring/unit/gpu_nvidia_smi.go
new file mode 100644
index 0000000..7a84c63
--- /dev/null
+++ b/monitoring/unit/gpu_nvidia_smi.go
@@ -0,0 +1,217 @@
+package monitoring
+
+// Modified from https://github.com/influxdata/telegraf/blob/master/plugins/inputs/nvidia_smi/nvidia_smi.go
+// Original License: MIT
+
+import (
+	"encoding/xml"
+	"errors"
+	"os"
+	"os/exec"
+	"strconv"
+	"strings"
+)
+
+// NvidiaSMI runs the nvidia-smi tool and parses its XML report.
+type NvidiaSMI struct {
+	BinPath string
+	data    []byte
+}
+
+// NVIDIAGPUInfo holds detailed information about one NVIDIA GPU.
+type NVIDIAGPUInfo struct {
+	Name        string  // GPU model
+	MemoryTotal uint64  // total VRAM (bytes)
+	MemoryUsed  uint64  // used VRAM (bytes)
+	Utilization float64 // GPU utilization (0-100)
+	Temperature uint64  // temperature (degrees Celsius)
+}
+
+// GatherModel returns the product name of each GPU in the last poll.
+func (smi *NvidiaSMI) GatherModel() ([]string, error) {
+	return smi.gatherModel()
+}
+
+// GatherUsage returns the utilization of each GPU in the last poll.
+func (smi *NvidiaSMI) GatherUsage() ([]float64, error) {
+	return smi.gatherUsage()
+}
+
+// GatherDetailedInfo returns detailed information for each GPU in the last poll.
+func (smi *NvidiaSMI) GatherDetailedInfo() ([]NVIDIAGPUInfo, error) {
+	return smi.gatherDetailedInfo()
+}
+
+// Start resolves the nvidia-smi binary (falling back to PATH when BinPath
+// does not exist), runs it once and stores the output for the Gather methods.
+func (smi *NvidiaSMI) Start() error {
+	if _, err := os.Stat(smi.BinPath); os.IsNotExist(err) {
+		binPath, err := exec.LookPath("nvidia-smi")
+		if err != nil {
+			return errors.New("nvidia-smi tool not found")
+		}
+		smi.BinPath = binPath
+	}
+
+	smi.data = smi.pollNvidiaSMI()
+	if len(smi.data) == 0 {
+		// Report the failed poll here rather than as an opaque XML
+		// decoding error in the Gather methods.
+		return errors.New("nvidia-smi returned no data")
+	}
+	return nil
+}
+
+// pollNvidiaSMI runs nvidia-smi and returns its standard output, or nil on failure.
+func (smi *NvidiaSMI) pollNvidiaSMI() []byte {
+	cmd := exec.Command(smi.BinPath, "-q", "-x")
+	// Capture stdout only: anything the tool prints on stderr would
+	// otherwise be mixed into the XML document and break decoding.
+	output, err := cmd.Output()
+	if err != nil {
+		return nil
+	}
+	return output
+}
+
+func (smi *NvidiaSMI) gatherModel() ([]string, error) {
+	var stats nvidiaSMIXMLResult
+	var models []string
+
+	if err := xml.Unmarshal(smi.data, &stats); err != nil {
+		return nil, err
+	}
+
+	for _, gpu := range stats.GPUs {
+		if gpu.ProductName != "" {
+			models = append(models, gpu.ProductName)
+		}
+	}
+
+	return models, nil
+}
+
+func (smi *NvidiaSMI) gatherUsage() ([]float64, error) {
+	var stats nvidiaSMIXMLResult
+	var usageList []float64
+
+	if err := xml.Unmarshal(smi.data, &stats); err != nil {
+		return nil, err
+	}
+
+	for _, gpu := range stats.GPUs {
+		usage, err := parsePercentageValue(gpu.Utilization.GPUUtil)
+		if err != nil {
+			usage = 0.0 // default to 0 instead of aborting the whole report
+		}
+		usageList = append(usageList, usage)
+	}
+
+	return usageList, nil
+}
+
+func (smi *NvidiaSMI) gatherDetailedInfo() ([]NVIDIAGPUInfo, error) {
+	var stats nvidiaSMIXMLResult
+	var gpuInfos []NVIDIAGPUInfo
+
+	if err := xml.Unmarshal(smi.data, &stats); err != nil {
+		return nil, err
+	}
+
+	for _, gpu := range stats.GPUs {
+		// Parse errors are deliberately ignored: a field that cannot be
+		// parsed is reported as 0 rather than dropping the GPU.
+		utilization, _ := parsePercentageValue(gpu.Utilization.GPUUtil)
+		memTotal, _ := parseMemoryValue(gpu.FrameBufferMemoryUsage.Total)
+		memUsed, _ := parseMemoryValue(gpu.FrameBufferMemoryUsage.Used)
+		temp, _ := parseTemperatureValue(gpu.Temperature.GPUTemp)
+
+		gpuInfo := NVIDIAGPUInfo{
+			Name:        gpu.ProductName,
+			MemoryTotal: memTotal,
+			MemoryUsed:  memUsed,
+			Utilization: utilization,
+			Temperature: temp,
+		}
+
+		gpuInfos = append(gpuInfos, gpuInfo)
+	}
+
+	return gpuInfos, nil
+}
+
+// parsePercentageValue parses a percentage value (e.g. "25 %" -> 25.0).
+func parsePercentageValue(value string) (float64, error) {
+	cleaned := strings.TrimSpace(value)
+	cleaned = strings.TrimSuffix(cleaned, "%")
+	cleaned = strings.TrimSpace(cleaned)
+
+	if cleaned == "" {
+		return 0.0, nil
+	}
+
+	result, err := strconv.ParseFloat(cleaned, 64)
+	if err != nil {
+		return 0.0, err
+	}
+
+	return result, nil
+}
+
+// parseMemoryValue parses a memory size in MiB into bytes
+// (e.g. "1024 MiB" -> 1073741824).
+func parseMemoryValue(value string) (uint64, error) {
+	cleaned := strings.TrimSpace(value)
+	cleaned = strings.TrimSuffix(cleaned, "MiB")
+	cleaned = strings.TrimSpace(cleaned)
+
+	if cleaned == "" {
+		return 0, nil
+	}
+
+	result, err := strconv.ParseUint(cleaned, 10, 64)
+	if err != nil {
+		return 0, err
+	}
+
+	// Convert MiB to bytes (1 MiB = 1024*1024 bytes).
+	return result * 1024 * 1024, nil
+}
+
+// parseTemperatureValue parses a temperature value (e.g. "65 C" -> 65).
+func parseTemperatureValue(value string) (uint64, error) {
+	cleaned := strings.TrimSpace(value)
+	cleaned = strings.TrimSuffix(cleaned, "C")
+	cleaned = strings.TrimSpace(cleaned)
+
+	if cleaned == "" {
+		return 0, nil
+	}
+
+	result, err := strconv.ParseUint(cleaned, 10, 64)
+	if err != nil {
+		return 0, err
+	}
+
+	return result, nil
+}
+
+// NVIDIA-SMI XML structure definitions.
+type nvidiaSMIXMLResult struct {
+	GPUs []nvidiaSMIGPU `xml:"gpu"`
+}
+
+type nvidiaSMIGPU struct {
+	ProductName string `xml:"product_name"`
+	Utilization struct {
+		GPUUtil string `xml:"gpu_util"`
+	} `xml:"utilization"`
+	FrameBufferMemoryUsage struct {
+		Total string `xml:"total"`
+		Used  string `xml:"used"`
+		Free  string `xml:"free"`
+	} `xml:"fb_memory_usage"`
+	Temperature struct {
+		GPUTemp string `xml:"gpu_temp"`
+	} `xml:"temperature"`
+}