From 5304a68d5d1719752415285d530f535c015e023d Mon Sep 17 00:00:00 2001 From: kdwycz Date: Wed, 10 Sep 2025 16:26:55 +0800 Subject: [PATCH 1/3] =?UTF-8?q?=E6=96=B0=E5=A2=9ELinux=20GPU=E7=9B=91?= =?UTF-8?q?=E6=8E=A7=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 参考了nezha-agent。实现了Linux服务器下NVIDIA显卡的监控。 通过 --gpu 参数启用显卡监控功能。 支持多显卡,显卡使用率,显存使用率监控 实现了AMD显卡的监控,但是未经过测试 --- cmd/flags/flag.go | 1 + cmd/root.go | 1 + monitoring/monitoring.go | 41 ++++ monitoring/unit/gpu_amd_rocm_smi.go | 246 +++++++++++++++++++++++ monitoring/unit/gpu_detailed_fallback.go | 31 +++ monitoring/unit/gpu_detailed_linux.go | 213 ++++++++++++++++++++ monitoring/unit/gpu_detailed_test.go | 67 ++++++ monitoring/unit/gpu_linux.go | 27 ++- monitoring/unit/gpu_nvidia_smi.go | 200 ++++++++++++++++++ 9 files changed, 825 insertions(+), 2 deletions(-) create mode 100644 monitoring/unit/gpu_amd_rocm_smi.go create mode 100644 monitoring/unit/gpu_detailed_fallback.go create mode 100644 monitoring/unit/gpu_detailed_linux.go create mode 100644 monitoring/unit/gpu_detailed_test.go create mode 100644 monitoring/unit/gpu_nvidia_smi.go diff --git a/cmd/flags/flag.go b/cmd/flags/flag.go index a415bbf..fe57c68 100644 --- a/cmd/flags/flag.go +++ b/cmd/flags/flag.go @@ -19,4 +19,5 @@ var ( CFAccessClientID string CFAccessClientSecret string MemoryIncludeCache bool + EnableGPU bool // 启用详细GPU监控 ) diff --git a/cmd/root.go b/cmd/root.go index d71d721..0ea230e 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -100,5 +100,6 @@ func init() { RootCmd.PersistentFlags().StringVar(&flags.CFAccessClientID, "cf-access-client-id", "", "Cloudflare Access Client ID") RootCmd.PersistentFlags().StringVar(&flags.CFAccessClientSecret, "cf-access-client-secret", "", "Cloudflare Access Client Secret") RootCmd.PersistentFlags().BoolVar(&flags.MemoryIncludeCache, "memory-include-cache", false, "Include cache/buffer in memory usage") + RootCmd.PersistentFlags().BoolVar(&flags.EnableGPU, "gpu", false, "Enable detailed GPU monitoring (usage, memory, multi-GPU support)") RootCmd.PersistentFlags().ParseErrorsWhitelist.UnknownFlags = true } diff --git a/monitoring/monitoring.go b/monitoring/monitoring.go index 5926c3b..48f21e5 100644 --- a/monitoring/monitoring.go +++ b/monitoring/monitoring.go @@ -5,6 +5,7 @@ import ( "fmt" "log" + "github.com/komari-monitor/komari-agent/cmd/flags" monitoring "github.com/komari-monitor/komari-agent/monitoring/unit" ) @@ -74,6 +75,46 @@ func GenerateReport() []byte { processcount := monitoring.ProcessCount() data["process"] = processcount + // GPU监控 - 根据标志决定详细程度 + if flags.EnableGPU { + // 详细GPU监控模式 + gpuInfo, err := monitoring.GetDetailedGPUInfo() + if err != nil { + message += fmt.Sprintf("failed to get detailed GPU info: %v\n", err) + // 降级到基础GPU信息 + gpuNames, nameErr := monitoring.GetDetailedGPUHost() + if nameErr == nil && len(gpuNames) > 0 { + data["gpu"] = map[string]interface{}{ + "models": gpuNames, + } + } + } else { + // 成功获取详细信息 + gpuData := make([]map[string]interface{}, len(gpuInfo)) + totalGPUUsage := 0.0 + + for i, info := range gpuInfo { + gpuData[i] = map[string]interface{}{ + "name": info.Name, + "memory_total": info.MemoryTotal, + "memory_used": info.MemoryUsed, + "utilization": info.Utilization, + "temperature": info.Temperature, + } + totalGPUUsage += info.Utilization + } + + avgGPUUsage := totalGPUUsage / float64(len(gpuInfo)) + + data["gpu"] = map[string]interface{}{ + "count": len(gpuInfo), + "average_usage": avgGPUUsage, + "detailed_info": gpuData, + } + } + } + // 基础模式下,GPU信息已在basicInfo中处理 + data["message"] = message s, err := json.Marshal(data) diff --git a/monitoring/unit/gpu_amd_rocm_smi.go b/monitoring/unit/gpu_amd_rocm_smi.go new file mode 100644 index 0000000..03a06c8 --- /dev/null +++ b/monitoring/unit/gpu_amd_rocm_smi.go @@ -0,0 +1,246 @@ +package monitoring + +// Modified from https://github.com/influxdata/telegraf/blob/master/plugins/inputs/amd_rocm_smi/amd_rocm_smi.go +// Original License: MIT + +import ( + "encoding/json" + "errors" + "os" + "os/exec" + "strconv" + "strings" +) + +type ROCmSMI struct { + BinPath string + data []byte +} + +// AMDGPUInfo AMD GPU详细信息 +type AMDGPUInfo struct { + Name string // GPU型号 + MemoryTotal uint64 // 总显存 (字节) + MemoryUsed uint64 // 已用显存 (字节) + Utilization float64 // GPU使用率 (0-100) + Temperature uint64 // 温度 (摄氏度) +} + +// ROCmSMI JSON响应结构 +type ROCmResponse map[string]ROCmGPUInfo + +type ROCmGPUInfo struct { + CardSeries string `json:"Card series"` + GPUUsage string `json:"GPU use (%)"` + VRAMTotalMemory string `json:"VRAM Total Memory (B)"` + VRAMTotalUsedMemory string `json:"VRAM Total Used Memory (B)"` + TemperatureJunction string `json:"Temperature (Sensor junction) (C)"` +} + +func (rsmi *ROCmSMI) GatherModel() ([]string, error) { + return rsmi.gatherModel() +} + +func (rsmi *ROCmSMI) GatherUsage() ([]float64, error) { + return rsmi.gatherUsage() +} + +// GatherDetailedInfo 获取详细GPU信息 +func (rsmi *ROCmSMI) GatherDetailedInfo() ([]AMDGPUInfo, error) { + return rsmi.gatherDetailedInfo() +} + +func (rsmi *ROCmSMI) Start() error { + if _, err := os.Stat(rsmi.BinPath); os.IsNotExist(err) { + binPath, err := exec.LookPath("rocm-smi") + if err != nil { + return errors.New("rocm-smi tool not found") + } + rsmi.BinPath = binPath + } + + rsmi.data = rsmi.pollROCmSMI() + return nil +} + +func (rsmi *ROCmSMI) pollROCmSMI() []byte { + cmd := exec.Command(rsmi.BinPath, "--showallinfo", "--json") + output, err := cmd.CombinedOutput() + if err != nil { + return nil + } + return output +} + +func (rsmi *ROCmSMI) gatherModel() ([]string, error) { + var data map[string]interface{} + var models []string + + if err := json.Unmarshal(rsmi.data, &data); err != nil { + return nil, err + } + + // 解析JSON结构获取GPU型号 + for key, value := range data { + if strings.HasPrefix(key, "card") { + if cardData, ok := value.(map[string]interface{}); ok { + if name, exists := cardData["Card series"]; exists { + if nameStr, ok := name.(string); ok && nameStr != "" { + models = append(models, nameStr) + } + } + } + } + } + + return models, nil +} + +func (rsmi *ROCmSMI) gatherUsage() ([]float64, error) { + var data map[string]interface{} + var usageList []float64 + + if err := json.Unmarshal(rsmi.data, &data); err != nil { + return nil, err + } + + // 解析JSON结构获取GPU使用率 + for key, value := range data { + if strings.HasPrefix(key, "card") { + if cardData, ok := value.(map[string]interface{}); ok { + usage := 0.0 + if utilizationData, exists := cardData["GPU use (%)"]; exists { + if utilizationStr, ok := utilizationData.(string); ok { + if parsed, err := parseAMDPercentage(utilizationStr); err == nil { + usage = parsed + } + } + } + usageList = append(usageList, usage) + } + } + } + + return usageList, nil +} + +func (rsmi *ROCmSMI) gatherDetailedInfo() ([]AMDGPUInfo, error) { + if rsmi.data == nil { + return nil, errors.New("no data available") + } + + var data map[string]interface{} + var gpuInfos []AMDGPUInfo + + if err := json.Unmarshal(rsmi.data, &data); err != nil { + return nil, err + } + + // 解析每个GPU卡的详细信息 + for key, value := range data { + if strings.HasPrefix(key, "card") { + if cardData, ok := value.(map[string]interface{}); ok { + gpuInfo := AMDGPUInfo{} + + // 获取GPU名称 + if name, exists := cardData["Card series"]; exists { + if nameStr, ok := name.(string); ok { + gpuInfo.Name = nameStr + } + } + + // 获取使用率 + if utilizationData, exists := cardData["GPU use (%)"]; exists { + if utilizationStr, ok := utilizationData.(string); ok { + if usage, err := parseAMDPercentage(utilizationStr); err == nil { + gpuInfo.Utilization = usage + } + } + } + + // 获取显存信息 + if memUsedData, exists := cardData["VRAM Total Used Memory (B)"]; exists { + if memUsedStr, ok := memUsedData.(string); ok { + if memUsed, err := parseAMDMemoryBytes(memUsedStr); err == nil { + gpuInfo.MemoryUsed = memUsed + } + } + } + + if memTotalData, exists := cardData["VRAM Total Memory (B)"]; exists { + if memTotalStr, ok := memTotalData.(string); ok { + if memTotal, err := parseAMDMemoryBytes(memTotalStr); err == nil { + gpuInfo.MemoryTotal = memTotal + } + } + } + + // 获取温度信息 + if tempData, exists := cardData["Temperature (Sensor junction) (C)"]; exists { + if tempStr, ok := tempData.(string); ok { + if temp, err := parseAMDTemperature(tempStr); err == nil { + gpuInfo.Temperature = temp + } + } + } + + gpuInfos = append(gpuInfos, gpuInfo) + } + } + } + + return gpuInfos, nil +} + +// 解析AMD百分比值 (例如 "25" -> 25.0) +func parseAMDPercentage(value string) (float64, error) { + cleaned := strings.TrimSpace(value) + cleaned = strings.TrimSuffix(cleaned, "%") + cleaned = strings.TrimSpace(cleaned) + + if cleaned == "" { + return 0.0, nil + } + + result, err := strconv.ParseFloat(cleaned, 64) + if err != nil { + return 0.0, err + } + + return result, nil +} + +// 解析AMD显存字节 (例如 "1073741824" -> 1073741824字节) +func parseAMDMemoryBytes(value string) (uint64, error) { + cleaned := strings.TrimSpace(value) + + if cleaned == "" { + return 0, nil + } + + bytes, err := strconv.ParseUint(cleaned, 10, 64) + if err != nil { + return 0, err + } + + // 直接返回字节数 + return bytes, nil +} + +// 解析AMD温度值 (例如 "65" -> 65) +func parseAMDTemperature(value string) (uint64, error) { + cleaned := strings.TrimSpace(value) + cleaned = strings.TrimSuffix(cleaned, "C") + cleaned = strings.TrimSpace(cleaned) + + if cleaned == "" { + return 0, nil + } + + result, err := strconv.ParseUint(cleaned, 10, 64) + if err != nil { + return 0, err + } + + return result, nil +} diff --git a/monitoring/unit/gpu_detailed_fallback.go b/monitoring/unit/gpu_detailed_fallback.go new file mode 100644 index 0000000..84954a1 --- /dev/null +++ b/monitoring/unit/gpu_detailed_fallback.go @@ -0,0 +1,31 @@ +//go:build !linux + +package monitoring + +import ( + "errors" +) + +// DetailedGPUInfo 详细GPU信息结构体 +type DetailedGPUInfo struct { + Name string `json:"name"` // GPU型号 + MemoryTotal uint64 `json:"memory_total"` // 总显存 (字节) + MemoryUsed uint64 `json:"memory_used"` // 已用显存 (字节) + Utilization float64 `json:"utilization"` // GPU使用率 (0-100) + Temperature uint64 `json:"temperature"` // 温度 (摄氏度) +} + +// GetDetailedGPUHost 获取GPU型号信息 - 回退实现 +func GetDetailedGPUHost() ([]string, error) { + return nil, errors.New("detailed GPU monitoring not supported on this platform") +} + +// GetDetailedGPUState 获取GPU使用率 - 回退实现 +func GetDetailedGPUState() ([]float64, error) { + return nil, errors.New("detailed GPU monitoring not supported on this platform") +} + +// GetDetailedGPUInfo 获取详细GPU信息 - 回退实现 +func GetDetailedGPUInfo() ([]DetailedGPUInfo, error) { + return nil, errors.New("detailed GPU monitoring not supported on this platform") +} diff --git a/monitoring/unit/gpu_detailed_linux.go b/monitoring/unit/gpu_detailed_linux.go new file mode 100644 index 0000000..087f07c --- /dev/null +++ b/monitoring/unit/gpu_detailed_linux.go @@ -0,0 +1,213 @@ +//go:build linux + +package monitoring + +import ( + "errors" +) + +const ( + vendorAMD = iota + 1 + vendorNVIDIA +) + +var vendorType = getDetailedVendor() + +// DetailedGPUInfo 详细GPU信息结构体 +type DetailedGPUInfo struct { + Name string `json:"name"` // GPU型号 + MemoryTotal uint64 `json:"memory_total"` // 总显存 (字节) + MemoryUsed uint64 `json:"memory_used"` // 已用显存 (字节) + Utilization float64 `json:"utilization"` // GPU使用率 (0-100) + Temperature uint64 `json:"temperature"` // 温度 (摄氏度) +} + +func getDetailedVendor() uint8 { + _, err := getNvidiaDetailedStat() + if err != nil { + return vendorAMD + } else { + return vendorNVIDIA + } +} + +func getNvidiaDetailedStat() ([]float64, error) { + smi := &NvidiaSMI{ + BinPath: "/usr/bin/nvidia-smi", + } + err1 := smi.Start() + if err1 != nil { + return nil, err1 + } + data, err2 := smi.GatherUsage() + if err2 != nil { + return nil, err2 + } + return data, nil +} + +func getAMDDetailedStat() ([]float64, error) { + rsmi := &ROCmSMI{ + BinPath: "/opt/rocm/bin/rocm-smi", + } + err := rsmi.Start() + if err != nil { + return nil, err + } + data, err := rsmi.GatherUsage() + if err != nil { + return nil, err + } + return data, nil +} + +func getNvidiaDetailedHost() ([]string, error) { + smi := &NvidiaSMI{ + BinPath: "/usr/bin/nvidia-smi", + } + err := smi.Start() + if err != nil { + return nil, err + } + data, err := smi.GatherModel() + if err != nil { + return nil, err + } + return data, nil +} + +func getAMDDetailedHost() ([]string, error) { + rsmi := &ROCmSMI{ + BinPath: "/opt/rocm/bin/rocm-smi", + } + err := rsmi.Start() + if err != nil { + return nil, err + } + data, err := rsmi.GatherModel() + if err != nil { + return nil, err + } + return data, nil +} + +// GetDetailedGPUHost 获取GPU型号信息 +func GetDetailedGPUHost() ([]string, error) { + var gi []string + var err error + + switch vendorType { + case vendorAMD: + gi, err = getAMDDetailedHost() + case vendorNVIDIA: + gi, err = getNvidiaDetailedHost() + default: + return nil, errors.New("invalid vendor") + } + + if err != nil { + return nil, err + } + + return gi, nil +} + +// GetDetailedGPUState 获取GPU使用率 +func GetDetailedGPUState() ([]float64, error) { + var gs []float64 + var err error + + switch vendorType { + case vendorAMD: + gs, err = getAMDDetailedStat() + case vendorNVIDIA: + gs, err = getNvidiaDetailedStat() + default: + return nil, errors.New("invalid vendor") + } + + if err != nil { + return nil, err + } + + return gs, nil +} + +// GetDetailedGPUInfo 获取详细GPU信息 +func GetDetailedGPUInfo() ([]DetailedGPUInfo, error) { + var gpuInfos []DetailedGPUInfo + var err error + + switch vendorType { + case vendorAMD: + gpuInfos, err = getAMDDetailedInfo() + case vendorNVIDIA: + gpuInfos, err = getNvidiaDetailedInfo() + default: + return nil, errors.New("invalid vendor") + } + + if err != nil { + return nil, err + } + + return gpuInfos, nil +} + +func getNvidiaDetailedInfo() ([]DetailedGPUInfo, error) { + smi := &NvidiaSMI{ + BinPath: "/usr/bin/nvidia-smi", + } + err := smi.Start() + if err != nil { + return nil, err + } + + data, err := smi.GatherDetailedInfo() + if err != nil { + return nil, err + } + + var gpuInfos []DetailedGPUInfo + for _, nvidiaInfo := range data { + gpuInfo := DetailedGPUInfo{ + Name: nvidiaInfo.Name, + MemoryTotal: nvidiaInfo.MemoryTotal, + MemoryUsed: nvidiaInfo.MemoryUsed, + Utilization: nvidiaInfo.Utilization, + Temperature: nvidiaInfo.Temperature, + } + gpuInfos = append(gpuInfos, gpuInfo) + } + + return gpuInfos, nil +} + +func getAMDDetailedInfo() ([]DetailedGPUInfo, error) { + rsmi := &ROCmSMI{ + BinPath: "/opt/rocm/bin/rocm-smi", + } + err := rsmi.Start() + if err != nil { + return nil, err + } + + data, err := rsmi.GatherDetailedInfo() + if err != nil { + return nil, err + } + + var gpuInfos []DetailedGPUInfo + for _, amdInfo := range data { + gpuInfo := DetailedGPUInfo{ + Name: amdInfo.Name, + MemoryTotal: amdInfo.MemoryTotal, + MemoryUsed: amdInfo.MemoryUsed, + Utilization: amdInfo.Utilization, + Temperature: amdInfo.Temperature, + } + gpuInfos = append(gpuInfos, gpuInfo) + } + + return gpuInfos, nil +} \ No newline at end of file diff --git a/monitoring/unit/gpu_detailed_test.go b/monitoring/unit/gpu_detailed_test.go new file mode 100644 index 0000000..d625840 --- /dev/null +++ b/monitoring/unit/gpu_detailed_test.go @@ -0,0 +1,67 @@ +package monitoring + +import ( + "testing" +) + +func TestDetailedGPUDetection(t *testing.T) { + models, err := GetDetailedGPUHost() + if err != nil { + t.Logf("Detailed GPU detection failed (may be normal on non-Linux or non-GPU systems): %v", err) + return + } + + t.Logf("Detected GPUs: %v", models) + + if len(models) > 0 { + usage, err := GetDetailedGPUState() + if err != nil { + t.Logf("GPU state collection failed: %v", err) + } else { + t.Logf("GPU usage: %v", usage) + } + + // 测试详细信息获取 + detailedInfo, err := GetDetailedGPUInfo() + if err != nil { + t.Logf("GPU detailed info collection failed: %v", err) + } else { + for i, info := range detailedInfo { + t.Logf("GPU %d: %s - Memory: %dMB/%dMB, Usage: %.1f%%, Temp: %d°C", + i, info.Name, info.MemoryUsed, info.MemoryTotal, info.Utilization, info.Temperature) + } + } + } +} + +func TestDetailedGPUInfo(t *testing.T) { + detailedInfo, err := GetDetailedGPUInfo() + if err != nil { + t.Logf("GPU detailed info test failed (may be normal): %v", err) + return + } + + if len(detailedInfo) == 0 { + t.Log("No detailed GPU info available") + return + } + + for i, info := range detailedInfo { + t.Logf("GPU %d Details:", i) + t.Logf(" Name: %s", info.Name) + t.Logf(" Memory Total: %d MB", info.MemoryTotal) + t.Logf(" Memory Used: %d MB", info.MemoryUsed) + t.Logf(" Memory Free: %d MB", info.MemoryFree) + t.Logf(" Utilization: %.1f%%", info.Utilization) + t.Logf(" Temperature: %d°C", info.Temperature) + + // 验证数据的合理性 + if info.MemoryTotal > 0 && info.MemoryUsed+info.MemoryFree != info.MemoryTotal { + t.Logf("Warning: Memory usage calculation may be inconsistent for %s", info.Name) + } + + if info.Utilization < 0 || info.Utilization > 100 { + t.Errorf("Invalid utilization value for %s: %.1f%%", info.Name, info.Utilization) + } + } +} diff --git a/monitoring/unit/gpu_linux.go b/monitoring/unit/gpu_linux.go index 889eb19..6a2c2d5 100644 --- a/monitoring/unit/gpu_linux.go +++ b/monitoring/unit/gpu_linux.go @@ -9,13 +9,26 @@ import ( ) func GpuName() string { - accept := []string{"vga", "nvidia", "amd", "radeon", "render"} + // 调整优先级:专用显卡厂商优先,避免只识别集成显卡 + accept := []string{"nvidia", "amd", "radeon", "vga", "3d"} out, err := exec.Command("lspci").Output() if err == nil { lines := strings.Split(string(out), "\n") + + // 首先尝试找专用显卡 for _, line := range lines { + lower := strings.ToLower(line) + + // 跳过集成显卡和管理控制器 + if strings.Contains(lower, "aspeed") || + strings.Contains(lower, "matrox") || + strings.Contains(lower, "management") { + continue + } + + // 优先匹配专用显卡厂商 for _, a := range accept { - if strings.Contains(strings.ToLower(line), a) { + if strings.Contains(lower, a) { parts := strings.SplitN(line, ":", 4) if len(parts) >= 4 { return strings.TrimSpace(parts[3]) @@ -27,6 +40,16 @@ func GpuName() string { } } } + + // 如果没有找到专用显卡,返回第一个VGA设备作为兜底 + for _, line := range lines { + if strings.Contains(strings.ToLower(line), "vga") { + parts := strings.SplitN(line, ":", 4) + if len(parts) >= 3 { + return strings.TrimSpace(parts[2]) + } + } + } } return "None" } diff --git a/monitoring/unit/gpu_nvidia_smi.go b/monitoring/unit/gpu_nvidia_smi.go new file mode 100644 index 0000000..7a84c63 --- /dev/null +++ b/monitoring/unit/gpu_nvidia_smi.go @@ -0,0 +1,200 @@ +package monitoring + +// Modified from https://github.com/influxdata/telegraf/blob/master/plugins/inputs/nvidia_smi/nvidia_smi.go +// Original License: MIT + +import ( + "encoding/xml" + "errors" + "os" + "os/exec" + "strconv" + "strings" +) + +type NvidiaSMI struct { + BinPath string + data []byte +} + +// NVIDIAGPUInfo 包含详细的NVIDIA GPU信息 +type NVIDIAGPUInfo struct { + Name string // GPU型号 + MemoryTotal uint64 // 总显存 (字节) + MemoryUsed uint64 // 已用显存 (字节) + Utilization float64 // GPU使用率 (0-100) + Temperature uint64 // 温度 (摄氏度) +} + +func (smi *NvidiaSMI) GatherModel() ([]string, error) { + return smi.gatherModel() +} + +func (smi *NvidiaSMI) GatherUsage() ([]float64, error) { + return smi.gatherUsage() +} + +// GatherDetailedInfo 获取详细GPU信息 +func (smi *NvidiaSMI) GatherDetailedInfo() ([]NVIDIAGPUInfo, error) { + return smi.gatherDetailedInfo() +} + +func (smi *NvidiaSMI) Start() error { + if _, err := os.Stat(smi.BinPath); os.IsNotExist(err) { + binPath, err := exec.LookPath("nvidia-smi") + if err != nil { + return errors.New("nvidia-smi tool not found") + } + smi.BinPath = binPath + } + smi.data = smi.pollNvidiaSMI() + return nil +} + +func (smi *NvidiaSMI) pollNvidiaSMI() []byte { + cmd := exec.Command(smi.BinPath, "-q", "-x") + output, err := cmd.CombinedOutput() + if err != nil { + return nil + } + return output +} + +func (smi *NvidiaSMI) gatherModel() ([]string, error) { + var stats nvidiaSMIXMLResult + var models []string + + if err := xml.Unmarshal(smi.data, &stats); err != nil { + return nil, err + } + + for _, gpu := range stats.GPUs { + if gpu.ProductName != "" { + models = append(models, gpu.ProductName) + } + } + + return models, nil +} + +func (smi *NvidiaSMI) gatherUsage() ([]float64, error) { + var stats nvidiaSMIXMLResult + var usageList []float64 + + if err := xml.Unmarshal(smi.data, &stats); err != nil { + return nil, err + } + + for _, gpu := range stats.GPUs { + usage, err := parsePercentageValue(gpu.Utilization.GPUUtil) + if err != nil { + usage = 0.0 // 默认为0,不中断处理 + } + usageList = append(usageList, usage) + } + + return usageList, nil +} + +func (smi *NvidiaSMI) gatherDetailedInfo() ([]NVIDIAGPUInfo, error) { + var stats nvidiaSMIXMLResult + var gpuInfos []NVIDIAGPUInfo + + if err := xml.Unmarshal(smi.data, &stats); err != nil { + return nil, err + } + + for _, gpu := range stats.GPUs { + utilization, _ := parsePercentageValue(gpu.Utilization.GPUUtil) + memTotal, _ := parseMemoryValue(gpu.FrameBufferMemoryUsage.Total) + memUsed, _ := parseMemoryValue(gpu.FrameBufferMemoryUsage.Used) + temp, _ := parseTemperatureValue(gpu.Temperature.GPUTemp) + + gpuInfo := NVIDIAGPUInfo{ + Name: gpu.ProductName, + MemoryTotal: memTotal, + MemoryUsed: memUsed, + Utilization: utilization, + Temperature: temp, + } + + gpuInfos = append(gpuInfos, gpuInfo) + } + + return gpuInfos, nil +} + +// 解析百分比值 (例如 "25 %" -> 25.0) +func parsePercentageValue(value string) (float64, error) { + cleaned := strings.TrimSpace(value) + cleaned = strings.TrimSuffix(cleaned, "%") + cleaned = strings.TrimSpace(cleaned) + + if cleaned == "" { + return 0.0, nil + } + + result, err := strconv.ParseFloat(cleaned, 64) + if err != nil { + return 0.0, err + } + + return result, nil +} + +// 解析内存值 (例如 "1024 MiB" -> 1073741824字节) +func parseMemoryValue(value string) (uint64, error) { + cleaned := strings.TrimSpace(value) + cleaned = strings.TrimSuffix(cleaned, "MiB") + cleaned = strings.TrimSpace(cleaned) + + if cleaned == "" { + return 0, nil + } + + result, err := strconv.ParseUint(cleaned, 10, 64) + if err != nil { + return 0, err + } + + // 转换MiB为字节 (1 MiB = 1024*1024 bytes) + return result * 1024 * 1024, nil +} + +// 解析温度值 (例如 "65 C" -> 65) +func parseTemperatureValue(value string) (uint64, error) { + cleaned := strings.TrimSpace(value) + cleaned = strings.TrimSuffix(cleaned, "C") + cleaned = strings.TrimSpace(cleaned) + + if cleaned == "" { + return 0, nil + } + + result, err := strconv.ParseUint(cleaned, 10, 64) + if err != nil { + return 0, err + } + + return result, nil +} + +// NVIDIA-SMI XML结构定义 +type nvidiaSMIXMLResult struct { + GPUs []nvidiaSMIGPU `xml:"gpu"` +} + +type nvidiaSMIGPU struct { + ProductName string `xml:"product_name"` + Utilization struct { + GPUUtil string `xml:"gpu_util"` + } `xml:"utilization"` + FrameBufferMemoryUsage struct { + Total string `xml:"total"` + Used string `xml:"used"` + Free string `xml:"free"` + } `xml:"fb_memory_usage"` + Temperature struct { + GPUTemp string `xml:"gpu_temp"` + } `xml:"temperature"` +} From ad3c02c22c281b98fe7acecd14969da0957e9f24 Mon Sep 17 00:00:00 2001 From: kdwycz Date: Mon, 15 Sep 2025 20:23:56 +0800 Subject: [PATCH 2/3] Merge branch 'main' into dev --- install.ps1 | 1 + install.sh | 39 +++++++-- monitoring/unit/mem.go | 2 +- monitoring/unit/os_linux.go | 163 +++++++++++++++++++++++++++++++----- 4 files changed, 176 insertions(+), 29 deletions(-) diff --git a/install.ps1 b/install.ps1 index a5869ba..2579296 100644 --- a/install.ps1 +++ b/install.ps1 @@ -40,6 +40,7 @@ if ($GitHubProxy -ne '') { $ProxyDisplay = $GitHubProxy } else { $ProxyDisplay = switch ($env:PROCESSOR_ARCHITECTURE) { 'AMD64' { $arch = 'amd64' } 'ARM64' { $arch = 'arm64' } + 'x86' { $arch = '386' } Default { Log-Error "Unsupported architecture: $env:PROCESSOR_ARCHITECTURE"; exit 1 } } diff --git a/install.sh b/install.sh index 4d68bd7..8d24b33 100755 --- a/install.sh +++ b/install.sh @@ -57,6 +57,13 @@ case $os_type in Linux) os_name="linux" ;; + FreeBSD) + os_name="freebsd" + ;; + MINGW*|MSYS*|CYGWIN*) + os_name="windows" + target_dir="/c/komari" # Use C:\komari on Windows + ;; *) log_error "Unsupported operating system: $os_type" exit 1 @@ -295,23 +302,45 @@ install_dependencies # Install vnstat if needed for month-rotate install_vnstat +# Architecture detection with platform-specific support arch=$(uname -m) case $arch in x86_64) arch="amd64" ;; - aarch64) + aarch64|arm64) arch="arm64" ;; - arm64) - arch="arm64" + i386|i686) + # x86 (32-bit) support + case $os_name in + freebsd|linux|windows) + arch="386" + ;; + *) + log_error "32-bit x86 architecture not supported on $os_name" + exit 1 + ;; + esac + ;; + armv7*|armv6*) + # ARM 32-bit support + case $os_name in + freebsd|linux) + arch="arm" + ;; + *) + log_error "32-bit ARM architecture not supported on $os_name" + exit 1 + ;; + esac ;; *) - log_error "Unsupported architecture: $arch" + log_error "Unsupported architecture: $arch on $os_name" exit 1 ;; esac -log_info "Detected architecture: ${GREEN}$arch${NC}" +log_info "Detected OS: ${GREEN}$os_name${NC}, Architecture: ${GREEN}$arch${NC}" version_to_install="latest" if [ -n "$install_version" ]; then diff --git a/monitoring/unit/mem.go b/monitoring/unit/mem.go index 4926398..1b8f7e2 100644 --- a/monitoring/unit/mem.go +++ b/monitoring/unit/mem.go @@ -24,7 +24,7 @@ func Ram() RamInfo { return raminfo } raminfo.Total = v.Total - raminfo.Used = v.Used + raminfo.Used = v.Total - v.Available return raminfo } diff --git a/monitoring/unit/os_linux.go b/monitoring/unit/os_linux.go index eb2fae0..3caa991 100644 --- a/monitoring/unit/os_linux.go +++ b/monitoring/unit/os_linux.go @@ -11,14 +11,21 @@ import ( ) func OSName() string { + // Check if it's an Android system + if androidVersion := detectAndroid(); androidVersion != "" { + return androidVersion + } + + // Check if it's a Proxmox VE if pveVersion := detectProxmoxVE(); pveVersion != "" { return pveVersion } - + + // Check if it's a Synology if synologyName := detectSynology(); synologyName != "" { return synologyName } - + file, err := os.Open("/etc/os-release") if err != nil { return "Linux" @@ -45,7 +52,7 @@ func detectSynology() string { "/etc/synoinfo.conf", "/etc.defaults/synoinfo.conf", } - + for _, file := range synologyFiles { if info, err := os.Stat(file); err == nil && !info.IsDir() { if synologyInfo := readSynologyInfo(file); synologyInfo != "" { @@ -53,11 +60,11 @@ func detectSynology() string { } } } - + if info, err := os.Stat("/usr/syno"); err == nil && info.IsDir() { return "Synology DSM" } - + return "" } @@ -67,37 +74,37 @@ func readSynologyInfo(filename string) string { return "" } defer file.Close() - + var unique, udcCheckState string - + scanner := bufio.NewScanner(file) for scanner.Scan() { line := strings.TrimSpace(scanner.Text()) - + if strings.HasPrefix(line, "unique=") { unique = strings.Trim(strings.TrimPrefix(line, "unique="), `"`) } else if strings.HasPrefix(line, "udc_check_state=") { udcCheckState = strings.Trim(strings.TrimPrefix(line, "udc_check_state="), `"`) } } - + if unique != "" && strings.Contains(unique, "synology_") { parts := strings.Split(unique, "_") if len(parts) >= 3 { model := strings.ToUpper(parts[len(parts)-1]) - + result := "Synology " + model - + if udcCheckState != "" { result += " DSM " + udcCheckState } else { result += " DSM" } - + return result } } - + return "" } @@ -105,21 +112,21 @@ func detectProxmoxVE() string { if _, err := exec.LookPath("pveversion"); err != nil { return "" } - + out, err := exec.Command("pveversion").Output() if err != nil { return "" } - + output := strings.TrimSpace(string(out)) lines := strings.Split(output, "\n") - + var version string var codename string - + for _, line := range lines { line = strings.TrimSpace(line) - + if strings.HasPrefix(line, "pve-manager/") { parts := strings.Split(line, "/") if len(parts) >= 2 { @@ -130,9 +137,9 @@ func detectProxmoxVE() string { version = versionPart } } - + } - + if version != "" { if file, err := os.Open("/etc/os-release"); err == nil { defer file.Close() @@ -146,23 +153,133 @@ func detectProxmoxVE() string { } } } - + if version != "" { if codename != "" { return "Proxmox VE " + version + " (" + codename + ")" } return "Proxmox VE " + version } - + return "Proxmox VE" } +// detectAndroid detects if the system is Android and returns detailed information +func detectAndroid() string { + // 1. Try to get Android version info using the getprop command + cmdGetprop := exec.Command("getprop", "ro.build.version.release") + version, err := cmdGetprop.Output() + if err == nil && len(version) > 0 { + versionStr := strings.TrimSpace(string(version)) + + // Get device model + cmdGetModel := exec.Command("getprop", "ro.product.model") + model, err2 := cmdGetModel.Output() + modelStr := strings.TrimSpace(string(model)) + + // Get manufacturer information + cmdGetBrand := exec.Command("getprop", "ro.product.brand") + brand, err3 := cmdGetBrand.Output() + brandStr := strings.TrimSpace(string(brand)) + + result := "Android " + versionStr + + // Add brand and model information (if available) + if err2 == nil && modelStr != "" { + if err3 == nil && brandStr != "" && brandStr != modelStr { + result += " (" + brandStr + " " + modelStr + ")" + } else { + result += " (" + modelStr + ")" + } + } + + return result + } + + // 2. Try to check Android system build.prop file + if _, err := os.Stat("/system/build.prop"); err == nil { + return readAndroidBuildProp() + } + + // 3. Check for typical Android directory structure + if isAndroidSystem() { + return "Android" + } + + return "" +} + +// readAndroidBuildProp reads Android version information from build.prop file +func readAndroidBuildProp() string { + file, err := os.Open("/system/build.prop") + if err != nil { + return "Android" + } + defer file.Close() + + var version, model, brand string + + scanner := bufio.NewScanner(file) + for scanner.Scan() { + line := strings.TrimSpace(scanner.Text()) + + if strings.HasPrefix(line, "ro.build.version.release=") { + version = strings.TrimPrefix(line, "ro.build.version.release=") + } else if strings.HasPrefix(line, "ro.product.model=") { + model = strings.TrimPrefix(line, "ro.product.model=") + } else if strings.HasPrefix(line, "ro.product.brand=") { + brand = strings.TrimPrefix(line, "ro.product.brand=") + } + + // If all information has been collected, we can exit early + if version != "" && model != "" && brand != "" { + break + } + } + + if version != "" { + result := "Android " + version + + if model != "" { + if brand != "" && brand != model { + result += " (" + brand + " " + model + ")" + } else { + result += " (" + model + ")" + } + } + + return result + } + + return "Android" +} + +// isAndroidSystem determines if the system is Android by checking typical directory structure +func isAndroidSystem() bool { + androidDirs := []string{ + "/system/app", + "/system/priv-app", + "/data/app", + "/sdcard", + } + + dirCount := 0 + for _, dir := range androidDirs { + if info, err := os.Stat(dir); err == nil && info.IsDir() { + dirCount++ + } + } + + // Consider it an Android system only if at least 2 directories exist + return dirCount >= 2 +} + // KernelVersion returns the kernel version on Linux systems func KernelVersion() string { out, err := exec.Command("uname", "-r").Output() if err != nil { return "Unknown" } - + return strings.TrimSpace(string(out)) } From 396fe5cfc28eef6ee6d96a153c74ca8abbfab00a Mon Sep 17 00:00:00 2001 From: Akizon77 Date: Tue, 16 Sep 2025 00:28:58 +0800 Subject: [PATCH 3/3] =?UTF-8?q?fix:=20=E6=8E=92=E9=99=A4hugetlbfs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- monitoring/unit/disk.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/monitoring/unit/disk.go b/monitoring/unit/disk.go index 6bd01dd..d5554ba 100644 --- a/monitoring/unit/disk.go +++ b/monitoring/unit/disk.go @@ -77,6 +77,7 @@ func isPhysicalDisk(part disk.PartitionStat) bool { "/dev/mqueue", "/etc/resolv.conf", "/etc/host", // /etc/hosts,/etc/hostname + "/dev/hugepages", } for _, mp := range mountpointsToExclude { if mountpoint == mp || strings.HasPrefix(mountpoint, mp) { @@ -100,6 +101,7 @@ func isPhysicalDisk(part disk.PartitionStat) bool { "sysfs", "cgroup", "mqueue", + "hugetlbfs", } for _, fs := range fstypeToExclude { if fstype == fs || strings.HasPrefix(fstype, fs) {