From d185dfdef866225b7e925730c26ef56d13db598c Mon Sep 17 00:00:00 2001 From: Links <61582001+Links17@users.noreply.github.com> Date: Thu, 23 Jan 2025 17:32:28 +0800 Subject: [PATCH 1/3] get Jetson GPU Information --- beszel/internal/agent/gpu.go | 62 ++++++++++++++++++++++++++++++++++-- 1 file changed, 60 insertions(+), 2 deletions(-) diff --git a/beszel/internal/agent/gpu.go b/beszel/internal/agent/gpu.go index c7afab5..3ecadfc 100644 --- a/beszel/internal/agent/gpu.go +++ b/beszel/internal/agent/gpu.go @@ -6,6 +6,7 @@ import ( "encoding/json" "fmt" "os/exec" + "regexp" "strconv" "strings" "sync" @@ -18,6 +19,7 @@ import ( type GPUManager struct { nvidiaSmi bool rocmSmi bool + tegrastats bool GpuDataMap map[string]*system.GPUData mutex sync.Mutex } @@ -89,6 +91,47 @@ func (c *gpuCollector) collect() error { return c.cmd.Wait() } +// parseJetsonData parses the output of rtegrastats and updates the GPUData map +func (gm *GPUManager) parseJetsonData(output []byte) bool { + data := string(output) + ramPattern := regexp.MustCompile(`RAM (\d+)/(\d+)MB`) + gr3dPattern := regexp.MustCompile(`GR3D_FREQ (\d+)%`) + tempPattern := regexp.MustCompile(`([a-z0-9_]+)@(\d+\.?\d*)C`) + powerPattern := regexp.MustCompile(`VDD_GPU_SOC (\d+)mW`) + gm.mutex.Lock() + defer gm.mutex.Unlock() + gpuData := gm.GpuDataMap["0"] + // Parse RAM usage + ramMatches := ramPattern.FindStringSubmatch(data) + if ramMatches != nil { + gpuData.MemoryUsed, _ = strconv.ParseFloat(ramMatches[1], 64) + gpuData.MemoryTotal, _ = strconv.ParseFloat(ramMatches[2], 64) + } + // Parse GR3D (GPU) usage + gr3dMatches := gr3dPattern.FindStringSubmatch(data) + if gr3dMatches != nil { + usage, _ := strconv.ParseFloat(gr3dMatches[1], 64) + gpuData.Usage = usage / 100 + } + + tempMatches := tempPattern.FindAllStringSubmatch(data, -1) + for _, match := range tempMatches { + if match[1] == "cpu" { + gpuData.Temperature, _ = strconv.ParseFloat(match[2], 64) + break + } + } + + // Parse power usage + powerMatches := 
powerPattern.FindStringSubmatch(data) + if powerMatches != nil { + power, _ := strconv.ParseFloat(powerMatches[1], 64) + gpuData.Power = power / 1000 + } + gpuData.Count++ + return true +} + // parseNvidiaData parses the output of nvidia-smi and updates the GPUData map func (gm *GPUManager) parseNvidiaData(output []byte) bool { fields := strings.Split(string(output), ", ") @@ -200,10 +243,14 @@ func (gm *GPUManager) detectGPUs() error { if err := exec.Command("rocm-smi").Run(); err == nil { gm.rocmSmi = true } - if gm.nvidiaSmi || gm.rocmSmi { + _, err := exec.LookPath("tegrastats") + if err == nil { + gm.tegrastats = true + } + if gm.nvidiaSmi || gm.rocmSmi || gm.tegrastats { return nil } - return fmt.Errorf("no GPU found - install nvidia-smi or rocm-smi") + return fmt.Errorf("no GPU found - install nvidia-smi or rocm-smi or tegrastats") } // startCollector starts the appropriate GPU data collector based on the command @@ -226,7 +273,15 @@ func (gm *GPUManager) startCollector(command string) { parse: gm.parseAmdData, } go amdCollector.start() + case "tegrastats": + jetsonCollector := gpuCollector{ + name: "tegrastats", + cmd: exec.Command("tegrastats"), + parse: gm.parseJetsonData, + } + go jetsonCollector.start() } + } // NewGPUManager creates and initializes a new GPUManager @@ -243,6 +298,9 @@ func NewGPUManager() (*GPUManager, error) { if gm.rocmSmi { gm.startCollector("rocm-smi") } + if gm.tegrastats { + gm.startCollector("tegrastats") + } return &gm, nil } From c157f389573a2e6b4d45c48c9097c7f49f6ae949 Mon Sep 17 00:00:00 2001 From: hank Date: Fri, 24 Jan 2025 22:07:37 -0500 Subject: [PATCH 2/3] gpu: Add closure for Jetson and improve compatibility --- beszel/internal/agent/gpu.go | 76 ++++++++++++++++++------------------ 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/beszel/internal/agent/gpu.go b/beszel/internal/agent/gpu.go index 3ecadfc..e33c292 100644 --- a/beszel/internal/agent/gpu.go +++ b/beszel/internal/agent/gpu.go @@ -91,45 +91,46 
@@ func (c *gpuCollector) collect() error { return c.cmd.Wait() } -// parseJetsonData parses the output of rtegrastats and updates the GPUData map -func (gm *GPUManager) parseJetsonData(output []byte) bool { - data := string(output) +// getJetsonParser returns a function to parse the output of tegrastats and update the GPUData map +func (gm *GPUManager) getJetsonParser() func(output []byte) bool { + // use closure to avoid recompiling the regex ramPattern := regexp.MustCompile(`RAM (\d+)/(\d+)MB`) gr3dPattern := regexp.MustCompile(`GR3D_FREQ (\d+)%`) - tempPattern := regexp.MustCompile(`([a-z0-9_]+)@(\d+\.?\d*)C`) - powerPattern := regexp.MustCompile(`VDD_GPU_SOC (\d+)mW`) - gm.mutex.Lock() - defer gm.mutex.Unlock() - gpuData := gm.GpuDataMap["0"] - // Parse RAM usage - ramMatches := ramPattern.FindStringSubmatch(data) - if ramMatches != nil { - gpuData.MemoryUsed, _ = strconv.ParseFloat(ramMatches[1], 64) - gpuData.MemoryTotal, _ = strconv.ParseFloat(ramMatches[2], 64) - } - // Parse GR3D (GPU) usage - gr3dMatches := gr3dPattern.FindStringSubmatch(data) - if gr3dMatches != nil { - usage, _ := strconv.ParseFloat(gr3dMatches[1], 64) - gpuData.Usage = usage / 100 - } + tempPattern := regexp.MustCompile(`tj@(\d+\.?\d*)C`) + // Orin Nano / NX do not have GPU specific power monitor + // TODO: Maybe use VDD_IN for Nano / NX and add a total system power chart + powerPattern := regexp.MustCompile(`(GPU_SOC|CPU_GPU_CV) (\d+)mW`) - tempMatches := tempPattern.FindAllStringSubmatch(data, -1) - for _, match := range tempMatches { - if match[1] == "cpu" { - gpuData.Temperature, _ = strconv.ParseFloat(match[2], 64) - break + return func(output []byte) bool { + gm.mutex.Lock() + defer gm.mutex.Unlock() + data := string(output) + gpuData := gm.GpuDataMap["0"] + // Parse RAM usage + ramMatches := ramPattern.FindStringSubmatch(data) + if ramMatches != nil { + gpuData.MemoryUsed, _ = strconv.ParseFloat(ramMatches[1], 64) + gpuData.MemoryTotal, _ = strconv.ParseFloat(ramMatches[2], 64) 
} + // Parse GR3D (GPU) usage + gr3dMatches := gr3dPattern.FindStringSubmatch(data) + if gr3dMatches != nil { + gpuData.Usage, _ = strconv.ParseFloat(gr3dMatches[1], 64) + } + // Parse temperature + tempMatches := tempPattern.FindStringSubmatch(data) + if tempMatches != nil { + gpuData.Temperature, _ = strconv.ParseFloat(tempMatches[1], 64) + } + // Parse power usage (submatch 1 is the rail name, submatch 2 is the mW reading) + powerMatches := powerPattern.FindStringSubmatch(data) + if powerMatches != nil { + power, _ := strconv.ParseFloat(powerMatches[2], 64) + gpuData.Power = power / 1000 + } + gpuData.Count++ + return true } - - // Parse power usage - powerMatches := powerPattern.FindStringSubmatch(data) - if powerMatches != nil { - power, _ := strconv.ParseFloat(powerMatches[1], 64) - gpuData.Power = power / 1000 - } - gpuData.Count++ - return true } // parseNvidiaData parses the output of nvidia-smi and updates the GPUData map @@ -250,7 +251,7 @@ func (gm *GPUManager) detectGPUs() error { if gm.nvidiaSmi || gm.rocmSmi || gm.tegrastats { return nil } - return fmt.Errorf("no GPU found - install nvidia-smi or rocm-smi or tegrastats") + return fmt.Errorf("no GPU found - install nvidia-smi, rocm-smi, or tegrastats") } // startCollector starts the appropriate GPU data collector based on the command @@ -276,12 +277,11 @@ func (gm *GPUManager) startCollector(command string) { case "tegrastats": jetsonCollector := gpuCollector{ name: "tegrastats", - cmd: exec.Command("tegrastats"), - parse: gm.parseJetsonData, + cmd: exec.Command("tegrastats", "--interval", "3000"), + parse: gm.getJetsonParser(), } go jetsonCollector.start() } - } // NewGPUManager creates and initializes a new GPUManager From 76347f25e51b8b9f600dbb18f68b38414626b805 Mon Sep 17 00:00:00 2001 From: hank Date: Fri, 24 Jan 2025 23:12:39 -0500 Subject: [PATCH 3/3] fix(gpu): prevent nvidia-smi from running on tegra devices --- beszel/internal/agent/gpu.go | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git 
a/beszel/internal/agent/gpu.go b/beszel/internal/agent/gpu.go index e33c292..01f14e9 100644 --- a/beszel/internal/agent/gpu.go +++ b/beszel/internal/agent/gpu.go @@ -74,17 +74,13 @@ func (c *gpuCollector) collect() error { buf := make([]byte, 0, 8*1024) scanner.Buffer(buf, bufio.MaxScanTokenSize) - hasValidData := false for scanner.Scan() { - if c.parse(scanner.Bytes()) { - hasValidData = true + hasValidData := c.parse(scanner.Bytes()) + if !hasValidData { + return errNoValidData } } - if !hasValidData { - return errNoValidData - } - if err := scanner.Err(); err != nil { return fmt.Errorf("scanner error: %w", err) } @@ -104,8 +100,12 @@ func (gm *GPUManager) getJetsonParser() func(output []byte) bool { return func(output []byte) bool { gm.mutex.Lock() defer gm.mutex.Unlock() + // we get gpu name from the initial run of nvidia-smi, so return if it hasn't been initialized + gpuData, ok := gm.GpuDataMap["0"] + if !ok { + return true + } data := string(output) - gpuData := gm.GpuDataMap["0"] // Parse RAM usage ramMatches := ramPattern.FindStringSubmatch(data) if ramMatches != nil { @@ -156,6 +156,12 @@ func (gm *GPUManager) parseNvidiaData(output []byte) bool { if _, ok := gm.GpuDataMap[id]; !ok { name := strings.TrimPrefix(fields[1], "NVIDIA ") gm.GpuDataMap[id] = &system.GPUData{Name: strings.TrimSuffix(name, " Laptop GPU")} + // check if tegrastats is active - if so we will only use nvidia-smi to get gpu name + // - nvidia-smi does not provide metrics for tegra / jetson devices + // this will end the nvidia-smi collector + if gm.tegrastats { + return false + } } // update gpu data gpu := gm.GpuDataMap[id] @@ -235,17 +241,18 @@ func (gm *GPUManager) GetCurrentData() map[string]system.GPUData { return gpuData } -// detectGPUs returns the GPU brand (nvidia or amd) or an error if none is found -// todo: make sure there's actually a GPU, not just if the command exists +// detectGPUs checks for the presence of GPU management tools (nvidia-smi, rocm-smi, tegrastats) +// 
in the system path. It sets the corresponding flags in the GPUManager struct if any of these +// tools are found. If none of the tools are found, it returns an error indicating that no GPU +// management tools are available. func (gm *GPUManager) detectGPUs() error { - if err := exec.Command("nvidia-smi").Run(); err == nil { + if _, err := exec.LookPath("nvidia-smi"); err == nil { gm.nvidiaSmi = true } - if err := exec.Command("rocm-smi").Run(); err == nil { + if _, err := exec.LookPath("rocm-smi"); err == nil { gm.rocmSmi = true } - _, err := exec.LookPath("tegrastats") - if err == nil { + if _, err := exec.LookPath("tegrastats"); err == nil { gm.tegrastats = true } if gm.nvidiaSmi || gm.rocmSmi || gm.tegrastats {