remove nvidia-smi dependency for jetson / tegrastats (#286)

This commit is contained in:
henrygd
2025-04-07 20:01:03 -04:00
parent 93c3c7b9d8
commit d79111fce4
2 changed files with 17 additions and 36 deletions

View File

@@ -125,14 +125,13 @@ func (gm *GPUManager) getJetsonParser() func(output []byte) bool {
// TODO: Maybe use VDD_IN for Nano / NX and add a total system power chart // TODO: Maybe use VDD_IN for Nano / NX and add a total system power chart
powerPattern := regexp.MustCompile(`(GPU_SOC|CPU_GPU_CV) (\d+)mW`) powerPattern := regexp.MustCompile(`(GPU_SOC|CPU_GPU_CV) (\d+)mW`)
// jetson devices have only one gpu so we'll just initialize here
gpuData := &system.GPUData{Name: "GPU"}
gm.GpuDataMap["0"] = gpuData
return func(output []byte) bool { return func(output []byte) bool {
gm.Lock() gm.Lock()
defer gm.Unlock() defer gm.Unlock()
// we get gpu name from the initial run of nvidia-smi, so return if it hasn't been initialized
gpuData, ok := gm.GpuDataMap["0"]
if !ok {
return true
}
// Parse RAM usage // Parse RAM usage
ramMatches := ramPattern.FindSubmatch(output) ramMatches := ramPattern.FindSubmatch(output)
if ramMatches != nil { if ramMatches != nil {
@@ -184,12 +183,6 @@ func (gm *GPUManager) parseNvidiaData(output []byte) bool {
if _, ok := gm.GpuDataMap[id]; !ok { if _, ok := gm.GpuDataMap[id]; !ok {
name := strings.TrimPrefix(fields[1], "NVIDIA ") name := strings.TrimPrefix(fields[1], "NVIDIA ")
gm.GpuDataMap[id] = &system.GPUData{Name: strings.TrimSuffix(name, " Laptop GPU")} gm.GpuDataMap[id] = &system.GPUData{Name: strings.TrimSuffix(name, " Laptop GPU")}
// check if tegrastats is active - if so we will only use nvidia-smi to get gpu name
// - nvidia-smi does not provide metrics for tegra / jetson devices
// this will end the nvidia-smi collector
if gm.tegrastats {
return false
}
} }
// update gpu data // update gpu data
gpu := gm.GpuDataMap[id] gpu := gm.GpuDataMap[id]
@@ -283,6 +276,7 @@ func (gm *GPUManager) detectGPUs() error {
} }
if _, err := exec.LookPath(tegraStatsCmd); err == nil { if _, err := exec.LookPath(tegraStatsCmd); err == nil {
gm.tegrastats = true gm.tegrastats = true
gm.nvidiaSmi = false
} }
if gm.nvidiaSmi || gm.rocmSmi || gm.tegrastats { if gm.nvidiaSmi || gm.rocmSmi || gm.tegrastats {
return nil return nil
@@ -297,9 +291,11 @@ func (gm *GPUManager) startCollector(command string) {
} }
switch command { switch command {
case nvidiaSmiCmd: case nvidiaSmiCmd:
collector.cmdArgs = []string{"-l", nvidiaSmiInterval, collector.cmdArgs = []string{
"-l", nvidiaSmiInterval,
"--query-gpu=index,name,temperature.gpu,memory.used,memory.total,utilization.gpu,power.draw", "--query-gpu=index,name,temperature.gpu,memory.used,memory.total,utilization.gpu,power.draw",
"--format=csv,noheader,nounits"} "--format=csv,noheader,nounits",
}
collector.parse = gm.parseNvidiaData collector.parse = gm.parseNvidiaData
go collector.start() go collector.start()
case tegraStatsCmd: case tegraStatsCmd:

View File

@@ -251,14 +251,13 @@ func TestParseJetsonData(t *testing.T) {
tests := []struct { tests := []struct {
name string name string
input string input string
gm *GPUManager
wantMetrics *system.GPUData wantMetrics *system.GPUData
}{ }{
{ {
name: "valid data", name: "valid data",
input: "11-14-2024 22:54:33 RAM 4300/30698MB GR3D_FREQ 45% tj@52.468C VDD_GPU_SOC 2171mW", input: "11-14-2024 22:54:33 RAM 4300/30698MB GR3D_FREQ 45% tj@52.468C VDD_GPU_SOC 2171mW",
wantMetrics: &system.GPUData{ wantMetrics: &system.GPUData{
Name: "Jetson", Name: "GPU",
MemoryUsed: 4300.0, MemoryUsed: 4300.0,
MemoryTotal: 30698.0, MemoryTotal: 30698.0,
Usage: 45.0, Usage: 45.0,
@@ -271,7 +270,7 @@ func TestParseJetsonData(t *testing.T) {
name: "more valid data", name: "more valid data",
input: "11-15-2024 08:38:09 RAM 6185/7620MB (lfb 8x2MB) SWAP 851/3810MB (cached 1MB) CPU [15%@729,11%@729,14%@729,13%@729,11%@729,8%@729] EMC_FREQ 43%@2133 GR3D_FREQ 63%@[621] NVDEC off NVJPG off NVJPG1 off VIC off OFA off APE 200 cpu@53.968C soc2@52.437C soc0@50.75C gpu@53.343C tj@53.968C soc1@51.656C VDD_IN 12479mW/12479mW VDD_CPU_GPU_CV 4667mW/4667mW VDD_SOC 2817mW/2817mW", input: "11-15-2024 08:38:09 RAM 6185/7620MB (lfb 8x2MB) SWAP 851/3810MB (cached 1MB) CPU [15%@729,11%@729,14%@729,13%@729,11%@729,8%@729] EMC_FREQ 43%@2133 GR3D_FREQ 63%@[621] NVDEC off NVJPG off NVJPG1 off VIC off OFA off APE 200 cpu@53.968C soc2@52.437C soc0@50.75C gpu@53.343C tj@53.968C soc1@51.656C VDD_IN 12479mW/12479mW VDD_CPU_GPU_CV 4667mW/4667mW VDD_SOC 2817mW/2817mW",
wantMetrics: &system.GPUData{ wantMetrics: &system.GPUData{
Name: "Jetson", Name: "GPU",
MemoryUsed: 6185.0, MemoryUsed: 6185.0,
MemoryTotal: 7620.0, MemoryTotal: 7620.0,
Usage: 63.0, Usage: 63.0,
@@ -284,7 +283,7 @@ func TestParseJetsonData(t *testing.T) {
name: "missing temperature", name: "missing temperature",
input: "11-14-2024 22:54:33 RAM 4300/30698MB GR3D_FREQ 45% VDD_GPU_SOC 2171mW", input: "11-14-2024 22:54:33 RAM 4300/30698MB GR3D_FREQ 45% VDD_GPU_SOC 2171mW",
wantMetrics: &system.GPUData{ wantMetrics: &system.GPUData{
Name: "Jetson", Name: "GPU",
MemoryUsed: 4300.0, MemoryUsed: 4300.0,
MemoryTotal: 30698.0, MemoryTotal: 30698.0,
Usage: 45.0, Usage: 45.0,
@@ -292,32 +291,18 @@ func TestParseJetsonData(t *testing.T) {
Count: 1, Count: 1,
}, },
}, },
{
name: "no gpu defined by nvidia-smi",
input: "11-14-2024 22:54:33 RAM 4300/30698MB GR3D_FREQ 45% VDD_GPU_SOC 2171mW",
gm: &GPUManager{
GpuDataMap: map[string]*system.GPUData{},
},
},
} }
for _, tt := range tests { for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) { t.Run(tt.name, func(t *testing.T) {
if tt.gm != nil { gm := &GPUManager{
// should return if no gpu set by nvidia-smi GpuDataMap: make(map[string]*system.GPUData),
assert.Empty(t, tt.gm.GpuDataMap)
return
} }
tt.gm = &GPUManager{ parser := gm.getJetsonParser()
GpuDataMap: map[string]*system.GPUData{
"0": {Name: "Jetson"},
},
}
parser := tt.gm.getJetsonParser()
valid := parser([]byte(tt.input)) valid := parser([]byte(tt.input))
assert.Equal(t, true, valid) assert.Equal(t, true, valid)
got := tt.gm.GpuDataMap["0"] got := gm.GpuDataMap["0"]
require.NotNil(t, got) require.NotNil(t, got)
assert.Equal(t, tt.wantMetrics.Name, got.Name) assert.Equal(t, tt.wantMetrics.Name, got.Name)
assert.InDelta(t, tt.wantMetrics.MemoryUsed, got.MemoryUsed, 0.01) assert.InDelta(t, tt.wantMetrics.MemoryUsed, got.MemoryUsed, 0.01)
@@ -443,7 +428,7 @@ echo "test"`
} }
return nil return nil
}, },
wantNvidiaSmi: true, wantNvidiaSmi: false,
wantRocmSmi: true, wantRocmSmi: true,
wantTegrastats: true, wantTegrastats: true,
wantErr: false, wantErr: false,