diff --git a/beszel/internal/agent/agent.go b/beszel/internal/agent/agent.go index 7440bd1..a381bc0 100644 --- a/beszel/internal/agent/agent.go +++ b/beszel/internal/agent/agent.go @@ -78,7 +78,7 @@ func (a *Agent) Run(pubKey []byte, addr string) { // initialize GPU manager if os.Getenv("GPU") == "true" { if gm, err := NewGPUManager(); err != nil { - slog.Error("GPU manager", "err", err) + slog.Warn("GPU", "err", err) } else { a.gpuManager = gm } diff --git a/beszel/internal/agent/gpu.go b/beszel/internal/agent/gpu.go index de064fb..c36070f 100644 --- a/beszel/internal/agent/gpu.go +++ b/beszel/internal/agent/gpu.go @@ -93,9 +93,9 @@ func (gm *GPUManager) parseNvidiaData(output []byte) { } // update gpu data gpu := gm.GpuDataMap[id] - gpu.Temperature += temp - gpu.MemoryUsed += memoryUsage / 1.024 - gpu.MemoryTotal += totalMemory / 1.024 + gpu.Temperature = temp + gpu.MemoryUsed = memoryUsage / 1.024 + gpu.MemoryTotal = totalMemory / 1.024 gpu.Usage += usage gpu.Power += power gpu.Count++ @@ -120,7 +120,7 @@ func (gm *GPUManager) startAmdCollector() { // collectAmdStats runs rocm-smi in a loop and passes the output to parseAmdData func (gm *GPUManager) collectAmdStats() error { - cmd := exec.Command("/bin/sh", "-c", "while true; do rocm-smi --showid --showtemp --showuse --showpower --showproductname --showmeminfo vram --json; sleep 4.7; done") + cmd := exec.Command("/bin/sh", "-c", "while true; do rocm-smi --showid --showtemp --showuse --showpower --showproductname --showmeminfo vram --json; sleep 3.7; done") // Set up a pipe to capture stdout stdout, err := cmd.StdoutPipe() if err != nil { @@ -168,9 +168,9 @@ func (gm *GPUManager) parseAmdData(rocmSmiInfo *map[string]RocmSmiJson) { gm.GpuDataMap[v.ID] = &system.GPUData{Name: v.Name} } gpu := gm.GpuDataMap[v.ID] - gpu.Temperature += temp - gpu.MemoryUsed += memoryUsage - gpu.MemoryTotal += totalMemory + gpu.Temperature = temp + gpu.MemoryUsed = memoryUsage + gpu.MemoryTotal = totalMemory gpu.Usage += usage gpu.Power += power gpu.Count++ @@ -185,19 +185,14 @@ func (gm *GPUManager) GetCurrentData() map[string]system.GPUData { gpuData := make(map[string]system.GPUData, len(gm.GpuDataMap)) for id, gpu := range gm.GpuDataMap { // sum the data - gpu.Temperature = twoDecimals(gpu.Temperature / gpu.Count) - gpu.MemoryUsed = twoDecimals(gpu.MemoryUsed / gpu.Count) - gpu.MemoryTotal = twoDecimals(gpu.MemoryTotal / gpu.Count) + gpu.Temperature = twoDecimals(gpu.Temperature) + gpu.MemoryUsed = twoDecimals(gpu.MemoryUsed) + gpu.MemoryTotal = twoDecimals(gpu.MemoryTotal) gpu.Usage = twoDecimals(gpu.Usage / gpu.Count) gpu.Power = twoDecimals(gpu.Power / gpu.Count) gpuData[id] = *gpu - // reset the data - gpu.Temperature = 0 - gpu.MemoryUsed = 0 - gpu.MemoryTotal = 0 - gpu.Usage = 0 - gpu.Power = 0 - gpu.Count = 0 + // reset the count + gpu.Count = 1 } return gpuData } diff --git a/beszel/site/src/components/charts/gpu-power-chart.tsx b/beszel/site/src/components/charts/gpu-power-chart.tsx new file mode 100644 index 0000000..3ef3700 --- /dev/null +++ b/beszel/site/src/components/charts/gpu-power-chart.tsx @@ -0,0 +1,112 @@ +import { CartesianGrid, Line, LineChart, YAxis } from "recharts" + +import { + ChartContainer, + ChartLegend, + ChartLegendContent, + ChartTooltip, + ChartTooltipContent, + xAxis, +} from "@/components/ui/chart" +import { + useYAxisWidth, + cn, + formatShortDate, + toFixedWithoutTrailingZeros, + decimalString, + chartMargin, +} from "@/lib/utils" +import { ChartData } from "@/types" +import { memo, useMemo } from "react" + +export default memo(function GpuPowerChart({ chartData }: { chartData: ChartData }) { + const { yAxisWidth, updateYAxisWidth } = useYAxisWidth() + + if (chartData.systemStats.length === 0) { + return null + } + + /** Format temperature data for chart and assign colors */ + const newChartData = useMemo(() => { + const newChartData = { data: [], colors: {} } as { + data: Record[] + colors: Record + } + const powerSums = {} as Record + for (let data of chartData.systemStats) { + let newData = { created: data.created } as Record + + for (let gpu of Object.values(data.stats?.g ?? {})) { + if (gpu.p) { + const name = gpu.n + newData[name] = gpu.p + powerSums[name] = (powerSums[name] ?? 0) + newData[name] + } + } + newChartData.data.push(newData) + } + const keys = Object.keys(powerSums).sort((a, b) => powerSums[b] - powerSums[a]) + for (let key of keys) { + newChartData.colors[key] = `hsl(${((keys.indexOf(key) * 360) / keys.length) % 360}, 60%, 55%)` + } + return newChartData + }, [chartData]) + + const colors = Object.keys(newChartData.colors) + + // console.log('rendered at', new Date()) + + return ( +
+ + + + { + const val = toFixedWithoutTrailingZeros(value, 2) + return updateYAxisWidth(val + "W") + }} + tickLine={false} + axisLine={false} + /> + {xAxis(chartData)} + b.value - a.value} + content={ + formatShortDate(data[0].payload.created)} + contentFormatter={(item) => decimalString(item.value) + "W"} + // indicator="line" + /> + } + /> + {colors.map((key) => ( + + ))} + {colors.length < 12 && } />} + + +
+ ) +}) diff --git a/beszel/site/src/components/routes/system.tsx b/beszel/site/src/components/routes/system.tsx index 923e291..67d10e0 100644 --- a/beszel/site/src/components/routes/system.tsx +++ b/beszel/site/src/components/routes/system.tsx @@ -24,6 +24,7 @@ const MemChart = lazy(() => import("../charts/mem-chart")) const DiskChart = lazy(() => import("../charts/disk-chart")) const SwapChart = lazy(() => import("../charts/swap-chart")) const TemperatureChart = lazy(() => import("../charts/temperature-chart")) +const GpuPowerChart = lazy(() => import("../charts/gpu-power-chart")) const cache = new Map() @@ -285,6 +286,7 @@ export default function SystemDetail({ name }: { name: string }) { // if no data, show empty message const dataEmpty = !chartLoading && chartData.systemStats.length === 0 + const hasGpuData = Object.keys(systemStats.at(-1)?.stats.g ?? {}).length > 0 return ( <> @@ -455,6 +457,7 @@ export default function SystemDetail({ name }: { name: string }) { )} + {/* Swap chart */} {(systemStats.at(-1)?.stats.su ?? 0) > 0 && ( )} + {/* Temperature chart */} {systemStats.at(-1)?.stats.t && ( )} + + {/* GPU power draw chart */} + {hasGpuData && ( + + + + )} {/* GPU charts */} - {Object.keys(systemStats.at(-1)?.stats.g ?? {}).length > 0 && ( + {hasGpuData && (
{Object.keys(systemStats.at(-1)?.stats.g ?? {}).map((id) => { const gpu = systemStats.at(-1)?.stats.g?.[id] as GPUData @@ -489,7 +505,7 @@ export default function SystemDetail({ name }: { name: string }) { empty={dataEmpty} grid={grid} title={`${gpu.n} ${t`Usage`}`} - description={t`Total utilization of ${gpu.n}`} + description={`Average utilization of ${gpu.n}`} > @@ -497,7 +513,7 @@ export default function SystemDetail({ name }: { name: string }) { empty={dataEmpty} grid={grid} title={`${gpu.n} VRAM`} - description={t`VRAM usage of ${gpu.n}`} + description={t`Precise utilization at the recorded time`} >