agent temperature fixes (#648, #663)

- Fixes a single bad sensor causing an error to be returned, which discarded the readings from the other, working sensors
- Adds the ability to set a GPU as the PRIMARY_SENSOR
This commit is contained in:
henrygd
2025-03-15 00:29:41 -04:00
parent 5837b4f25c
commit 968ca70670
2 changed files with 15 additions and 22 deletions

View File

@@ -26,6 +26,7 @@ type Agent struct {
dockerManager *dockerManager // Manages Docker API requests dockerManager *dockerManager // Manages Docker API requests
sensorsContext context.Context // Sensors context to override sys location sensorsContext context.Context // Sensors context to override sys location
sensorsWhitelist map[string]struct{} // List of sensors to monitor sensorsWhitelist map[string]struct{} // List of sensors to monitor
primarySensor string // Value of PRIMARY_SENSOR env var
systemInfo system.Info // Host system info systemInfo system.Info // Host system info
gpuManager *GPUManager // Manages GPU data gpuManager *GPUManager // Manages GPU data
cache *SessionCache // Cache for system stats based on primary session ID cache *SessionCache // Cache for system stats based on primary session ID
@@ -38,7 +39,7 @@ func NewAgent() *Agent {
cache: NewSessionCache(69 * time.Second), cache: NewSessionCache(69 * time.Second),
} }
agent.memCalc, _ = GetEnv("MEM_CALC") agent.memCalc, _ = GetEnv("MEM_CALC")
agent.primarySensor, _ = GetEnv("PRIMARY_SENSOR")
// Set up slog with a log level determined by the LOG_LEVEL env var // Set up slog with a log level determined by the LOG_LEVEL env var
if logLevelStr, exists := GetEnv("LOG_LEVEL"); exists { if logLevelStr, exists := GetEnv("LOG_LEVEL"); exists {
switch strings.ToLower(logLevelStr) { switch strings.ToLower(logLevelStr) {

View File

@@ -184,11 +184,9 @@ func (a *Agent) getSystemStats() system.Stats {
} }
} }
// temperatures (skip if sensors whitelist is set to empty string) // temperatures
err = a.updateTemperatures(&systemStats) // TODO: maybe refactor to methods on systemStats
if err != nil { a.updateTemperatures(&systemStats)
slog.Error("Error getting temperatures", "err", fmt.Sprintf("%+v", err))
}
// GPU data // GPU data
if a.gpuManager != nil { if a.gpuManager != nil {
@@ -205,6 +203,9 @@ func (a *Agent) getSystemStats() system.Stats {
for _, gpu := range gpuData { for _, gpu := range gpuData {
if gpu.Temperature > 0 { if gpu.Temperature > 0 {
systemStats.Temperatures[gpu.Name] = gpu.Temperature systemStats.Temperatures[gpu.Name] = gpu.Temperature
if a.primarySensor == gpu.Name {
a.systemInfo.DashboardTemp = gpu.Temperature
}
} }
// update high gpu percent for dashboard // update high gpu percent for dashboard
a.systemInfo.GpuPct = max(a.systemInfo.GpuPct, gpu.Usage) a.systemInfo.GpuPct = max(a.systemInfo.GpuPct, gpu.Usage)
@@ -223,29 +224,23 @@ func (a *Agent) getSystemStats() system.Stats {
return systemStats return systemStats
} }
func (a *Agent) updateTemperatures(systemStats *system.Stats) error { func (a *Agent) updateTemperatures(systemStats *system.Stats) {
// skip if sensors whitelist is set to empty string // skip if sensors whitelist is set to empty string
if a.sensorsWhitelist != nil && len(a.sensorsWhitelist) == 0 { if a.sensorsWhitelist != nil && len(a.sensorsWhitelist) == 0 {
slog.Debug("Skipping temperature collection") slog.Debug("Skipping temperature collection")
return nil return
} }
primarySensor, primarySensorIsDefined := GetEnv("PRIMARY_SENSOR")
// reset high temp // reset high temp
a.systemInfo.DashboardTemp = 0 a.systemInfo.DashboardTemp = 0
// get sensor data // get sensor data
temps, err := sensors.TemperaturesWithContext(a.sensorsContext) temps, _ := sensors.TemperaturesWithContext(a.sensorsContext)
if err != nil {
slog.Error("Error getting temperatures", "err", fmt.Sprintf("%+v", err))
return err
}
slog.Debug("Temperature", "sensors", temps) slog.Debug("Temperature", "sensors", temps)
// return if no sensors // return if no sensors
if len(temps) == 0 { if len(temps) == 0 {
return nil return
} }
systemStats.Temperatures = make(map[string]float64, len(temps)) systemStats.Temperatures = make(map[string]float64, len(temps))
@@ -266,16 +261,13 @@ func (a *Agent) updateTemperatures(systemStats *system.Stats) error {
} }
} }
// set dashboard temperature // set dashboard temperature
if primarySensorIsDefined { if a.primarySensor == "" {
if sensorName == primarySensor {
a.systemInfo.DashboardTemp = sensor.Temperature
}
} else {
a.systemInfo.DashboardTemp = max(a.systemInfo.DashboardTemp, sensor.Temperature) a.systemInfo.DashboardTemp = max(a.systemInfo.DashboardTemp, sensor.Temperature)
} else if a.primarySensor == sensorName {
a.systemInfo.DashboardTemp = sensor.Temperature
} }
systemStats.Temperatures[sensorName] = twoDecimals(sensor.Temperature) systemStats.Temperatures[sensorName] = twoDecimals(sensor.Temperature)
} }
return nil
} }
// Returns the size of the ZFS ARC memory cache in bytes // Returns the size of the ZFS ARC memory cache in bytes