agent temperature fixes (#648, #663)

- Fixes a single bad sensor causing an error to be returned instead of the readings from the other, working sensors
- Adds ability to set GPU as PRIMARY_SENSOR
This commit is contained in:
henrygd
2025-03-15 00:29:41 -04:00
parent 5837b4f25c
commit 968ca70670
2 changed files with 15 additions and 22 deletions

View File

@@ -26,6 +26,7 @@ type Agent struct {
dockerManager *dockerManager // Manages Docker API requests
sensorsContext context.Context // Sensors context to override sys location
sensorsWhitelist map[string]struct{} // List of sensors to monitor
primarySensor string // Value of PRIMARY_SENSOR env var
systemInfo system.Info // Host system info
gpuManager *GPUManager // Manages GPU data
cache *SessionCache // Cache for system stats based on primary session ID
@@ -38,7 +39,7 @@ func NewAgent() *Agent {
cache: NewSessionCache(69 * time.Second),
}
agent.memCalc, _ = GetEnv("MEM_CALC")
agent.primarySensor, _ = GetEnv("PRIMARY_SENSOR")
// Set up slog with a log level determined by the LOG_LEVEL env var
if logLevelStr, exists := GetEnv("LOG_LEVEL"); exists {
switch strings.ToLower(logLevelStr) {

View File

@@ -184,11 +184,9 @@ func (a *Agent) getSystemStats() system.Stats {
}
}
// temperatures (skip if sensors whitelist is set to empty string)
err = a.updateTemperatures(&systemStats)
if err != nil {
slog.Error("Error getting temperatures", "err", fmt.Sprintf("%+v", err))
}
// temperatures
// TODO: maybe refactor to methods on systemStats
a.updateTemperatures(&systemStats)
// GPU data
if a.gpuManager != nil {
@@ -205,6 +203,9 @@ func (a *Agent) getSystemStats() system.Stats {
for _, gpu := range gpuData {
if gpu.Temperature > 0 {
systemStats.Temperatures[gpu.Name] = gpu.Temperature
if a.primarySensor == gpu.Name {
a.systemInfo.DashboardTemp = gpu.Temperature
}
}
// update high gpu percent for dashboard
a.systemInfo.GpuPct = max(a.systemInfo.GpuPct, gpu.Usage)
@@ -223,29 +224,23 @@ func (a *Agent) getSystemStats() system.Stats {
return systemStats
}
func (a *Agent) updateTemperatures(systemStats *system.Stats) error {
func (a *Agent) updateTemperatures(systemStats *system.Stats) {
// skip if sensors whitelist is set to empty string
if a.sensorsWhitelist != nil && len(a.sensorsWhitelist) == 0 {
slog.Debug("Skipping temperature collection")
return nil
return
}
primarySensor, primarySensorIsDefined := GetEnv("PRIMARY_SENSOR")
// reset high temp
a.systemInfo.DashboardTemp = 0
// get sensor data
temps, err := sensors.TemperaturesWithContext(a.sensorsContext)
if err != nil {
slog.Error("Error getting temperatures", "err", fmt.Sprintf("%+v", err))
return err
}
temps, _ := sensors.TemperaturesWithContext(a.sensorsContext)
slog.Debug("Temperature", "sensors", temps)
// return if no sensors
if len(temps) == 0 {
return nil
return
}
systemStats.Temperatures = make(map[string]float64, len(temps))
@@ -266,16 +261,13 @@ func (a *Agent) updateTemperatures(systemStats *system.Stats) error {
}
}
// set dashboard temperature
if primarySensorIsDefined {
if sensorName == primarySensor {
a.systemInfo.DashboardTemp = sensor.Temperature
}
} else {
if a.primarySensor == "" {
a.systemInfo.DashboardTemp = max(a.systemInfo.DashboardTemp, sensor.Temperature)
} else if a.primarySensor == sensorName {
a.systemInfo.DashboardTemp = sensor.Temperature
}
systemStats.Temperatures[sensorName] = twoDecimals(sensor.Temperature)
}
return nil
}
// Returns the size of the ZFS ARC memory cache in bytes