From 6b6d3fabc071a9f1c866cc6afde1a742c5200b7f Mon Sep 17 00:00:00 2001 From: Henry Dollman Date: Wed, 16 Oct 2024 17:21:05 -0400 Subject: [PATCH] change disk alert to monitor usage of any disk, not only root --- beszel/internal/alerts/alerts.go | 89 +++++++++++++++++++++----------- beszel/internal/hub/hub.go | 4 +- beszel/site/src/lib/utils.ts | 4 +- 3 files changed, 65 insertions(+), 32 deletions(-) diff --git a/beszel/internal/alerts/alerts.go b/beszel/internal/alerts/alerts.go index 6f34fd9..e872479 100644 --- a/beszel/internal/alerts/alerts.go +++ b/beszel/internal/alerts/alerts.go @@ -4,7 +4,6 @@ package alerts import ( "beszel/internal/entities/system" "fmt" - "log" "net/mail" "net/url" "time" @@ -58,7 +57,7 @@ type SystemAlertData struct { time time.Time count uint8 min uint8 - tempSums map[string]float32 + mapSums map[string]float32 descriptor string // override descriptor in notification body (for temp sensor, disk partition, etc) } @@ -68,18 +67,17 @@ func NewAlertManager(app *pocketbase.PocketBase) *AlertManager { } } -func (am *AlertManager) HandleSystemAlerts(systemRecord *models.Record, systemInfo system.Info, temperatures map[string]float64) { +func (am *AlertManager) HandleSystemAlerts(systemRecord *models.Record, systemInfo system.Info, temperatures map[string]float64, extraFs map[string]*system.FsStats) error { // start := time.Now() // defer func() { // log.Println("alert stats took", time.Since(start)) // }() - alertRecords, err := am.app.Dao().FindRecordsByExpr("alerts", - dbx.NewExp("system={:system}", dbx.Params{"system": systemRecord.GetId()}), + dbx.NewExp("system={:system}", dbx.Params{"system": systemRecord.Id}), ) if err != nil || len(alertRecords) == 0 { // log.Println("no alerts found for system") - return + return nil } var validAlerts []SystemAlertData @@ -96,11 +94,18 @@ func (am *AlertManager) HandleSystemAlerts(systemRecord *models.Record, systemIn val = systemInfo.Cpu case "Memory": val = systemInfo.MemPct - case "Disk": - val = systemInfo.DiskPct case "Bandwidth": val = systemInfo.Bandwidth - unit = "MB/s" + unit = " MB/s" + case "Disk": + maxUsedPct := systemInfo.DiskPct + for _, fs := range extraFs { + usedPct := fs.DiskUsed / fs.DiskTotal * 100 + if usedPct > maxUsedPct { + maxUsedPct = usedPct + } + } + val = maxUsedPct case "Temperature": if temperatures == nil { continue @@ -126,7 +131,7 @@ func (am *AlertManager) HandleSystemAlerts(systemRecord *models.Record, systemIn min := max(1, cast.ToUint8(alertRecord.Get("min"))) // add time to alert time to make sure it's slighty after record creation - time := now.Add(-time.Duration(min)*time.Minute + time.Second*5) + time := now.Add(-time.Duration(min) * time.Minute) if time.Before(oldestTime) { oldestTime = time } @@ -164,7 +169,7 @@ func (am *AlertManager) HandleSystemAlerts(systemRecord *models.Record, systemIn All(&systemStats) if err != nil { - return + return err } // get oldest record creation time from first record in the slice @@ -181,7 +186,7 @@ func (am *AlertManager) HandleSystemAlerts(systemRecord *models.Record, systemIn if len(validAlerts) == 0 { // log.Println("no valid alerts found") - return + return nil } var stats SystemAlertStats @@ -189,9 +194,11 @@ func (am *AlertManager) HandleSystemAlerts(systemRecord *models.Record, systemIn // we can skip the latest systemStats record since it's the current value for i := 0; i < len(systemStats); i++ { stat := systemStats[i] - // log.Println("created", stat.Created.Time(), "now", time.Now().UTC()) - statTime := stat.Created.Time().Add(time.Second) - json.Unmarshal(stat.Stats, &stats) + // subtract 10 seconds to give a small time buffer + systemStatsCreation := stat.Created.Time().Add(-time.Second * 10) + if err := json.Unmarshal(stat.Stats, &stats); err != nil { + return err + } // log.Println("stats", stats) for j := range validAlerts { alert := &validAlerts[j] @@ -199,8 +206,8 @@ func (am *AlertManager) HandleSystemAlerts(systemRecord *models.Record, systemIn if i == 0 { alert.val = 0 } - // continue if stat is older than alert time range - if statTime.Before(alert.time) { + // continue if system_stats is older than alert time range + if systemStatsCreation.Before(alert.time) { continue } // add to alert value @@ -212,17 +219,30 @@ func (am *AlertManager) HandleSystemAlerts(systemRecord *models.Record, systemIn case "Bandwidth": alert.val += stats.NetSent + stats.NetRecv case "Disk": - // todo: check all disks instead of just root - alert.val += stats.Disk - case "Temperature": - if alert.tempSums == nil { - alert.tempSums = make(map[string]float32, len(stats.Temperatures)) + if alert.mapSums == nil { + alert.mapSums = make(map[string]float32, len(extraFs)+1) } - for key, value := range stats.Temperatures { - if _, ok := alert.tempSums[key]; !ok { - alert.tempSums[key] = float32(0) + // add root disk + if _, ok := alert.mapSums["root"]; !ok { + alert.mapSums["root"] = 0.0 + } + alert.mapSums["root"] += float32(stats.Disk) + // add extra disks + for key, fs := range extraFs { + if _, ok := alert.mapSums[key]; !ok { + alert.mapSums[key] = 0.0 } - alert.tempSums[key] += value + alert.mapSums[key] += float32(fs.DiskUsed / fs.DiskTotal * 100) + } + case "Temperature": + if alert.mapSums == nil { + alert.mapSums = make(map[string]float32, len(stats.Temperatures)) + } + for key, temp := range stats.Temperatures { + if _, ok := alert.mapSums[key]; !ok { + alert.mapSums[key] = float32(0) + } + alert.mapSums[key] += temp } default: continue @@ -233,13 +253,23 @@ func (am *AlertManager) HandleSystemAlerts(systemRecord *models.Record, systemIn // sum up vals for each alert for _, alert := range validAlerts { switch alert.name { + case "Disk": + maxPct := float32(0) + for key, value := range alert.mapSums { + sumPct := float32(value) + if sumPct > maxPct { + maxPct = sumPct + alert.descriptor = fmt.Sprintf("Usage of %s", key) + } + } + alert.val = float64(maxPct / float32(alert.count)) case "Temperature": maxTemp := float32(0) - for key, value := range alert.tempSums { + for key, value := range alert.mapSums { sumTemp := float32(value) / float32(alert.count) if sumTemp > maxTemp { maxTemp = sumTemp - alert.descriptor = fmt.Sprintf("Hottest sensor %s", key) + alert.descriptor = fmt.Sprintf("Highest sensor %s", key) } } alert.val = float64(maxTemp) @@ -260,10 +290,11 @@ func (am *AlertManager) HandleSystemAlerts(systemRecord *models.Record, systemIn } } } + return nil } func (am *AlertManager) sendSystemAlert(alert SystemAlertData) { - log.Printf("Sending alert %s: val %f | count %d | threshold %f\n", alert.name, alert.val, alert.count, alert.threshold) + // log.Printf("Sending alert %s: val %f | count %d | threshold %f\n", alert.name, alert.val, alert.count, alert.threshold) systemName := alert.systemRecord.GetString("name") diff --git a/beszel/internal/hub/hub.go b/beszel/internal/hub/hub.go index fa47159..4871f59 100644 --- a/beszel/internal/hub/hub.go +++ b/beszel/internal/hub/hub.go @@ -315,7 +315,9 @@ func (h *Hub) updateSystem(record *models.Record) { } } // system info alerts (todo: extra fs alerts) - h.am.HandleSystemAlerts(record, systemData.Info, systemData.Stats.Temperatures) + if err := h.am.HandleSystemAlerts(record, systemData.Info, systemData.Stats.Temperatures, systemData.Stats.ExtraFs); err != nil { + h.app.Logger().Error("System alerts error", "err", err.Error()) + } } // set system to specified status and save record diff --git a/beszel/site/src/lib/utils.ts b/beszel/site/src/lib/utils.ts index 73f9989..3f00c44 100644 --- a/beszel/site/src/lib/utils.ts +++ b/beszel/site/src/lib/utils.ts @@ -54,7 +54,7 @@ export const updateSystemList = async () => { export const updateAlerts = () => { pb.collection('alerts') - .getFullList({ fields: 'id,name,system,value,min,triggered' }) + .getFullList({ fields: 'id,name,system,value,min,triggered', sort: 'updated' }) .then((records) => { $alerts.set(records) }) @@ -315,7 +315,7 @@ export const alertInfo = { name: 'Disk usage', unit: '%', icon: HardDriveIcon, - desc: 'Triggers when root usage exceeds a threshold.', + desc: 'Triggers when usage of any disk exceeds a threshold.', }, Bandwidth: { name: 'Bandwidth',