package alerts import ( "encoding/json" "fmt" "strings" "time" "github.com/henrygd/beszel/internal/entities/system" "github.com/pocketbase/dbx" "github.com/pocketbase/pocketbase/core" "github.com/pocketbase/pocketbase/tools/types" "github.com/spf13/cast" ) func (am *AlertManager) HandleSystemAlerts(systemRecord *core.Record, data *system.CombinedData) error { alertRecords, err := am.hub.FindAllRecords("alerts", dbx.NewExp("system={:system} AND name!='Status'", dbx.Params{"system": systemRecord.Id}), ) if err != nil || len(alertRecords) == 0 { // log.Println("no alerts found for system") return nil } var validAlerts []SystemAlertData now := systemRecord.GetDateTime("updated").Time().UTC() oldestTime := now for _, alertRecord := range alertRecords { name := alertRecord.GetString("name") var val float64 unit := "%" switch name { case "CPU": val = data.Info.Cpu case "Memory": val = data.Info.MemPct case "Bandwidth": val = data.Info.Bandwidth unit = " MB/s" case "Disk": maxUsedPct := data.Info.DiskPct for _, fs := range data.Stats.ExtraFs { usedPct := fs.DiskUsed / fs.DiskTotal * 100 if usedPct > maxUsedPct { maxUsedPct = usedPct } } val = maxUsedPct case "Temperature": if data.Info.DashboardTemp < 1 { continue } val = data.Info.DashboardTemp unit = "°C" case "LoadAvg1": val = data.Info.LoadAvg[0] unit = "" case "LoadAvg5": val = data.Info.LoadAvg[1] unit = "" case "LoadAvg15": val = data.Info.LoadAvg[2] unit = "" } triggered := alertRecord.GetBool("triggered") threshold := alertRecord.GetFloat("value") // CONTINUE // IF alert is not triggered and curValue is less than threshold // OR alert is triggered and curValue is greater than threshold if (!triggered && val <= threshold) || (triggered && val > threshold) { // log.Printf("Skipping alert %s: val %f | threshold %f | triggered %v\n", name, val, threshold, triggered) continue } min := max(1, cast.ToUint8(alertRecord.Get("min"))) alert := SystemAlertData{ systemRecord: systemRecord, alertRecord: alertRecord, name: name, unit: unit, val: val, threshold: threshold, triggered: triggered, min: min, } // send alert immediately if min is 1 - no need to sum up values. if min == 1 { alert.triggered = val > threshold go am.sendSystemAlert(alert) continue } alert.time = now.Add(-time.Duration(min) * time.Minute) if alert.time.Before(oldestTime) { oldestTime = alert.time } validAlerts = append(validAlerts, alert) } systemStats := []struct { Stats []byte `db:"stats"` Created types.DateTime `db:"created"` }{} err = am.hub.DB(). Select("stats", "created"). From("system_stats"). Where(dbx.NewExp( "system={:system} AND type='1m' AND created > {:created}", dbx.Params{ "system": systemRecord.Id, // subtract some time to give us a bit of buffer "created": oldestTime.Add(-time.Second * 90), }, )). OrderBy("created"). All(&systemStats) if err != nil || len(systemStats) == 0 { return err } // get oldest record creation time from first record in the slice oldestRecordTime := systemStats[0].Created.Time() // log.Println("oldestRecordTime", oldestRecordTime.String()) // Filter validAlerts to keep only those with time newer than oldestRecord filteredAlerts := make([]SystemAlertData, 0, len(validAlerts)) for _, alert := range validAlerts { if alert.time.After(oldestRecordTime) { filteredAlerts = append(filteredAlerts, alert) } } validAlerts = filteredAlerts if len(validAlerts) == 0 { // log.Println("no valid alerts found") return nil } var stats SystemAlertStats // we can skip the latest systemStats record since it's the current value for i := range systemStats { stat := systemStats[i] // subtract 10 seconds to give a small time buffer systemStatsCreation := stat.Created.Time().Add(-time.Second * 10) if err := json.Unmarshal(stat.Stats, &stats); err != nil { return err } // log.Println("stats", stats) for j := range validAlerts { alert := &validAlerts[j] // reset alert val on first iteration if i == 0 { alert.val = 0 } // continue if system_stats is older than alert time range if systemStatsCreation.Before(alert.time) { continue } // add to alert value switch alert.name { case "CPU": alert.val += stats.Cpu case "Memory": alert.val += stats.Mem case "Bandwidth": alert.val += stats.NetSent + stats.NetRecv case "Disk": if alert.mapSums == nil { alert.mapSums = make(map[string]float32, len(data.Stats.ExtraFs)+1) } // add root disk if _, ok := alert.mapSums["root"]; !ok { alert.mapSums["root"] = 0.0 } alert.mapSums["root"] += float32(stats.Disk) // add extra disks for key, fs := range data.Stats.ExtraFs { if _, ok := alert.mapSums[key]; !ok { alert.mapSums[key] = 0.0 } alert.mapSums[key] += float32(fs.DiskUsed / fs.DiskTotal * 100) } case "Temperature": if alert.mapSums == nil { alert.mapSums = make(map[string]float32, len(stats.Temperatures)) } for key, temp := range stats.Temperatures { if _, ok := alert.mapSums[key]; !ok { alert.mapSums[key] = float32(0) } alert.mapSums[key] += temp } case "LoadAvg1": alert.val += stats.LoadAvg[0] case "LoadAvg5": alert.val += stats.LoadAvg[1] case "LoadAvg15": alert.val += stats.LoadAvg[2] default: continue } alert.count++ } } // sum up vals for each alert for _, alert := range validAlerts { switch alert.name { case "Disk": maxPct := float32(0) for key, value := range alert.mapSums { sumPct := float32(value) if sumPct > maxPct { maxPct = sumPct alert.descriptor = fmt.Sprintf("Usage of %s", key) } } alert.val = float64(maxPct / float32(alert.count)) case "Temperature": maxTemp := float32(0) for key, value := range alert.mapSums { sumTemp := float32(value) / float32(alert.count) if sumTemp > maxTemp { maxTemp = sumTemp alert.descriptor = fmt.Sprintf("Highest sensor %s", key) } } alert.val = float64(maxTemp) default: alert.val = alert.val / float64(alert.count) } minCount := float32(alert.min) / 1.2 // log.Println("alert", alert.name, "val", alert.val, "threshold", alert.threshold, "triggered", alert.triggered) // log.Printf("%s: val %f | count %d | min-count %f | threshold %f\n", alert.name, alert.val, alert.count, minCount, alert.threshold) // pass through alert if count is greater than or equal to minCount if float32(alert.count) >= minCount { if !alert.triggered && alert.val > alert.threshold { alert.triggered = true go am.sendSystemAlert(alert) } else if alert.triggered && alert.val <= alert.threshold { alert.triggered = false go am.sendSystemAlert(alert) } } } return nil } func (am *AlertManager) sendSystemAlert(alert SystemAlertData) { // log.Printf("Sending alert %s: val %f | count %d | threshold %f\n", alert.name, alert.val, alert.count, alert.threshold) systemName := alert.systemRecord.GetString("name") // change Disk to Disk usage if alert.name == "Disk" { alert.name += " usage" } // format LoadAvg5 and LoadAvg15 if after, ok := strings.CutPrefix(alert.name, "LoadAvg"); ok { alert.name = after + "m Load" } // make title alert name lowercase if not CPU titleAlertName := alert.name if titleAlertName != "CPU" { titleAlertName = strings.ToLower(titleAlertName) } var subject string if alert.triggered { subject = fmt.Sprintf("%s %s above threshold", systemName, titleAlertName) } else { subject = fmt.Sprintf("%s %s below threshold", systemName, titleAlertName) } minutesLabel := "minute" if alert.min > 1 { minutesLabel += "s" } if alert.descriptor == "" { alert.descriptor = alert.name } body := fmt.Sprintf("%s averaged %.2f%s for the previous %v %s.", alert.descriptor, alert.val, alert.unit, alert.min, minutesLabel) alert.alertRecord.Set("triggered", alert.triggered) if err := am.hub.Save(alert.alertRecord); err != nil { // app.Logger().Error("failed to save alert record", "err", err) return } am.SendAlert(AlertMessageData{ UserID: alert.alertRecord.GetString("user"), Title: subject, Message: body, Link: am.hub.MakeLink("system", systemName), LinkText: "View " + systemName, }) }