mirror of
https://github.com/fankes/beszel.git
synced 2025-10-19 01:39:34 +08:00
rename /src to /internal (sorry i'll fix the prs)
This commit is contained in:
304
internal/alerts/alerts_system.go
Normal file
304
internal/alerts/alerts_system.go
Normal file
@@ -0,0 +1,304 @@
|
||||
package alerts
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/henrygd/beszel/src/entities/system"
|
||||
|
||||
"github.com/pocketbase/dbx"
|
||||
"github.com/pocketbase/pocketbase/core"
|
||||
"github.com/pocketbase/pocketbase/tools/types"
|
||||
"github.com/spf13/cast"
|
||||
)
|
||||
|
||||
func (am *AlertManager) HandleSystemAlerts(systemRecord *core.Record, data *system.CombinedData) error {
|
||||
alertRecords, err := am.hub.FindAllRecords("alerts",
|
||||
dbx.NewExp("system={:system} AND name!='Status'", dbx.Params{"system": systemRecord.Id}),
|
||||
)
|
||||
if err != nil || len(alertRecords) == 0 {
|
||||
// log.Println("no alerts found for system")
|
||||
return nil
|
||||
}
|
||||
|
||||
var validAlerts []SystemAlertData
|
||||
now := systemRecord.GetDateTime("updated").Time().UTC()
|
||||
oldestTime := now
|
||||
|
||||
for _, alertRecord := range alertRecords {
|
||||
name := alertRecord.GetString("name")
|
||||
var val float64
|
||||
unit := "%"
|
||||
|
||||
switch name {
|
||||
case "CPU":
|
||||
val = data.Info.Cpu
|
||||
case "Memory":
|
||||
val = data.Info.MemPct
|
||||
case "Bandwidth":
|
||||
val = data.Info.Bandwidth
|
||||
unit = " MB/s"
|
||||
case "Disk":
|
||||
maxUsedPct := data.Info.DiskPct
|
||||
for _, fs := range data.Stats.ExtraFs {
|
||||
usedPct := fs.DiskUsed / fs.DiskTotal * 100
|
||||
if usedPct > maxUsedPct {
|
||||
maxUsedPct = usedPct
|
||||
}
|
||||
}
|
||||
val = maxUsedPct
|
||||
case "Temperature":
|
||||
if data.Info.DashboardTemp < 1 {
|
||||
continue
|
||||
}
|
||||
val = data.Info.DashboardTemp
|
||||
unit = "°C"
|
||||
case "LoadAvg1":
|
||||
val = data.Info.LoadAvg[0]
|
||||
unit = ""
|
||||
case "LoadAvg5":
|
||||
val = data.Info.LoadAvg[1]
|
||||
unit = ""
|
||||
case "LoadAvg15":
|
||||
val = data.Info.LoadAvg[2]
|
||||
unit = ""
|
||||
}
|
||||
|
||||
triggered := alertRecord.GetBool("triggered")
|
||||
threshold := alertRecord.GetFloat("value")
|
||||
|
||||
// CONTINUE
|
||||
// IF alert is not triggered and curValue is less than threshold
|
||||
// OR alert is triggered and curValue is greater than threshold
|
||||
if (!triggered && val <= threshold) || (triggered && val > threshold) {
|
||||
// log.Printf("Skipping alert %s: val %f | threshold %f | triggered %v\n", name, val, threshold, triggered)
|
||||
continue
|
||||
}
|
||||
|
||||
min := max(1, cast.ToUint8(alertRecord.Get("min")))
|
||||
|
||||
alert := SystemAlertData{
|
||||
systemRecord: systemRecord,
|
||||
alertRecord: alertRecord,
|
||||
name: name,
|
||||
unit: unit,
|
||||
val: val,
|
||||
threshold: threshold,
|
||||
triggered: triggered,
|
||||
min: min,
|
||||
}
|
||||
|
||||
// send alert immediately if min is 1 - no need to sum up values.
|
||||
if min == 1 {
|
||||
alert.triggered = val > threshold
|
||||
go am.sendSystemAlert(alert)
|
||||
continue
|
||||
}
|
||||
|
||||
alert.time = now.Add(-time.Duration(min) * time.Minute)
|
||||
if alert.time.Before(oldestTime) {
|
||||
oldestTime = alert.time
|
||||
}
|
||||
|
||||
validAlerts = append(validAlerts, alert)
|
||||
}
|
||||
|
||||
systemStats := []struct {
|
||||
Stats []byte `db:"stats"`
|
||||
Created types.DateTime `db:"created"`
|
||||
}{}
|
||||
|
||||
err = am.hub.DB().
|
||||
Select("stats", "created").
|
||||
From("system_stats").
|
||||
Where(dbx.NewExp(
|
||||
"system={:system} AND type='1m' AND created > {:created}",
|
||||
dbx.Params{
|
||||
"system": systemRecord.Id,
|
||||
// subtract some time to give us a bit of buffer
|
||||
"created": oldestTime.Add(-time.Second * 90),
|
||||
},
|
||||
)).
|
||||
OrderBy("created").
|
||||
All(&systemStats)
|
||||
if err != nil || len(systemStats) == 0 {
|
||||
return err
|
||||
}
|
||||
|
||||
// get oldest record creation time from first record in the slice
|
||||
oldestRecordTime := systemStats[0].Created.Time()
|
||||
// log.Println("oldestRecordTime", oldestRecordTime.String())
|
||||
|
||||
// Filter validAlerts to keep only those with time newer than oldestRecord
|
||||
filteredAlerts := make([]SystemAlertData, 0, len(validAlerts))
|
||||
for _, alert := range validAlerts {
|
||||
if alert.time.After(oldestRecordTime) {
|
||||
filteredAlerts = append(filteredAlerts, alert)
|
||||
}
|
||||
}
|
||||
validAlerts = filteredAlerts
|
||||
|
||||
if len(validAlerts) == 0 {
|
||||
// log.Println("no valid alerts found")
|
||||
return nil
|
||||
}
|
||||
|
||||
var stats SystemAlertStats
|
||||
|
||||
// we can skip the latest systemStats record since it's the current value
|
||||
for i := range systemStats {
|
||||
stat := systemStats[i]
|
||||
// subtract 10 seconds to give a small time buffer
|
||||
systemStatsCreation := stat.Created.Time().Add(-time.Second * 10)
|
||||
if err := json.Unmarshal(stat.Stats, &stats); err != nil {
|
||||
return err
|
||||
}
|
||||
// log.Println("stats", stats)
|
||||
for j := range validAlerts {
|
||||
alert := &validAlerts[j]
|
||||
// reset alert val on first iteration
|
||||
if i == 0 {
|
||||
alert.val = 0
|
||||
}
|
||||
// continue if system_stats is older than alert time range
|
||||
if systemStatsCreation.Before(alert.time) {
|
||||
continue
|
||||
}
|
||||
// add to alert value
|
||||
switch alert.name {
|
||||
case "CPU":
|
||||
alert.val += stats.Cpu
|
||||
case "Memory":
|
||||
alert.val += stats.Mem
|
||||
case "Bandwidth":
|
||||
alert.val += stats.NetSent + stats.NetRecv
|
||||
case "Disk":
|
||||
if alert.mapSums == nil {
|
||||
alert.mapSums = make(map[string]float32, len(data.Stats.ExtraFs)+1)
|
||||
}
|
||||
// add root disk
|
||||
if _, ok := alert.mapSums["root"]; !ok {
|
||||
alert.mapSums["root"] = 0.0
|
||||
}
|
||||
alert.mapSums["root"] += float32(stats.Disk)
|
||||
// add extra disks
|
||||
for key, fs := range data.Stats.ExtraFs {
|
||||
if _, ok := alert.mapSums[key]; !ok {
|
||||
alert.mapSums[key] = 0.0
|
||||
}
|
||||
alert.mapSums[key] += float32(fs.DiskUsed / fs.DiskTotal * 100)
|
||||
}
|
||||
case "Temperature":
|
||||
if alert.mapSums == nil {
|
||||
alert.mapSums = make(map[string]float32, len(stats.Temperatures))
|
||||
}
|
||||
for key, temp := range stats.Temperatures {
|
||||
if _, ok := alert.mapSums[key]; !ok {
|
||||
alert.mapSums[key] = float32(0)
|
||||
}
|
||||
alert.mapSums[key] += temp
|
||||
}
|
||||
case "LoadAvg1":
|
||||
alert.val += stats.LoadAvg[0]
|
||||
case "LoadAvg5":
|
||||
alert.val += stats.LoadAvg[1]
|
||||
case "LoadAvg15":
|
||||
alert.val += stats.LoadAvg[2]
|
||||
default:
|
||||
continue
|
||||
}
|
||||
alert.count++
|
||||
}
|
||||
}
|
||||
// sum up vals for each alert
|
||||
for _, alert := range validAlerts {
|
||||
switch alert.name {
|
||||
case "Disk":
|
||||
maxPct := float32(0)
|
||||
for key, value := range alert.mapSums {
|
||||
sumPct := float32(value)
|
||||
if sumPct > maxPct {
|
||||
maxPct = sumPct
|
||||
alert.descriptor = fmt.Sprintf("Usage of %s", key)
|
||||
}
|
||||
}
|
||||
alert.val = float64(maxPct / float32(alert.count))
|
||||
case "Temperature":
|
||||
maxTemp := float32(0)
|
||||
for key, value := range alert.mapSums {
|
||||
sumTemp := float32(value) / float32(alert.count)
|
||||
if sumTemp > maxTemp {
|
||||
maxTemp = sumTemp
|
||||
alert.descriptor = fmt.Sprintf("Highest sensor %s", key)
|
||||
}
|
||||
}
|
||||
alert.val = float64(maxTemp)
|
||||
default:
|
||||
alert.val = alert.val / float64(alert.count)
|
||||
}
|
||||
minCount := float32(alert.min) / 1.2
|
||||
// log.Println("alert", alert.name, "val", alert.val, "threshold", alert.threshold, "triggered", alert.triggered)
|
||||
// log.Printf("%s: val %f | count %d | min-count %f | threshold %f\n", alert.name, alert.val, alert.count, minCount, alert.threshold)
|
||||
// pass through alert if count is greater than or equal to minCount
|
||||
if float32(alert.count) >= minCount {
|
||||
if !alert.triggered && alert.val > alert.threshold {
|
||||
alert.triggered = true
|
||||
go am.sendSystemAlert(alert)
|
||||
} else if alert.triggered && alert.val <= alert.threshold {
|
||||
alert.triggered = false
|
||||
go am.sendSystemAlert(alert)
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (am *AlertManager) sendSystemAlert(alert SystemAlertData) {
|
||||
// log.Printf("Sending alert %s: val %f | count %d | threshold %f\n", alert.name, alert.val, alert.count, alert.threshold)
|
||||
systemName := alert.systemRecord.GetString("name")
|
||||
|
||||
// change Disk to Disk usage
|
||||
if alert.name == "Disk" {
|
||||
alert.name += " usage"
|
||||
}
|
||||
// format LoadAvg5 and LoadAvg15
|
||||
if after, ok := strings.CutPrefix(alert.name, "LoadAvg"); ok {
|
||||
alert.name = after + "m Load"
|
||||
}
|
||||
|
||||
// make title alert name lowercase if not CPU
|
||||
titleAlertName := alert.name
|
||||
if titleAlertName != "CPU" {
|
||||
titleAlertName = strings.ToLower(titleAlertName)
|
||||
}
|
||||
|
||||
var subject string
|
||||
if alert.triggered {
|
||||
subject = fmt.Sprintf("%s %s above threshold", systemName, titleAlertName)
|
||||
} else {
|
||||
subject = fmt.Sprintf("%s %s below threshold", systemName, titleAlertName)
|
||||
}
|
||||
minutesLabel := "minute"
|
||||
if alert.min > 1 {
|
||||
minutesLabel += "s"
|
||||
}
|
||||
if alert.descriptor == "" {
|
||||
alert.descriptor = alert.name
|
||||
}
|
||||
body := fmt.Sprintf("%s averaged %.2f%s for the previous %v %s.", alert.descriptor, alert.val, alert.unit, alert.min, minutesLabel)
|
||||
|
||||
alert.alertRecord.Set("triggered", alert.triggered)
|
||||
if err := am.hub.Save(alert.alertRecord); err != nil {
|
||||
// app.Logger().Error("failed to save alert record", "err", err)
|
||||
return
|
||||
}
|
||||
am.SendAlert(AlertMessageData{
|
||||
UserID: alert.alertRecord.GetString("user"),
|
||||
Title: subject,
|
||||
Message: body,
|
||||
Link: am.hub.MakeLink("system", systemName),
|
||||
LinkText: "View " + systemName,
|
||||
})
|
||||
}
|
Reference in New Issue
Block a user