change disk alert to monitor usage of any disk, not only root

This commit is contained in:
Henry Dollman
2024-10-16 17:21:05 -04:00
parent 59d541dd1d
commit 6b6d3fabc0
3 changed files with 65 additions and 32 deletions

View File

@@ -4,7 +4,6 @@ package alerts
import ( import (
"beszel/internal/entities/system" "beszel/internal/entities/system"
"fmt" "fmt"
"log"
"net/mail" "net/mail"
"net/url" "net/url"
"time" "time"
@@ -58,7 +57,7 @@ type SystemAlertData struct {
time time.Time time time.Time
count uint8 count uint8
min uint8 min uint8
tempSums map[string]float32 mapSums map[string]float32
descriptor string // override descriptor in notification body (for temp sensor, disk partition, etc) descriptor string // override descriptor in notification body (for temp sensor, disk partition, etc)
} }
@@ -68,18 +67,17 @@ func NewAlertManager(app *pocketbase.PocketBase) *AlertManager {
} }
} }
func (am *AlertManager) HandleSystemAlerts(systemRecord *models.Record, systemInfo system.Info, temperatures map[string]float64) { func (am *AlertManager) HandleSystemAlerts(systemRecord *models.Record, systemInfo system.Info, temperatures map[string]float64, extraFs map[string]*system.FsStats) error {
// start := time.Now() // start := time.Now()
// defer func() { // defer func() {
// log.Println("alert stats took", time.Since(start)) // log.Println("alert stats took", time.Since(start))
// }() // }()
alertRecords, err := am.app.Dao().FindRecordsByExpr("alerts", alertRecords, err := am.app.Dao().FindRecordsByExpr("alerts",
dbx.NewExp("system={:system}", dbx.Params{"system": systemRecord.GetId()}), dbx.NewExp("system={:system}", dbx.Params{"system": systemRecord.Id}),
) )
if err != nil || len(alertRecords) == 0 { if err != nil || len(alertRecords) == 0 {
// log.Println("no alerts found for system") // log.Println("no alerts found for system")
return return nil
} }
var validAlerts []SystemAlertData var validAlerts []SystemAlertData
@@ -96,11 +94,18 @@ func (am *AlertManager) HandleSystemAlerts(systemRecord *models.Record, systemIn
val = systemInfo.Cpu val = systemInfo.Cpu
case "Memory": case "Memory":
val = systemInfo.MemPct val = systemInfo.MemPct
case "Disk":
val = systemInfo.DiskPct
case "Bandwidth": case "Bandwidth":
val = systemInfo.Bandwidth val = systemInfo.Bandwidth
unit = "MB/s" unit = " MB/s"
case "Disk":
maxUsedPct := systemInfo.DiskPct
for _, fs := range extraFs {
usedPct := fs.DiskUsed / fs.DiskTotal * 100
if usedPct > maxUsedPct {
maxUsedPct = usedPct
}
}
val = maxUsedPct
case "Temperature": case "Temperature":
if temperatures == nil { if temperatures == nil {
continue continue
@@ -126,7 +131,7 @@ func (am *AlertManager) HandleSystemAlerts(systemRecord *models.Record, systemIn
min := max(1, cast.ToUint8(alertRecord.Get("min"))) min := max(1, cast.ToUint8(alertRecord.Get("min")))
// add time to alert time to make sure it's slightly after record creation // add time to alert time to make sure it's slightly after record creation
time := now.Add(-time.Duration(min)*time.Minute + time.Second*5) time := now.Add(-time.Duration(min) * time.Minute)
if time.Before(oldestTime) { if time.Before(oldestTime) {
oldestTime = time oldestTime = time
} }
@@ -164,7 +169,7 @@ func (am *AlertManager) HandleSystemAlerts(systemRecord *models.Record, systemIn
All(&systemStats) All(&systemStats)
if err != nil { if err != nil {
return return err
} }
// get oldest record creation time from first record in the slice // get oldest record creation time from first record in the slice
@@ -181,7 +186,7 @@ func (am *AlertManager) HandleSystemAlerts(systemRecord *models.Record, systemIn
if len(validAlerts) == 0 { if len(validAlerts) == 0 {
// log.Println("no valid alerts found") // log.Println("no valid alerts found")
return return nil
} }
var stats SystemAlertStats var stats SystemAlertStats
@@ -189,9 +194,11 @@ func (am *AlertManager) HandleSystemAlerts(systemRecord *models.Record, systemIn
// we can skip the latest systemStats record since it's the current value // we can skip the latest systemStats record since it's the current value
for i := 0; i < len(systemStats); i++ { for i := 0; i < len(systemStats); i++ {
stat := systemStats[i] stat := systemStats[i]
// log.Println("created", stat.Created.Time(), "now", time.Now().UTC()) // subtract 10 seconds to give a small time buffer
statTime := stat.Created.Time().Add(time.Second) systemStatsCreation := stat.Created.Time().Add(-time.Second * 10)
json.Unmarshal(stat.Stats, &stats) if err := json.Unmarshal(stat.Stats, &stats); err != nil {
return err
}
// log.Println("stats", stats) // log.Println("stats", stats)
for j := range validAlerts { for j := range validAlerts {
alert := &validAlerts[j] alert := &validAlerts[j]
@@ -199,8 +206,8 @@ func (am *AlertManager) HandleSystemAlerts(systemRecord *models.Record, systemIn
if i == 0 { if i == 0 {
alert.val = 0 alert.val = 0
} }
// continue if stat is older than alert time range // continue if system_stats is older than alert time range
if statTime.Before(alert.time) { if systemStatsCreation.Before(alert.time) {
continue continue
} }
// add to alert value // add to alert value
@@ -212,17 +219,30 @@ func (am *AlertManager) HandleSystemAlerts(systemRecord *models.Record, systemIn
case "Bandwidth": case "Bandwidth":
alert.val += stats.NetSent + stats.NetRecv alert.val += stats.NetSent + stats.NetRecv
case "Disk": case "Disk":
// todo: check all disks instead of just root if alert.mapSums == nil {
alert.val += stats.Disk alert.mapSums = make(map[string]float32, len(extraFs)+1)
case "Temperature":
if alert.tempSums == nil {
alert.tempSums = make(map[string]float32, len(stats.Temperatures))
} }
for key, value := range stats.Temperatures { // add root disk
if _, ok := alert.tempSums[key]; !ok { if _, ok := alert.mapSums["root"]; !ok {
alert.tempSums[key] = float32(0) alert.mapSums["root"] = 0.0
}
alert.mapSums["root"] += float32(stats.Disk)
// add extra disks
for key, fs := range extraFs {
if _, ok := alert.mapSums[key]; !ok {
alert.mapSums[key] = 0.0
} }
alert.tempSums[key] += value alert.mapSums[key] += float32(fs.DiskUsed / fs.DiskTotal * 100)
}
case "Temperature":
if alert.mapSums == nil {
alert.mapSums = make(map[string]float32, len(stats.Temperatures))
}
for key, temp := range stats.Temperatures {
if _, ok := alert.mapSums[key]; !ok {
alert.mapSums[key] = float32(0)
}
alert.mapSums[key] += temp
} }
default: default:
continue continue
@@ -233,13 +253,23 @@ func (am *AlertManager) HandleSystemAlerts(systemRecord *models.Record, systemIn
// sum up vals for each alert // sum up vals for each alert
for _, alert := range validAlerts { for _, alert := range validAlerts {
switch alert.name { switch alert.name {
case "Disk":
maxPct := float32(0)
for key, value := range alert.mapSums {
sumPct := float32(value)
if sumPct > maxPct {
maxPct = sumPct
alert.descriptor = fmt.Sprintf("Usage of %s", key)
}
}
alert.val = float64(maxPct / float32(alert.count))
case "Temperature": case "Temperature":
maxTemp := float32(0) maxTemp := float32(0)
for key, value := range alert.tempSums { for key, value := range alert.mapSums {
sumTemp := float32(value) / float32(alert.count) sumTemp := float32(value) / float32(alert.count)
if sumTemp > maxTemp { if sumTemp > maxTemp {
maxTemp = sumTemp maxTemp = sumTemp
alert.descriptor = fmt.Sprintf("Hottest sensor %s", key) alert.descriptor = fmt.Sprintf("Highest sensor %s", key)
} }
} }
alert.val = float64(maxTemp) alert.val = float64(maxTemp)
@@ -260,10 +290,11 @@ func (am *AlertManager) HandleSystemAlerts(systemRecord *models.Record, systemIn
} }
} }
} }
return nil
} }
func (am *AlertManager) sendSystemAlert(alert SystemAlertData) { func (am *AlertManager) sendSystemAlert(alert SystemAlertData) {
log.Printf("Sending alert %s: val %f | count %d | threshold %f\n", alert.name, alert.val, alert.count, alert.threshold) // log.Printf("Sending alert %s: val %f | count %d | threshold %f\n", alert.name, alert.val, alert.count, alert.threshold)
systemName := alert.systemRecord.GetString("name") systemName := alert.systemRecord.GetString("name")

View File

@@ -315,7 +315,9 @@ func (h *Hub) updateSystem(record *models.Record) {
} }
} }
// system info alerts (todo: extra fs alerts) // system info alerts (todo: extra fs alerts)
h.am.HandleSystemAlerts(record, systemData.Info, systemData.Stats.Temperatures) if err := h.am.HandleSystemAlerts(record, systemData.Info, systemData.Stats.Temperatures, systemData.Stats.ExtraFs); err != nil {
h.app.Logger().Error("System alerts error", "err", err.Error())
}
} }
// set system to specified status and save record // set system to specified status and save record

View File

@@ -54,7 +54,7 @@ export const updateSystemList = async () => {
export const updateAlerts = () => { export const updateAlerts = () => {
pb.collection('alerts') pb.collection('alerts')
.getFullList<AlertRecord>({ fields: 'id,name,system,value,min,triggered' }) .getFullList<AlertRecord>({ fields: 'id,name,system,value,min,triggered', sort: 'updated' })
.then((records) => { .then((records) => {
$alerts.set(records) $alerts.set(records)
}) })
@@ -315,7 +315,7 @@ export const alertInfo = {
name: 'Disk usage', name: 'Disk usage',
unit: '%', unit: '%',
icon: HardDriveIcon, icon: HardDriveIcon,
desc: 'Triggers when root usage exceeds a threshold.', desc: 'Triggers when usage of any disk exceeds a threshold.',
}, },
Bandwidth: { Bandwidth: {
name: 'Bandwidth', name: 'Bandwidth',