Mirror of https://github.com/fankes/beszel.git (synced 2025-10-19 17:59:28 +08:00)
- Splits the alerts package into three files. System alerts were not modified aside from updating to the slices.Delete method.
@@ -2,25 +2,24 @@
package alerts

import (
    "beszel/internal/entities/system"
    "fmt"
    "net/mail"
    "net/url"
    "strings"
    "sync"
    "time"

    "github.com/containrrr/shoutrrr"
    "github.com/goccy/go-json"
    "github.com/pocketbase/dbx"
    "github.com/pocketbase/pocketbase/apis"
    "github.com/pocketbase/pocketbase/core"
    "github.com/pocketbase/pocketbase/tools/mailer"
    "github.com/pocketbase/pocketbase/tools/types"
    "github.com/spf13/cast"
)

type AlertManager struct {
    app core.App
+   alertQueue chan alertTask
+   stopChan chan struct{}
+   pendingAlerts sync.Map
}

type AlertMessageData struct {
@@ -60,350 +59,43 @@ type SystemAlertData struct {
    descriptor string // override descriptor in notification body (for temp sensor, disk partition, etc)
}

// notification services that support title param
var supportsTitle = map[string]struct{}{
    "bark": {},
    "discord": {},
    "gotify": {},
    "ifttt": {},
    "join": {},
    "matrix": {},
    "ntfy": {},
    "opsgenie": {},
    "pushbullet": {},
    "pushover": {},
    "slack": {},
    "teams": {},
    "telegram": {},
    "zulip": {},
}

// NewAlertManager creates a new AlertManager instance.
func NewAlertManager(app core.App) *AlertManager {
-   return &AlertManager{
-       app: app,
+   am := &AlertManager{
+       app: app,
+       alertQueue: make(chan alertTask),
+       stopChan: make(chan struct{}),
    }
+   go am.startWorker()
+   return am
}
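The package-level supportsTitle set above replaces the string slice that SendShoutrrrAlert used to rebuild on every call and scan with a sliceContains helper (visible in a later hunk). A minimal, self-contained sketch of the empty-struct-set idiom; the scheme names and output here are illustrative, not the package's API:

```go
package main

import "fmt"

// an empty-struct map doubles as a set: zero bytes per value, O(1) lookups
var titleSchemes = map[string]struct{}{
    "ntfy":     {},
    "telegram": {},
}

func main() {
    for _, scheme := range []string{"ntfy", "mattermost"} {
        // comma-ok lookup replaces a linear scan over a slice
        if _, ok := titleSchemes[scheme]; ok {
            fmt.Println(scheme, "supports a title param")
        } else {
            fmt.Println(scheme, "needs a different workaround")
        }
    }
}
```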
func (am *AlertManager) HandleSystemAlerts(systemRecord *core.Record, systemInfo system.Info, temperatures map[string]float64, extraFs map[string]*system.FsStats) error {
    // start := time.Now()
    // defer func() {
    // log.Println("alert stats took", time.Since(start))
    // }()
    alertRecords, err := am.app.FindAllRecords("alerts",
        dbx.NewExp("system={:system}", dbx.Params{"system": systemRecord.Id}),
    )
    if err != nil || len(alertRecords) == 0 {
        // log.Println("no alerts found for system")
        return nil
    }

    var validAlerts []SystemAlertData
    now := systemRecord.GetDateTime("updated").Time().UTC()
    oldestTime := now

    for _, alertRecord := range alertRecords {
        name := alertRecord.GetString("name")
        var val float64
        unit := "%"

        switch name {
        case "CPU":
            val = systemInfo.Cpu
        case "Memory":
            val = systemInfo.MemPct
        case "Bandwidth":
            val = systemInfo.Bandwidth
            unit = " MB/s"
        case "Disk":
            maxUsedPct := systemInfo.DiskPct
            for _, fs := range extraFs {
                usedPct := fs.DiskUsed / fs.DiskTotal * 100
                if usedPct > maxUsedPct {
                    maxUsedPct = usedPct
                }
            }
            val = maxUsedPct
        case "Temperature":
            if temperatures == nil {
                continue
            }
            for _, temp := range temperatures {
                if temp > val {
                    val = temp
                }
            }
            unit = "°C"
        }

        triggered := alertRecord.GetBool("triggered")
        threshold := alertRecord.GetFloat("value")

        // CONTINUE
        // IF alert is not triggered and curValue is less than threshold
        // OR alert is triggered and curValue is greater than threshold
        if (!triggered && val <= threshold) || (triggered && val > threshold) {
            // log.Printf("Skipping alert %s: val %f | threshold %f | triggered %v\n", name, val, threshold, triggered)
            continue
        }

        min := max(1, cast.ToUint8(alertRecord.Get("min")))
        // add time to alert time to make sure it's slighty after record creation
        time := now.Add(-time.Duration(min) * time.Minute)
        if time.Before(oldestTime) {
            oldestTime = time
        }

        validAlerts = append(validAlerts, SystemAlertData{
            systemRecord: systemRecord,
            alertRecord: alertRecord,
            name: name,
            unit: unit,
            val: val,
            threshold: threshold,
            triggered: triggered,
            time: time,
            min: min,
        })
    }

    systemStats := []struct {
        Stats []byte `db:"stats"`
        Created types.DateTime `db:"created"`
    }{}

    err = am.app.DB().
        Select("stats", "created").
        From("system_stats").
        Where(dbx.NewExp(
            "system={:system} AND type='1m' AND created > {:created}",
            dbx.Params{
                "system": systemRecord.Id,
                // subtract some time to give us a bit of buffer
                "created": oldestTime.Add(-time.Second * 90),
            },
        )).
        OrderBy("created").
        All(&systemStats)
    if err != nil {
        return err
    }

    // get oldest record creation time from first record in the slice
    oldestRecordTime := systemStats[0].Created.Time()
    // log.Println("oldestRecordTime", oldestRecordTime.String())

    // delete from validAlerts if time is older than oldestRecord
    for i := 0; i < len(validAlerts); i++ {
        if validAlerts[i].time.Before(oldestRecordTime) {
            // log.Println("deleting alert - time is older than oldestRecord", validAlerts[i].name, oldestRecordTime, validAlerts[i].time)
            validAlerts = append(validAlerts[:i], validAlerts[i+1:]...)
        }
    }

    if len(validAlerts) == 0 {
        // log.Println("no valid alerts found")
        return nil
    }

    var stats SystemAlertStats

    // we can skip the latest systemStats record since it's the current value
    for i := 0; i < len(systemStats); i++ {
        stat := systemStats[i]
        // subtract 10 seconds to give a small time buffer
        systemStatsCreation := stat.Created.Time().Add(-time.Second * 10)
        if err := json.Unmarshal(stat.Stats, &stats); err != nil {
            return err
        }
        // log.Println("stats", stats)
        for j := range validAlerts {
            alert := &validAlerts[j]
            // reset alert val on first iteration
            if i == 0 {
                alert.val = 0
            }
            // continue if system_stats is older than alert time range
            if systemStatsCreation.Before(alert.time) {
                continue
            }
            // add to alert value
            switch alert.name {
            case "CPU":
                alert.val += stats.Cpu
            case "Memory":
                alert.val += stats.Mem
            case "Bandwidth":
                alert.val += stats.NetSent + stats.NetRecv
            case "Disk":
                if alert.mapSums == nil {
                    alert.mapSums = make(map[string]float32, len(extraFs)+1)
                }
                // add root disk
                if _, ok := alert.mapSums["root"]; !ok {
                    alert.mapSums["root"] = 0.0
                }
                alert.mapSums["root"] += float32(stats.Disk)
                // add extra disks
                for key, fs := range extraFs {
                    if _, ok := alert.mapSums[key]; !ok {
                        alert.mapSums[key] = 0.0
                    }
                    alert.mapSums[key] += float32(fs.DiskUsed / fs.DiskTotal * 100)
                }
            case "Temperature":
                if alert.mapSums == nil {
                    alert.mapSums = make(map[string]float32, len(stats.Temperatures))
                }
                for key, temp := range stats.Temperatures {
                    if _, ok := alert.mapSums[key]; !ok {
                        alert.mapSums[key] = float32(0)
                    }
                    alert.mapSums[key] += temp
                }
            default:
                continue
            }
            alert.count++
        }
    }
    // sum up vals for each alert
    for _, alert := range validAlerts {
        switch alert.name {
        case "Disk":
            maxPct := float32(0)
            for key, value := range alert.mapSums {
                sumPct := float32(value)
                if sumPct > maxPct {
                    maxPct = sumPct
                    alert.descriptor = fmt.Sprintf("Usage of %s", key)
                }
            }
            alert.val = float64(maxPct / float32(alert.count))
        case "Temperature":
            maxTemp := float32(0)
            for key, value := range alert.mapSums {
                sumTemp := float32(value) / float32(alert.count)
                if sumTemp > maxTemp {
                    maxTemp = sumTemp
                    alert.descriptor = fmt.Sprintf("Highest sensor %s", key)
                }
            }
            alert.val = float64(maxTemp)
        default:
            alert.val = alert.val / float64(alert.count)
        }
        minCount := float32(alert.min) / 1.2
        // log.Println("alert", alert.name, "val", alert.val, "threshold", alert.threshold, "triggered", alert.triggered)
        // log.Printf("%s: val %f | count %d | min-count %f | threshold %f\n", alert.name, alert.val, alert.count, minCount, alert.threshold)
        // pass through alert if count is greater than or equal to minCount
        if float32(alert.count) >= minCount {
            if !alert.triggered && alert.val > alert.threshold {
                alert.triggered = true
                go am.sendSystemAlert(alert)
            } else if alert.triggered && alert.val <= alert.threshold {
                alert.triggered = false
                go am.sendSystemAlert(alert)
            }
        }
    }
    return nil
}

func (am *AlertManager) sendSystemAlert(alert SystemAlertData) {
    // log.Printf("Sending alert %s: val %f | count %d | threshold %f\n", alert.name, alert.val, alert.count, alert.threshold)
    systemName := alert.systemRecord.GetString("name")

    // change Disk to Disk usage
    if alert.name == "Disk" {
        alert.name += " usage"
    }

    // make title alert name lowercase if not CPU
    titleAlertName := alert.name
    if titleAlertName != "CPU" {
        titleAlertName = strings.ToLower(titleAlertName)
    }

    var subject string
    if alert.triggered {
        subject = fmt.Sprintf("%s %s above threshold", systemName, titleAlertName)
    } else {
        subject = fmt.Sprintf("%s %s below threshold", systemName, titleAlertName)
    }
    minutesLabel := "minute"
    if alert.min > 1 {
        minutesLabel += "s"
    }
    if alert.descriptor == "" {
        alert.descriptor = alert.name
    }
    body := fmt.Sprintf("%s averaged %.2f%s for the previous %v %s.", alert.descriptor, alert.val, alert.unit, alert.min, minutesLabel)

    alert.alertRecord.Set("triggered", alert.triggered)
    if err := am.app.Save(alert.alertRecord); err != nil {
        // app.Logger().Error("failed to save alert record", "err", err.Error())
        return
    }
    // expand the user relation and send the alert
    if errs := am.app.ExpandRecord(alert.alertRecord, []string{"user"}, nil); len(errs) > 0 {
        // app.Logger().Error("failed to expand user relation", "errs", errs)
        return
    }
    if user := alert.alertRecord.ExpandedOne("user"); user != nil {
        am.sendAlert(AlertMessageData{
            UserID: user.Id,
            Title: subject,
            Message: body,
            Link: am.app.Settings().Meta.AppURL + "/system/" + url.PathEscape(systemName),
            LinkText: "View " + systemName,
        })
    }
}

// todo: allow x minutes downtime before sending alert
func (am *AlertManager) HandleStatusAlerts(newStatus string, oldSystemRecord *core.Record) error {
    var alertStatus string
    switch newStatus {
    case "up":
        if oldSystemRecord.GetString("status") == "down" {
            alertStatus = "up"
        }
    case "down":
        if oldSystemRecord.GetString("status") == "up" {
            alertStatus = "down"
        }
    }
    if alertStatus == "" {
        return nil
    }
    // check if use
    alertRecords, err := am.app.FindAllRecords("alerts",
        dbx.HashExp{
            "system": oldSystemRecord.Id,
            "name": "Status",
        },
    )
    if err != nil || len(alertRecords) == 0 {
        // log.Println("no alerts found for system")
        return nil
    }
    for _, alertRecord := range alertRecords {
        // expand the user relation
        if errs := am.app.ExpandRecord(alertRecord, []string{"user"}, nil); len(errs) > 0 {
            return fmt.Errorf("failed to expand: %v", errs)
        }
        user := alertRecord.ExpandedOne("user")
        if user == nil {
            return nil
        }
        emoji := "\U0001F534"
        if alertStatus == "up" {
            emoji = "\u2705"
        }
        // send alert
        systemName := oldSystemRecord.GetString("name")
        am.sendAlert(AlertMessageData{
            UserID: user.Id,
            Title: fmt.Sprintf("Connection to %s is %s %v", systemName, alertStatus, emoji),
            Message: fmt.Sprintf("Connection to %s is %s", systemName, alertStatus),
            Link: am.app.Settings().Meta.AppURL + "/system/" + url.PathEscape(systemName),
            LinkText: "View " + systemName,
        })
    }
    return nil
}
-func (am *AlertManager) sendAlert(data AlertMessageData) {
+func (am *AlertManager) SendAlert(data AlertMessageData) error {
    // get user settings
    record, err := am.app.FindFirstRecordByFilter(
        "user_settings", "user={:user}",
        dbx.Params{"user": data.UserID},
    )
    if err != nil {
        am.app.Logger().Error("Failed to get user settings", "err", err.Error())
-       return
+       return err
    }
    // unmarshal user settings
    userAlertSettings := UserNotificationSettings{
@@ -421,8 +113,7 @@ func (am *AlertManager) sendAlert(data AlertMessageData) {
    }
    // send alerts via email
    if len(userAlertSettings.Emails) == 0 {
        // log.Println("No email addresses found")
-       return
+       return nil
    }
    addresses := []mail.Address{}
    for _, email := range userAlertSettings.Emails {
@@ -437,18 +128,16 @@ func (am *AlertManager) sendAlert(data AlertMessageData) {
            Name: am.app.Settings().Meta.SenderName,
        },
    }
-   if err := am.app.NewMailClient().Send(&message); err != nil {
-       am.app.Logger().Error("Failed to send alert: ", "err", err.Error())
-   } else {
-       am.app.Logger().Info("Sent email alert", "to", message.To, "subj", message.Subject)
+   err = am.app.NewMailClient().Send(&message)
+   if err != nil {
+       return err
    }
+   am.app.Logger().Info("Sent email alert", "to", message.To, "subj", message.Subject)
+   return nil
}

// SendShoutrrrAlert sends an alert via a Shoutrrr URL
func (am *AlertManager) SendShoutrrrAlert(notificationUrl, title, message, link, linkText string) error {
-   // services that support title param
-   supportsTitle := []string{"bark", "discord", "gotify", "ifttt", "join", "matrix", "ntfy", "opsgenie", "pushbullet", "pushover", "slack", "teams", "telegram", "zulip"}

    // Parse the URL
    parsedURL, err := url.Parse(notificationUrl)
    if err != nil {
@@ -458,7 +147,7 @@ func (am *AlertManager) SendShoutrrrAlert(notificationUrl, title, message, link,
    queryParams := parsedURL.Query()

    // Add title
-   if sliceContains(supportsTitle, scheme) {
+   if _, ok := supportsTitle[scheme]; ok {
        queryParams.Add("title", title)
    } else if scheme == "mattermost" {
        // use markdown title for mattermost
@@ -499,16 +188,6 @@ func (am *AlertManager) SendShoutrrrAlert(notificationUrl, title, message, link,
    return nil
}

-// Contains checks if a string is present in a slice of strings
-func sliceContains(slice []string, item string) bool {
-   for _, v := range slice {
-       if v == item {
-           return true
-       }
-   }
-   return false
-}

func (am *AlertManager) SendTestNotification(e *core.RequestEvent) error {
    info, _ := e.RequestInfo()
    if info.Auth == nil {
beszel/internal/alerts/alerts_status.go (new file, 175 lines)
@@ -0,0 +1,175 @@
package alerts

import (
    "fmt"
    "net/url"
    "strings"
    "time"

    "github.com/pocketbase/dbx"
    "github.com/pocketbase/pocketbase/core"
)

type alertTask struct {
    action string // "schedule" or "cancel"
    systemName string
    alertRecord *core.Record
    delay time.Duration
}

type alertInfo struct {
    systemName string
    alertRecord *core.Record
    expireTime time.Time
}

// startWorker is a long-running goroutine that processes alert tasks
// every x seconds. It must be running to process status alerts.
func (am *AlertManager) startWorker() {
    // no special reason for 13 seconds
    tick := time.Tick(13 * time.Second)
    for {
        select {
        case <-am.stopChan:
            return
        case task := <-am.alertQueue:
            switch task.action {
            case "schedule":
                am.pendingAlerts.Store(task.alertRecord.Id, &alertInfo{
                    systemName: task.systemName,
                    alertRecord: task.alertRecord,
                    expireTime: time.Now().Add(task.delay),
                })
            case "cancel":
                am.pendingAlerts.Delete(task.alertRecord.Id)
            }
        case <-tick:
            // Check for expired alerts every tick
            now := time.Now()
            for key, value := range am.pendingAlerts.Range {
                info := value.(*alertInfo)
                if now.After(info.expireTime) {
                    // Downtime delay has passed, process alert
                    am.sendStatusAlert("down", info.systemName, info.alertRecord)
                    am.pendingAlerts.Delete(key)
                }
            }
        }
    }
}

// StopWorker shuts down the AlertManager.worker goroutine
func (am *AlertManager) StopWorker() {
    close(am.stopChan)
}

// HandleStatusAlerts manages the logic when system status changes.
func (am *AlertManager) HandleStatusAlerts(newStatus string, oldSystemRecord *core.Record) error {
    switch newStatus {
    case "up":
        if oldSystemRecord.GetString("status") != "down" {
            return nil
        }
    case "down":
        if oldSystemRecord.GetString("status") != "up" {
            return nil
        }
    default:
        return nil
    }

    alertRecords, err := am.getSystemStatusAlerts(oldSystemRecord.Id)
    if err != nil {
        return err
    }
    if len(alertRecords) == 0 {
        return nil
    }

    systemName := oldSystemRecord.GetString("name")
    if newStatus == "down" {
        am.handleSystemDown(systemName, alertRecords)
    } else {
        am.handleSystemUp(systemName, alertRecords)
    }
    return nil
}

// getSystemStatusAlerts retrieves all "Status" alert records for a given system ID.
func (am *AlertManager) getSystemStatusAlerts(systemID string) ([]*core.Record, error) {
    alertRecords, err := am.app.FindAllRecords("alerts", dbx.HashExp{
        "system": systemID,
        "name": "Status",
    })
    if err != nil {
        return nil, err
    }
    return alertRecords, nil
}

// Schedules delayed "down" alerts for each alert record.
func (am *AlertManager) handleSystemDown(systemName string, alertRecords []*core.Record) {
    for _, alertRecord := range alertRecords {
        // Continue if alert is already scheduled
        if _, exists := am.pendingAlerts.Load(alertRecord.Id); exists {
            continue
        }
        // Schedule by adding to queue
        min := max(1, alertRecord.GetInt("min"))
        am.alertQueue <- alertTask{
            action: "schedule",
            systemName: systemName,
            alertRecord: alertRecord,
            delay: time.Duration(min) * time.Minute,
        }
    }
}

// handleSystemUp manages the logic when a system status changes to "up".
// It cancels any pending alerts and sends "up" alerts.
func (am *AlertManager) handleSystemUp(systemName string, alertRecords []*core.Record) {
    for _, alertRecord := range alertRecords {
        alertRecordID := alertRecord.Id
        // If alert exists for record, delete and continue (down alert not sent)
        if _, exists := am.pendingAlerts.Load(alertRecordID); exists {
            am.alertQueue <- alertTask{
                action: "cancel",
                alertRecord: alertRecord,
            }
            continue
        }
        // No alert scheduled for this record, send "up" alert
        if err := am.sendStatusAlert("up", systemName, alertRecord); err != nil {
            am.app.Logger().Error("Failed to send alert", "err", err.Error())
        }
    }
}

// sendStatusAlert sends a status alert ("up" or "down") to the users associated with the alert records.
func (am *AlertManager) sendStatusAlert(alertStatus string, systemName string, alertRecord *core.Record) error {
    var emoji string
    if alertStatus == "up" {
        emoji = "\u2705" // Green checkmark emoji
    } else {
        emoji = "\U0001F534" // Red alert emoji
    }

    title := fmt.Sprintf("Connection to %s is %s %v", systemName, alertStatus, emoji)
    message := strings.TrimSuffix(title, emoji)

    if errs := am.app.ExpandRecord(alertRecord, []string{"user"}, nil); len(errs) > 0 {
        return errs["user"]
    }
    user := alertRecord.ExpandedOne("user")
    if user == nil {
        return nil
    }

    return am.SendAlert(AlertMessageData{
        UserID: user.Id,
        Title: title,
        Message: message,
        Link: am.app.Settings().Meta.AppURL + "/system/" + url.PathEscape(systemName),
        LinkText: "View " + systemName,
    })
}
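The comments in alerts_status.go above describe the flow: handleSystemDown enqueues a "schedule" task, handleSystemUp enqueues a "cancel", and the worker's ticker fires any pending alert whose downtime delay has elapsed. Below is a stripped-down, self-contained miniature of that schedule/cancel pattern, assuming nothing beyond the standard library; the task and pending types echo alertTask and alertInfo, while the IDs, one-second tick, and print statements are illustrative only.

```go
package main

import (
    "fmt"
    "sync"
    "time"
)

type task struct {
    action string // "schedule" or "cancel"
    id     string
    delay  time.Duration
}

type pending struct {
    id         string
    expireTime time.Time
}

func worker(queue chan task, stop chan struct{}) {
    var pendingAlerts sync.Map
    tick := time.Tick(time.Second) // the real worker ticks every 13s
    for {
        select {
        case <-stop:
            return
        case t := <-queue:
            switch t.action {
            case "schedule":
                pendingAlerts.Store(t.id, &pending{id: t.id, expireTime: time.Now().Add(t.delay)})
            case "cancel":
                pendingAlerts.Delete(t.id)
            }
        case <-tick:
            // fire every entry whose downtime delay has elapsed
            now := time.Now()
            pendingAlerts.Range(func(key, value any) bool {
                if p := value.(*pending); now.After(p.expireTime) {
                    fmt.Println("send down alert for", p.id) // stand-in for sendStatusAlert
                    pendingAlerts.Delete(key)
                }
                return true
            })
        }
    }
}

func main() {
    queue, stop := make(chan task), make(chan struct{})
    go worker(queue, stop)

    queue <- task{action: "schedule", id: "sys-a", delay: 2 * time.Second} // fires after ~2s
    queue <- task{action: "schedule", id: "sys-b", delay: 2 * time.Second}
    queue <- task{action: "cancel", id: "sys-b"} // cancelled before it fires

    time.Sleep(4 * time.Second)
    close(stop)
}
```

One design note: in Go 1.23+ the ticker returned by time.Tick can be garbage-collected once the goroutine exits, so it is fine for a worker like this; on older toolchains, time.NewTicker with a deferred Stop is the safer choice for a worker that can be shut down.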
beszel/internal/alerts/alerts_system.go (new file, 288 lines)
@@ -0,0 +1,288 @@
package alerts

import (
    "beszel/internal/entities/system"
    "fmt"
    "net/url"
    "slices"
    "strings"
    "time"

    "github.com/goccy/go-json"
    "github.com/pocketbase/dbx"
    "github.com/pocketbase/pocketbase/core"
    "github.com/pocketbase/pocketbase/tools/types"
    "github.com/spf13/cast"
)

func (am *AlertManager) HandleSystemAlerts(systemRecord *core.Record, systemInfo system.Info, temperatures map[string]float64, extraFs map[string]*system.FsStats) error {
    alertRecords, err := am.app.FindAllRecords("alerts",
        dbx.NewExp("system={:system}", dbx.Params{"system": systemRecord.Id}),
    )
    if err != nil || len(alertRecords) == 0 {
        // log.Println("no alerts found for system")
        return nil
    }

    var validAlerts []SystemAlertData
    now := systemRecord.GetDateTime("updated").Time().UTC()
    oldestTime := now

    for _, alertRecord := range alertRecords {
        name := alertRecord.GetString("name")
        var val float64
        unit := "%"

        switch name {
        case "CPU":
            val = systemInfo.Cpu
        case "Memory":
            val = systemInfo.MemPct
        case "Bandwidth":
            val = systemInfo.Bandwidth
            unit = " MB/s"
        case "Disk":
            maxUsedPct := systemInfo.DiskPct
            for _, fs := range extraFs {
                usedPct := fs.DiskUsed / fs.DiskTotal * 100
                if usedPct > maxUsedPct {
                    maxUsedPct = usedPct
                }
            }
            val = maxUsedPct
        case "Temperature":
            if temperatures == nil {
                continue
            }
            for _, temp := range temperatures {
                if temp > val {
                    val = temp
                }
            }
            unit = "°C"
        }

        triggered := alertRecord.GetBool("triggered")
        threshold := alertRecord.GetFloat("value")

        // CONTINUE
        // IF alert is not triggered and curValue is less than threshold
        // OR alert is triggered and curValue is greater than threshold
        if (!triggered && val <= threshold) || (triggered && val > threshold) {
            // log.Printf("Skipping alert %s: val %f | threshold %f | triggered %v\n", name, val, threshold, triggered)
            continue
        }

        min := max(1, cast.ToUint8(alertRecord.Get("min")))
        // add time to alert time to make sure it's slighty after record creation
        time := now.Add(-time.Duration(min) * time.Minute)
        if time.Before(oldestTime) {
            oldestTime = time
        }

        validAlerts = append(validAlerts, SystemAlertData{
            systemRecord: systemRecord,
            alertRecord: alertRecord,
            name: name,
            unit: unit,
            val: val,
            threshold: threshold,
            triggered: triggered,
            time: time,
            min: min,
        })
    }

    systemStats := []struct {
        Stats []byte `db:"stats"`
        Created types.DateTime `db:"created"`
    }{}

    err = am.app.DB().
        Select("stats", "created").
        From("system_stats").
        Where(dbx.NewExp(
            "system={:system} AND type='1m' AND created > {:created}",
            dbx.Params{
                "system": systemRecord.Id,
                // subtract some time to give us a bit of buffer
                "created": oldestTime.Add(-time.Second * 90),
            },
        )).
        OrderBy("created").
        All(&systemStats)
    if err != nil {
        return err
    }

    // get oldest record creation time from first record in the slice
    oldestRecordTime := systemStats[0].Created.Time()
    // log.Println("oldestRecordTime", oldestRecordTime.String())

    // delete from validAlerts if time is older than oldestRecord
    for i := range validAlerts {
        if validAlerts[i].time.Before(oldestRecordTime) {
            // log.Println("deleting alert - time is older than oldestRecord", validAlerts[i].name, oldestRecordTime, validAlerts[i].time)
            validAlerts = slices.Delete(validAlerts, i, i+1)
        }
    }

    if len(validAlerts) == 0 {
        // log.Println("no valid alerts found")
        return nil
    }

    var stats SystemAlertStats

    // we can skip the latest systemStats record since it's the current value
    for i := range systemStats {
        stat := systemStats[i]
        // subtract 10 seconds to give a small time buffer
        systemStatsCreation := stat.Created.Time().Add(-time.Second * 10)
        if err := json.Unmarshal(stat.Stats, &stats); err != nil {
            return err
        }
        // log.Println("stats", stats)
        for j := range validAlerts {
            alert := &validAlerts[j]
            // reset alert val on first iteration
            if i == 0 {
                alert.val = 0
            }
            // continue if system_stats is older than alert time range
            if systemStatsCreation.Before(alert.time) {
                continue
            }
            // add to alert value
            switch alert.name {
            case "CPU":
                alert.val += stats.Cpu
            case "Memory":
                alert.val += stats.Mem
            case "Bandwidth":
                alert.val += stats.NetSent + stats.NetRecv
            case "Disk":
                if alert.mapSums == nil {
                    alert.mapSums = make(map[string]float32, len(extraFs)+1)
                }
                // add root disk
                if _, ok := alert.mapSums["root"]; !ok {
                    alert.mapSums["root"] = 0.0
                }
                alert.mapSums["root"] += float32(stats.Disk)
                // add extra disks
                for key, fs := range extraFs {
                    if _, ok := alert.mapSums[key]; !ok {
                        alert.mapSums[key] = 0.0
                    }
                    alert.mapSums[key] += float32(fs.DiskUsed / fs.DiskTotal * 100)
                }
            case "Temperature":
                if alert.mapSums == nil {
                    alert.mapSums = make(map[string]float32, len(stats.Temperatures))
                }
                for key, temp := range stats.Temperatures {
                    if _, ok := alert.mapSums[key]; !ok {
                        alert.mapSums[key] = float32(0)
                    }
                    alert.mapSums[key] += temp
                }
            default:
                continue
            }
            alert.count++
        }
    }
    // sum up vals for each alert
    for _, alert := range validAlerts {
        switch alert.name {
        case "Disk":
            maxPct := float32(0)
            for key, value := range alert.mapSums {
                sumPct := float32(value)
                if sumPct > maxPct {
                    maxPct = sumPct
                    alert.descriptor = fmt.Sprintf("Usage of %s", key)
                }
            }
            alert.val = float64(maxPct / float32(alert.count))
        case "Temperature":
            maxTemp := float32(0)
            for key, value := range alert.mapSums {
                sumTemp := float32(value) / float32(alert.count)
                if sumTemp > maxTemp {
                    maxTemp = sumTemp
                    alert.descriptor = fmt.Sprintf("Highest sensor %s", key)
                }
            }
            alert.val = float64(maxTemp)
        default:
            alert.val = alert.val / float64(alert.count)
        }
        minCount := float32(alert.min) / 1.2
        // log.Println("alert", alert.name, "val", alert.val, "threshold", alert.threshold, "triggered", alert.triggered)
        // log.Printf("%s: val %f | count %d | min-count %f | threshold %f\n", alert.name, alert.val, alert.count, minCount, alert.threshold)
        // pass through alert if count is greater than or equal to minCount
        if float32(alert.count) >= minCount {
            if !alert.triggered && alert.val > alert.threshold {
                alert.triggered = true
                go am.sendSystemAlert(alert)
            } else if alert.triggered && alert.val <= alert.threshold {
                alert.triggered = false
                go am.sendSystemAlert(alert)
            }
        }
    }
    return nil
}

func (am *AlertManager) sendSystemAlert(alert SystemAlertData) {
    // log.Printf("Sending alert %s: val %f | count %d | threshold %f\n", alert.name, alert.val, alert.count, alert.threshold)
    systemName := alert.systemRecord.GetString("name")

    // change Disk to Disk usage
    if alert.name == "Disk" {
        alert.name += " usage"
    }

    // make title alert name lowercase if not CPU
    titleAlertName := alert.name
    if titleAlertName != "CPU" {
        titleAlertName = strings.ToLower(titleAlertName)
    }

    var subject string
    if alert.triggered {
        subject = fmt.Sprintf("%s %s above threshold", systemName, titleAlertName)
    } else {
        subject = fmt.Sprintf("%s %s below threshold", systemName, titleAlertName)
    }
    minutesLabel := "minute"
    if alert.min > 1 {
        minutesLabel += "s"
    }
    if alert.descriptor == "" {
        alert.descriptor = alert.name
    }
    body := fmt.Sprintf("%s averaged %.2f%s for the previous %v %s.", alert.descriptor, alert.val, alert.unit, alert.min, minutesLabel)

    alert.alertRecord.Set("triggered", alert.triggered)
    if err := am.app.Save(alert.alertRecord); err != nil {
        // app.Logger().Error("failed to save alert record", "err", err.Error())
        return
    }
    // expand the user relation and send the alert
    if errs := am.app.ExpandRecord(alert.alertRecord, []string{"user"}, nil); len(errs) > 0 {
        // app.Logger().Error("failed to expand user relation", "errs", errs)
        return
    }
    if user := alert.alertRecord.ExpandedOne("user"); user != nil {
        am.SendAlert(AlertMessageData{
            UserID: user.Id,
            Title: subject,
            Message: body,
            Link: am.app.Settings().Meta.AppURL + "/system/" + url.PathEscape(systemName),
            LinkText: "View " + systemName,
        })
    }
}
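The notable change to HandleSystemAlerts above is the stale-alert cleanup, which now calls slices.Delete (standard library, Go 1.21+) instead of the append(s[:i], s[i+1:]...) idiom. A minimal sketch of what slices.Delete(s, i, i+1) does, using illustrative data:

```go
package main

import (
    "fmt"
    "slices"
)

func main() {
    vals := []string{"cpu", "memory", "disk", "temperature"}

    // remove the element at index 2 ("disk"): Delete shifts the tail left
    // and returns the shortened slice, so the result must be reassigned
    vals = slices.Delete(vals, 2, 3)

    fmt.Println(vals, len(vals)) // [cpu memory temperature] 3
}
```

Deleting while iterating forward still skips whatever element slides into the freed index, exactly as the old append form did, so callers that may drop several adjacent entries usually iterate in reverse or reach for slices.DeleteFunc instead.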
@@ -160,13 +160,11 @@ export function SystemAlertGlobal({
function AlertContent({ data }: { data: AlertData }) {
    const { key } = data

-   const hasSliders = !("single" in data.alert)
+   const singleDescription = data.alert.singleDesc?.()

    const [checked, setChecked] = useState(data.checked || false)
-   const [min, setMin] = useState(data.min || (hasSliders ? 10 : 0))
-   const [value, setValue] = useState(data.val || (hasSliders ? 80 : 0))

-   const showSliders = checked && hasSliders
+   const [min, setMin] = useState(data.min || 10)
+   const [value, setValue] = useState(data.val || (singleDescription ? 0 : 80))

    const newMin = useRef(min)
    const newValue = useRef(value)
@@ -180,14 +178,14 @@ function AlertContent({ data }: { data: AlertData }) {
            <label
                htmlFor={`s${key}`}
                className={cn("flex flex-row items-center justify-between gap-4 cursor-pointer p-4", {
-                   "pb-0": showSliders,
+                   "pb-0": checked,
                })}
            >
                <div className="grid gap-1 select-none">
                    <p className="font-semibold flex gap-3 items-center">
                        <Icon className="h-4 w-4 opacity-85" /> {data.alert.name()}
                    </p>
-                   {!showSliders && <span className="block text-sm text-muted-foreground">{data.alert.desc()}</span>}
+                   {!checked && <span className="block text-sm text-muted-foreground">{data.alert.desc()}</span>}
                </div>
                <Switch
                    id={`s${key}`}
@@ -198,9 +196,10 @@ function AlertContent({ data }: { data: AlertData }) {
                    }}
                />
            </label>
-           {showSliders && (
+           {checked && (
                <div className="grid sm:grid-cols-2 mt-1.5 gap-5 px-4 pb-5 tabular-nums text-muted-foreground">
                    <Suspense fallback={<div className="h-10" />}>
+                       {!singleDescription && (
                        <div>
                            <p id={`v${key}`} className="text-sm block h-8">
                                <Trans>
@@ -222,8 +221,12 @@ function AlertContent({ data }: { data: AlertData }) {
                            />
                        </div>
                    </div>
-                   <div>
-                       <p id={`t${key}`} className="text-sm block h-8">
+                   )}
+                   <div className={cn(singleDescription && "col-span-full lowercase")}>
+                       <p id={`t${key}`} className="text-sm block h-8 first-letter:uppercase">
+                           {singleDescription && (
+                               <>{singleDescription}{` `}</>
+                           )}
                            <Trans>
                                For <strong className="text-foreground">{min}</strong>{" "}
                                <Plural value={min} one=" minute" other=" minutes" />
@@ -302,6 +302,13 @@ export default function SystemDetail({ name }: { name: string }) {
    const hasGpuData = lastGpuVals.length > 0
    const hasGpuPowerData = lastGpuVals.some((gpu) => gpu.p !== undefined)

+   let translatedStatus: string = system.status
+   if (system.status === "up") {
+       translatedStatus = t({ message: "Up", comment: "Context: System is up" })
+   } else if (system.status === "down") {
+       translatedStatus = t({ message: "Down", comment: "Context: System is down" })
+   }
+
    return (
        <>
            <div id="chartwrap" className="grid gap-4 mb-10 overflow-x-clip">
@@ -328,7 +335,7 @@ export default function SystemDetail({ name }: { name: string }) {
                })}
            ></span>
        </span>
-       {system.status}
+       {translatedStatus}
    </div>
    {systemInfo.map(({ value, label, Icon, hide }, i) => {
        if (hide || !value) {
@@ -302,7 +302,8 @@ export const alertInfo: Record<string, AlertInfo> = {
        unit: "",
        icon: ServerIcon,
        desc: () => t`Triggers when status switches between up and down`,
-       single: true,
+       /** "for x minutes" is appended to desc when only one value */
+       singleDesc: () => t`System` + " " + t`Down`,
    },
    CPU: {
        name: () => t`CPU Usage`,
beszel/site/src/types.d.ts (vendored, 3 changed lines)
@@ -200,6 +200,7 @@ interface AlertInfo {
    unit: string
    icon: any
    desc: () => string
    single?: boolean
    max?: number
    /** Single value description (when there's only one value, like status) */
    singleDesc?: () => string
}