new agent healthcheck to support non-ssh connections

This commit is contained in:
henrygd
2025-07-08 16:43:33 -04:00
parent 529df84273
commit 6d1cec3c42
5 changed files with 119 additions and 145 deletions

View File

@@ -3,6 +3,7 @@ package main
import (
"beszel"
"beszel/internal/agent"
"beszel/internal/agent/health"
"flag"
"fmt"
"log"
@@ -50,12 +51,7 @@ func (opts *cmdOptions) parse() bool {
agent.Update()
return true
case "health":
// for health, we need to parse flags first to get the listen address
args := append(os.Args[2:], subcommand)
flag.CommandLine.Parse(args)
addr := opts.getAddress()
network := agent.GetNetwork(addr)
err := agent.Health(addr, network)
err := health.Check()
if err != nil {
log.Fatal(err)
}
@@ -115,8 +111,12 @@ func main() {
serverConfig.Addr = addr
serverConfig.Network = agent.GetNetwork(addr)
agent := agent.NewAgent()
if err := agent.StartServer(serverConfig); err != nil {
log.Fatal("Failed to start server:", err)
agent, err := agent.NewAgent("")
if err != nil {
log.Fatal("Failed to create agent: ", err)
}
if err := agent.Start(serverConfig); err != nil {
log.Fatal("Failed to start server: ", err)
}
}

View File

@@ -1,18 +0,0 @@
package agent
import (
"net"
"time"
)
// Health checks if the agent's server is running by attempting to connect to it.
//
// If an error occurs when attempting to connect to the server, it returns the error.
func Health(addr string, network string) error {
conn, err := net.DialTimeout(network, addr, 4*time.Second)
if err != nil {
return err
}
conn.Close()
return nil
}

View File

@@ -0,0 +1,43 @@
// Package health provides functions to check and update the health of the agent.
// It uses a file in the temp directory to store the timestamp of the last connection attempt.
// If the timestamp is older than 90 seconds, the agent is considered unhealthy.
// NB: The agent must be started with the Start() method to be considered healthy.
package health
import (
"errors"
"log"
"os"
"path/filepath"
"time"
)
// healthFile is the path to the health file
var healthFile = filepath.Join(os.TempDir(), "beszel_health")
// Check checks if the agent is connected by checking the modification time of the health file
func Check() error {
fileInfo, err := os.Stat(healthFile)
if err != nil {
return err
}
if time.Since(fileInfo.ModTime()) > 91*time.Second {
log.Println("over 90 seconds since last connection")
return errors.New("unhealthy")
}
return nil
}
// Update updates the modification time of the health file
func Update() error {
file, err := os.Create(healthFile)
if err != nil {
return err
}
return file.Close()
}
// CleanUp removes the health file
func CleanUp() error {
return os.Remove(healthFile)
}

View File

@@ -0,0 +1,67 @@
//go:build testing
// +build testing
package health
import (
"os"
"path/filepath"
"testing"
"time"
"testing/synctest"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestHealth(t *testing.T) {
// Override healthFile to use a temporary directory for this test.
originalHealthFile := healthFile
tmpDir := t.TempDir()
healthFile = filepath.Join(tmpDir, "beszel_health_test")
defer func() { healthFile = originalHealthFile }()
t.Run("check with no health file", func(t *testing.T) {
err := Check()
require.Error(t, err)
assert.True(t, os.IsNotExist(err), "expected a file-not-exist error, but got: %v", err)
})
t.Run("update and check", func(t *testing.T) {
err := Update()
require.NoError(t, err, "Update() failed")
err = Check()
assert.NoError(t, err, "Check() failed immediately after Update()")
})
// This test uses synctest to simulate time passing.
// NOTE: This test requires GOEXPERIMENT=synctest to run.
t.Run("check with simulated time", func(t *testing.T) {
synctest.Run(func() {
// Update the file to set the initial timestamp.
require.NoError(t, Update(), "Update() failed inside synctest")
// Set the mtime to the current fake time to align the file's timestamp with the simulated clock.
now := time.Now()
require.NoError(t, os.Chtimes(healthFile, now, now), "Chtimes failed")
// Wait a duration less than the threshold.
time.Sleep(89 * time.Second)
synctest.Wait()
// The check should still pass.
assert.NoError(t, Check(), "Check() failed after 89s")
// Wait for the total duration to exceed the threshold.
time.Sleep(5 * time.Second)
synctest.Wait()
// The check should now fail as unhealthy.
err := Check()
require.Error(t, err, "Check() should have failed after 91s")
assert.Equal(t, "unhealthy", err.Error(), "Check() returned wrong error")
})
})
}

View File

@@ -1,118 +0,0 @@
//go:build testing
// +build testing
package agent_test
import (
"fmt"
"net"
"os"
"testing"
"github.com/stretchr/testify/require"
"beszel/internal/agent"
)
// setupTestServer creates a temporary server for testing
func setupTestServer(t *testing.T) (string, func()) {
// Create a temporary socket file for Unix socket testing
tempSockFile := os.TempDir() + "/beszel_health_test.sock"
// Clean up any existing socket file
os.Remove(tempSockFile)
// Create a listener
listener, err := net.Listen("unix", tempSockFile)
require.NoError(t, err, "Failed to create test listener")
// Start a simple server in a goroutine
go func() {
conn, err := listener.Accept()
if err != nil {
return // Listener closed
}
defer conn.Close()
// Just accept the connection and do nothing
}()
// Return the socket file path and a cleanup function
return tempSockFile, func() {
listener.Close()
os.Remove(tempSockFile)
}
}
// setupTCPTestServer creates a temporary TCP server for testing
func setupTCPTestServer(t *testing.T) (string, func()) {
// Listen on a random available port
listener, err := net.Listen("tcp", "127.0.0.1:0")
require.NoError(t, err, "Failed to create test listener")
// Get the port that was assigned
addr := listener.Addr().(*net.TCPAddr)
port := addr.Port
// Start a simple server in a goroutine
go func() {
conn, err := listener.Accept()
if err != nil {
return // Listener closed
}
defer conn.Close()
// Just accept the connection and do nothing
}()
// Return the address and a cleanup function
return fmt.Sprintf("127.0.0.1:%d", port), func() {
listener.Close()
}
}
func TestHealth(t *testing.T) {
t.Run("server is running (unix socket)", func(t *testing.T) {
// Setup a test server
sockFile, cleanup := setupTestServer(t)
defer cleanup()
// Run the health check with explicit parameters
err := agent.Health(sockFile, "unix")
require.NoError(t, err, "Failed to check health")
})
t.Run("server is running (tcp address)", func(t *testing.T) {
// Setup a test server
addr, cleanup := setupTCPTestServer(t)
defer cleanup()
// Run the health check with explicit parameters
err := agent.Health(addr, "tcp")
require.NoError(t, err, "Failed to check health")
})
t.Run("server is not running", func(t *testing.T) {
// Use an address that's likely not in use
addr := "127.0.0.1:65535"
// Run the health check with explicit parameters
err := agent.Health(addr, "tcp")
require.Error(t, err, "Health check should return an error when server is not running")
})
t.Run("invalid network", func(t *testing.T) {
// Use an invalid network type
err := agent.Health("127.0.0.1:8080", "invalid_network")
require.Error(t, err, "Health check should return an error with invalid network")
})
t.Run("unix socket not found", func(t *testing.T) {
// Use a non-existent unix socket
nonExistentSocket := os.TempDir() + "/non_existent_socket.sock"
// Make sure it really doesn't exist
os.Remove(nonExistentSocket)
err := agent.Health(nonExistentSocket, "unix")
require.Error(t, err, "Health check should return an error when socket doesn't exist")
})
}