From 6d1cec3c42085c26d906b1521c36e49e18ef4d70 Mon Sep 17 00:00:00 2001 From: henrygd Date: Tue, 8 Jul 2025 16:43:33 -0400 Subject: [PATCH] new agent healthcheck to support non-ssh connections --- beszel/cmd/agent/agent.go | 18 +-- beszel/internal/agent/health.go | 18 --- beszel/internal/agent/health/health.go | 43 +++++++ beszel/internal/agent/health/health_test.go | 67 +++++++++++ beszel/internal/agent/health_test.go | 118 -------------------- 5 files changed, 119 insertions(+), 145 deletions(-) delete mode 100644 beszel/internal/agent/health.go create mode 100644 beszel/internal/agent/health/health.go create mode 100644 beszel/internal/agent/health/health_test.go delete mode 100644 beszel/internal/agent/health_test.go diff --git a/beszel/cmd/agent/agent.go b/beszel/cmd/agent/agent.go index 2f5b3c1..0e26a83 100644 --- a/beszel/cmd/agent/agent.go +++ b/beszel/cmd/agent/agent.go @@ -3,6 +3,7 @@ package main import ( "beszel" "beszel/internal/agent" + "beszel/internal/agent/health" "flag" "fmt" "log" @@ -50,12 +51,7 @@ func (opts *cmdOptions) parse() bool { agent.Update() return true case "health": - // for health, we need to parse flags first to get the listen address - args := append(os.Args[2:], subcommand) - flag.CommandLine.Parse(args) - addr := opts.getAddress() - network := agent.GetNetwork(addr) - err := agent.Health(addr, network) + err := health.Check() if err != nil { log.Fatal(err) } @@ -115,8 +111,12 @@ func main() { serverConfig.Addr = addr serverConfig.Network = agent.GetNetwork(addr) - agent := agent.NewAgent() - if err := agent.StartServer(serverConfig); err != nil { - log.Fatal("Failed to start server:", err) + agent, err := agent.NewAgent("") + if err != nil { + log.Fatal("Failed to create agent: ", err) + } + + if err := agent.Start(serverConfig); err != nil { + log.Fatal("Failed to start server: ", err) } } diff --git a/beszel/internal/agent/health.go b/beszel/internal/agent/health.go deleted file mode 100644 index 5e6450e..0000000 --- a/beszel/internal/agent/health.go +++ /dev/null @@ -1,18 +0,0 @@ -package agent - -import ( - "net" - "time" -) - -// Health checks if the agent's server is running by attempting to connect to it. -// -// If an error occurs when attempting to connect to the server, it returns the error. -func Health(addr string, network string) error { - conn, err := net.DialTimeout(network, addr, 4*time.Second) - if err != nil { - return err - } - conn.Close() - return nil -} diff --git a/beszel/internal/agent/health/health.go b/beszel/internal/agent/health/health.go new file mode 100644 index 0000000..f2ccda7 --- /dev/null +++ b/beszel/internal/agent/health/health.go @@ -0,0 +1,43 @@ +// Package health provides functions to check and update the health of the agent. +// It uses a file in the temp directory to store the timestamp of the last connection attempt. +// If the timestamp is older than 90 seconds, the agent is considered unhealthy. +// NB: The agent must be started with the Start() method to be considered healthy. +package health + +import ( + "errors" + "log" + "os" + "path/filepath" + "time" +) + +// healthFile is the path to the health file +var healthFile = filepath.Join(os.TempDir(), "beszel_health") + +// Check checks if the agent is connected by checking the modification time of the health file +func Check() error { + fileInfo, err := os.Stat(healthFile) + if err != nil { + return err + } + if time.Since(fileInfo.ModTime()) > 91*time.Second { + log.Println("over 90 seconds since last connection") + return errors.New("unhealthy") + } + return nil +} + +// Update updates the modification time of the health file +func Update() error { + file, err := os.Create(healthFile) + if err != nil { + return err + } + return file.Close() +} + +// CleanUp removes the health file +func CleanUp() error { + return os.Remove(healthFile) +} diff --git a/beszel/internal/agent/health/health_test.go b/beszel/internal/agent/health/health_test.go new file mode 100644 index 0000000..4c57a1b --- /dev/null +++ b/beszel/internal/agent/health/health_test.go @@ -0,0 +1,67 @@ +//go:build testing +// +build testing + +package health + +import ( + "os" + "path/filepath" + "testing" + "time" + + "testing/synctest" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestHealth(t *testing.T) { + // Override healthFile to use a temporary directory for this test. + originalHealthFile := healthFile + tmpDir := t.TempDir() + healthFile = filepath.Join(tmpDir, "beszel_health_test") + defer func() { healthFile = originalHealthFile }() + + t.Run("check with no health file", func(t *testing.T) { + err := Check() + require.Error(t, err) + assert.True(t, os.IsNotExist(err), "expected a file-not-exist error, but got: %v", err) + }) + + t.Run("update and check", func(t *testing.T) { + err := Update() + require.NoError(t, err, "Update() failed") + + err = Check() + assert.NoError(t, err, "Check() failed immediately after Update()") + }) + + // This test uses synctest to simulate time passing. + // NOTE: This test requires GOEXPERIMENT=synctest to run. + t.Run("check with simulated time", func(t *testing.T) { + synctest.Run(func() { + // Update the file to set the initial timestamp. + require.NoError(t, Update(), "Update() failed inside synctest") + + // Set the mtime to the current fake time to align the file's timestamp with the simulated clock. + now := time.Now() + require.NoError(t, os.Chtimes(healthFile, now, now), "Chtimes failed") + + // Wait a duration less than the threshold. + time.Sleep(89 * time.Second) + synctest.Wait() + + // The check should still pass. + assert.NoError(t, Check(), "Check() failed after 89s") + + // Wait for the total duration to exceed the threshold. + time.Sleep(5 * time.Second) + synctest.Wait() + + // The check should now fail as unhealthy. + err := Check() + require.Error(t, err, "Check() should have failed after 91s") + assert.Equal(t, "unhealthy", err.Error(), "Check() returned wrong error") + }) + }) +} diff --git a/beszel/internal/agent/health_test.go b/beszel/internal/agent/health_test.go deleted file mode 100644 index 2e4d36d..0000000 --- a/beszel/internal/agent/health_test.go +++ /dev/null @@ -1,118 +0,0 @@ -//go:build testing -// +build testing - -package agent_test - -import ( - "fmt" - "net" - "os" - "testing" - - "github.com/stretchr/testify/require" - - "beszel/internal/agent" -) - -// setupTestServer creates a temporary server for testing -func setupTestServer(t *testing.T) (string, func()) { - // Create a temporary socket file for Unix socket testing - tempSockFile := os.TempDir() + "/beszel_health_test.sock" - - // Clean up any existing socket file - os.Remove(tempSockFile) - - // Create a listener - listener, err := net.Listen("unix", tempSockFile) - require.NoError(t, err, "Failed to create test listener") - - // Start a simple server in a goroutine - go func() { - conn, err := listener.Accept() - if err != nil { - return // Listener closed - } - defer conn.Close() - // Just accept the connection and do nothing - }() - - // Return the socket file path and a cleanup function - return tempSockFile, func() { - listener.Close() - os.Remove(tempSockFile) - } -} - -// setupTCPTestServer creates a temporary TCP server for testing -func setupTCPTestServer(t *testing.T) (string, func()) { - // Listen on a random available port - listener, err := net.Listen("tcp", "127.0.0.1:0") - require.NoError(t, err, "Failed to create test listener") - - // Get the port that was assigned - addr := listener.Addr().(*net.TCPAddr) - port := addr.Port - - // Start a simple server in a goroutine - go func() { - conn, err := listener.Accept() - if err != nil { - return // Listener closed - } - defer conn.Close() - // Just accept the connection and do nothing - }() - - // Return the address and a cleanup function - return fmt.Sprintf("127.0.0.1:%d", port), func() { - listener.Close() - } -} - -func TestHealth(t *testing.T) { - t.Run("server is running (unix socket)", func(t *testing.T) { - // Setup a test server - sockFile, cleanup := setupTestServer(t) - defer cleanup() - - // Run the health check with explicit parameters - err := agent.Health(sockFile, "unix") - require.NoError(t, err, "Failed to check health") - }) - - t.Run("server is running (tcp address)", func(t *testing.T) { - // Setup a test server - addr, cleanup := setupTCPTestServer(t) - defer cleanup() - - // Run the health check with explicit parameters - err := agent.Health(addr, "tcp") - require.NoError(t, err, "Failed to check health") - }) - - t.Run("server is not running", func(t *testing.T) { - // Use an address that's likely not in use - addr := "127.0.0.1:65535" - - // Run the health check with explicit parameters - err := agent.Health(addr, "tcp") - require.Error(t, err, "Health check should return an error when server is not running") - }) - - t.Run("invalid network", func(t *testing.T) { - // Use an invalid network type - err := agent.Health("127.0.0.1:8080", "invalid_network") - require.Error(t, err, "Health check should return an error with invalid network") - }) - - t.Run("unix socket not found", func(t *testing.T) { - // Use a non-existent unix socket - nonExistentSocket := os.TempDir() + "/non_existent_socket.sock" - - // Make sure it really doesn't exist - os.Remove(nonExistentSocket) - - err := agent.Health(nonExistentSocket, "unix") - require.Error(t, err, "Health check should return an error when socket doesn't exist") - }) -}