From 539c0ccb1d7b49468bf12c56990198b82c69daad Mon Sep 17 00:00:00 2001 From: Henry Dollman Date: Mon, 21 Oct 2024 17:00:13 -0400 Subject: [PATCH] retry failed containers separately so we can run them in parallel (#58) --- beszel/internal/agent/docker.go | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/beszel/internal/agent/docker.go b/beszel/internal/agent/docker.go index 2c3d819..178951c 100644 --- a/beszel/internal/agent/docker.go +++ b/beszel/internal/agent/docker.go @@ -60,6 +60,8 @@ func (dm *dockerManager) getDockerStats() ([]*container.Stats, error) { clear(dm.validIds) } + var failedContainters []container.ApiInfo + for _, ctr := range *dm.apiContainerList { ctr.IdShort = ctr.Id[:12] dm.validIds[ctr.IdShort] = struct{}{} @@ -74,18 +76,33 @@ func (dm *dockerManager) getDockerStats() ([]*container.Stats, error) { defer dm.dequeue() err := dm.updateContainerStats(ctr) if err != nil { - dm.deleteContainerStatsSync(ctr.IdShort) - // retry once - err = dm.updateContainerStats(ctr) - if err != nil { - slog.Error("Error getting container stats", "err", err) - } + dm.containerStatsMutex.Lock() + delete(dm.containerStatsMap, ctr.IdShort) + failedContainters = append(failedContainters, ctr) + dm.containerStatsMutex.Unlock() } }() } dm.wg.Wait() + // retry failed containers separately so we can run them in parallel (docker 24 bug) + if len(failedContainters) > 0 { + slog.Debug("Retrying failed containers", "count", len(failedContainters)) + // time.Sleep(time.Millisecond * 1100) + for _, ctr := range failedContainters { + dm.wg.Add(1) + go func() { + defer dm.wg.Done() + err = dm.updateContainerStats(ctr) + if err != nil { + slog.Error("Error getting container stats", "err", err) + } + }() + } + dm.wg.Wait() + } + // populate final stats and remove old / invalid container stats stats := make([]*container.Stats, 0, containersLength) for id, v := range dm.containerStatsMap {