retry failed containers separately so we can run them in parallel (#58)

This commit is contained in:
Henry Dollman
2024-10-21 17:00:13 -04:00
parent 5f4dcb09ea
commit 539c0ccb1d

View File

@@ -60,6 +60,8 @@ func (dm *dockerManager) getDockerStats() ([]*container.Stats, error) {
clear(dm.validIds) clear(dm.validIds)
} }
var failedContainters []container.ApiInfo
for _, ctr := range *dm.apiContainerList { for _, ctr := range *dm.apiContainerList {
ctr.IdShort = ctr.Id[:12] ctr.IdShort = ctr.Id[:12]
dm.validIds[ctr.IdShort] = struct{}{} dm.validIds[ctr.IdShort] = struct{}{}
@@ -74,18 +76,33 @@ func (dm *dockerManager) getDockerStats() ([]*container.Stats, error) {
defer dm.dequeue() defer dm.dequeue()
err := dm.updateContainerStats(ctr) err := dm.updateContainerStats(ctr)
if err != nil { if err != nil {
dm.deleteContainerStatsSync(ctr.IdShort) dm.containerStatsMutex.Lock()
// retry once delete(dm.containerStatsMap, ctr.IdShort)
err = dm.updateContainerStats(ctr) failedContainters = append(failedContainters, ctr)
if err != nil { dm.containerStatsMutex.Unlock()
slog.Error("Error getting container stats", "err", err)
}
} }
}() }()
} }
dm.wg.Wait() dm.wg.Wait()
// retry failed containers separately so we can run them in parallel (docker 24 bug)
if len(failedContainters) > 0 {
slog.Debug("Retrying failed containers", "count", len(failedContainters))
// time.Sleep(time.Millisecond * 1100)
for _, ctr := range failedContainters {
dm.wg.Add(1)
go func() {
defer dm.wg.Done()
err = dm.updateContainerStats(ctr)
if err != nil {
slog.Error("Error getting container stats", "err", err)
}
}()
}
dm.wg.Wait()
}
// populate final stats and remove old / invalid container stats // populate final stats and remove old / invalid container stats
stats := make([]*container.Stats, 0, containersLength) stats := make([]*container.Stats, 0, containersLength)
for id, v := range dm.containerStatsMap { for id, v := range dm.containerStatsMap {