From 18aeb218084d030b0917729d60fd7f9c429d3b53 Mon Sep 17 00:00:00 2001 From: Nikolai Emil Damm Date: Mon, 1 Jun 2026 23:42:01 +0200 Subject: [PATCH] ci: drop fragile event-warnings gate; rely on Flux reconcile status MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The check-event-warnings composite action sampled Warning events after a settle period and failed the deploy if any fired since a marker. It was a heuristic layered on top of reconciliation and produced false positives: a one-shot warning emitted DURING the settle window — e.g. a Flux controller's /readyz blipping "connection refused" while it restarts to pick up the #1661 spread-policy mutation — was flagged even though it self-healed in seconds. It was also redundant. `ksail workload reconcile` already triggers and WAITS for completion, tracking the OCIRepository and each Kustomization individually with a timeout. Every Flux Kustomization here (variables -> infrastructure-controllers -> infrastructure -> apps) is `wait: true`, so a Kustomization only reports Ready once Flux's own health checks pass on everything it applied — Deployments, StatefulSets and HelmReleases alike. That is why a stalled HelmRelease (e.g. OpenBao RetriesExceeded) surfaces as `infrastructure-controllers HealthCheckFailed` and fails the reconcile. Flux's Ready/Stalled conditions are the authoritative, heuristic-free signal, and they catch pre-existing unhealthy state that the event marker never could. Remove the action and its three call sites (ci.yaml system-test + merge_group, cd.yaml). The reconcile step is now the deploy gate; the existing "Diagnose Flux on failure" steps still fire on its failure. 
Co-Authored-By: Claude Opus 4.8 --- .../actions/check-event-warnings/action.yaml | 93 ------------------- .github/workflows/cd.yaml | 8 -- .github/workflows/ci.yaml | 14 --- 3 files changed, 115 deletions(-) delete mode 100644 .github/actions/check-event-warnings/action.yaml diff --git a/.github/actions/check-event-warnings/action.yaml b/.github/actions/check-event-warnings/action.yaml deleted file mode 100644 index 51800d689..000000000 --- a/.github/actions/check-event-warnings/action.yaml +++ /dev/null @@ -1,93 +0,0 @@ -name: Check for new event warnings -description: > - Fail when a cluster is still emitting Warning events after a successful - deployment. Records a marker timestamp, lets the cluster settle, then - inspects Warning events whose most-recent occurrence is at/after the marker — - i.e. warnings still firing at steady state (crash loops, repeated probe - failures, image back-off, etc.). Transient one-shot warnings emitted during - bootstrap are ignored because they fired before the marker. The full Warning - history is always printed for context. - -inputs: - context: - description: kubectl context to target. Empty uses the kubeconfig current-context. - required: false - default: "" - settle-seconds: - description: How long to let the cluster settle before sampling Warning events. - required: false - default: "90" - fail-on-warning: - description: Fail the step when steady-state Warning events are found. - required: false - default: "true" - -runs: - using: composite - steps: - - name: 🔎 Check for new event warnings - shell: bash - env: - KUBECTL_CONTEXT: ${{ inputs.context }} - SETTLE_SECONDS: ${{ inputs.settle-seconds }} - FAIL_ON_WARNING: ${{ inputs.fail-on-warning }} - run: | - set -euo pipefail - - kc=(kubectl) - if [ -n "${KUBECTL_CONTEXT}" ]; then - kc+=(--context "${KUBECTL_CONTEXT}") - fi - - # Record the marker BEFORE settling so the observation window is exactly - # the settle period. 
RFC3339 UTC timestamps compare correctly as strings - # when truncated to second precision (fixed-width prefix). - marker=$(date -u +%Y-%m-%dT%H:%M:%SZ) - echo "Marker: ${marker} — letting the cluster settle for ${SETTLE_SECONDS}s, then sampling Warning events that fired at/after the marker." - sleep "${SETTLE_SECONDS}" - - events_json=$("${kc[@]}" get events -A -o json) - - # Normalise both event shapes (core/v1 Event and events.k8s.io/v1) and - # keep only Warnings whose most-recent occurrence is at/after the marker. - new_warnings=$(printf '%s' "${events_json}" | jq -c --arg marker "${marker}" ' - [ .items[] - | select(.type == "Warning") - | (.series.lastObservedTime // .lastTimestamp // .deprecatedLastTimestamp // .eventTime // .metadata.creationTimestamp) as $ts - | select($ts != null and ($ts[0:19]) >= ($marker[0:19])) - | { ns: (.metadata.namespace // .involvedObject.namespace // .regarding.namespace // "-"), - reason: (.reason // "-"), - obj: "\(.involvedObject.kind // .regarding.kind // "?")/\(.involvedObject.name // .regarding.name // "?")", - msg: ((.message // .note // "") | gsub("\\s+"; " ") | .[0:300]), - count: (.count // .series.count // .deprecatedCount // 1), - ts: $ts } - ] - | sort_by(.ts) - ') - count=$(printf '%s' "${new_warnings}" | jq 'length') - - echo "::group::New Warning events since ${marker} (${count})" - if [ "${count}" -eq 0 ]; then - echo "None — no Warning events fired during the ${SETTLE_SECONDS}s steady-state window. ✅" - else - printf '%s' "${new_warnings}" | jq -r '.[] | "\(.ts) [\(.ns)] \(.obj) \(.reason) (x\(.count)): \(.msg)"' - fi - echo "::endgroup::" - - # Always print the full Warning history (any time) for context — report-only. 
- echo "::group::All Warning events (history, report-only)" - "${kc[@]}" get events -A --field-selector type=Warning --sort-by=.lastTimestamp 2>/dev/null | tail -200 || echo "(none / unavailable)" - echo "::endgroup::" - - if [ "${count}" -gt 0 ]; then - # Per-reason occurrence totals: sum .count rather than count event - # objects, since one crash-looping pod is a single Event with - # count=N. jq's group_by sorts its input internally, so the timestamp - # ordering of new_warnings never splits a reason across groups. - summary=$(printf '%s' "${new_warnings}" | jq -r 'group_by(.reason) | map("\(.[0].reason) ×\(map(.count) | add)") | join(", ")') - if [ "${FAIL_ON_WARNING}" = "true" ]; then - echo "::error title=New event warnings after deploy::${count} distinct Warning event(s) still firing at steady state (by reason, ×=occurrences): ${summary}. See the 'Check for new event warnings' step log." - exit 1 - fi - echo "::warning title=New event warnings after deploy::${count} distinct Warning event(s) still firing at steady state (by reason, ×=occurrences): ${summary}. (report-only)" - fi diff --git a/.github/workflows/cd.yaml b/.github/workflows/cd.yaml index 1d88d0a24..73fc7bc0e 100644 --- a/.github/workflows/cd.yaml +++ b/.github/workflows/cd.yaml @@ -262,14 +262,6 @@ jobs: GHCR_TOKEN: ${{ secrets.GHCR_TOKEN }} HCLOUD_TOKEN: ${{ secrets.HCLOUD_TOKEN }} - # After a successful prod reconcile, require the live cluster to stop - # emitting Warning events. Pinned to the same admin@prod context the rest - # of the prod steps use. 
- - name: 🔎 Require no new event warnings - uses: ./.github/actions/check-event-warnings - with: - context: admin@prod - - name: 🩺 Diagnose Flux on failure if: failure() && steps.reconcile.conclusion == 'failure' env: diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 3380fb0e4..81cca4628 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -83,12 +83,6 @@ jobs: reconcile: "true" delete: "false" - # Gate on a quiet steady state: after a successful reconcile the cluster - # should stop emitting Warning events. Catches crash loops / probe - # failures / back-off that bootstrap warnings would otherwise mask. - - name: 🔎 Require no new event warnings - uses: ./.github/actions/check-event-warnings - - name: 🩺 Diagnose Flux on failure if: failure() run: | @@ -294,14 +288,6 @@ jobs: GHCR_TOKEN: ${{ secrets.GHCR_TOKEN }} HCLOUD_TOKEN: ${{ secrets.HCLOUD_TOKEN }} - # After a successful prod reconcile, require the live cluster to stop - # emitting Warning events. Pinned to the same admin@prod context the rest - # of the prod steps use. - - name: 🔎 Require no new event warnings - uses: ./.github/actions/check-event-warnings - with: - context: admin@prod - - name: 🩺 Diagnose Flux on failure if: failure() && steps.reconcile.outcome == 'failure' env: