From 18aeb218084d030b0917729d60fd7f9c429d3b53 Mon Sep 17 00:00:00 2001
From: Nikolai Emil Damm <nikolaiemildamm@icloud.com>
Date: Mon, 1 Jun 2026 23:42:01 +0200
Subject: [PATCH] ci: drop fragile event-warnings gate; rely on Flux reconcile
 status
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The check-event-warnings composite action recorded a marker timestamp, let
the cluster settle, and failed the deploy if any Warning event fired at or
after the marker. It was a heuristic layered on top of reconciliation and
produced false positives: a one-shot warning emitted DURING the settle
window — e.g. a Flux controller's /readyz blipping "connection refused"
while it restarts to pick up the #1661 spread-policy mutation — was flagged
even though it self-healed in seconds.

It was also redundant. `ksail workload reconcile` already triggers and WAITS
for completion, tracking the OCIRepository and each Kustomization individually
with a timeout. Every Flux Kustomization here (variables ->
infrastructure-controllers -> infrastructure -> apps) is `wait: true`, so a
Kustomization only reports Ready once Flux's own health checks pass on
everything it applied — Deployments, StatefulSets and HelmReleases alike.
That is why a stalled HelmRelease (e.g. OpenBao RetriesExceeded) surfaces as
`infrastructure-controllers HealthCheckFailed` and fails the reconcile. Flux's
Ready/Stalled conditions are the authoritative, heuristic-free signal, and
they catch pre-existing unhealthy state that the event-marker check, which
only looked at events at or after its marker, never could.

Remove the action and its three call sites (ci.yaml system-test + merge_group,
cd.yaml). The reconcile step is now the deploy gate; the existing "Diagnose
Flux on failure" steps still fire on its failure.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 .../actions/check-event-warnings/action.yaml  | 93 -------------------
 .github/workflows/cd.yaml                     |  8 --
 .github/workflows/ci.yaml                     | 14 ---
 3 files changed, 115 deletions(-)
 delete mode 100644 .github/actions/check-event-warnings/action.yaml

diff --git a/.github/actions/check-event-warnings/action.yaml b/.github/actions/check-event-warnings/action.yaml
deleted file mode 100644
index 51800d689..000000000
--- a/.github/actions/check-event-warnings/action.yaml
+++ /dev/null
@@ -1,93 +0,0 @@
-name: Check for new event warnings
-description: >
-  Fail when a cluster is still emitting Warning events after a successful
-  deployment. Records a marker timestamp, lets the cluster settle, then
-  inspects Warning events whose most-recent occurrence is at/after the marker —
-  i.e. warnings still firing at steady state (crash loops, repeated probe
-  failures, image back-off, etc.). Transient one-shot warnings emitted during
-  bootstrap are ignored because they fired before the marker. The full Warning
-  history is always printed for context.
-
-inputs:
-  context:
-    description: kubectl context to target. Empty uses the kubeconfig current-context.
-    required: false
-    default: ""
-  settle-seconds:
-    description: How long to let the cluster settle before sampling Warning events.
-    required: false
-    default: "90"
-  fail-on-warning:
-    description: Fail the step when steady-state Warning events are found.
-    required: false
-    default: "true"
-
-runs:
-  using: composite
-  steps:
-    - name: 🔎 Check for new event warnings
-      shell: bash
-      env:
-        KUBECTL_CONTEXT: ${{ inputs.context }}
-        SETTLE_SECONDS: ${{ inputs.settle-seconds }}
-        FAIL_ON_WARNING: ${{ inputs.fail-on-warning }}
-      run: |
-        set -euo pipefail
-
-        kc=(kubectl)
-        if [ -n "${KUBECTL_CONTEXT}" ]; then
-          kc+=(--context "${KUBECTL_CONTEXT}")
-        fi
-
-        # Record the marker BEFORE settling so the observation window is exactly
-        # the settle period. RFC3339 UTC timestamps compare correctly as strings
-        # when truncated to second precision (fixed-width prefix).
-        marker=$(date -u +%Y-%m-%dT%H:%M:%SZ)
-        echo "Marker: ${marker} — letting the cluster settle for ${SETTLE_SECONDS}s, then sampling Warning events that fired at/after the marker."
-        sleep "${SETTLE_SECONDS}"
-
-        events_json=$("${kc[@]}" get events -A -o json)
-
-        # Normalise both event shapes (core/v1 Event and events.k8s.io/v1) and
-        # keep only Warnings whose most-recent occurrence is at/after the marker.
-        new_warnings=$(printf '%s' "${events_json}" | jq -c --arg marker "${marker}" '
-          [ .items[]
-            | select(.type == "Warning")
-            | (.series.lastObservedTime // .lastTimestamp // .deprecatedLastTimestamp // .eventTime // .metadata.creationTimestamp) as $ts
-            | select($ts != null and ($ts[0:19]) >= ($marker[0:19]))
-            | { ns:     (.metadata.namespace // .involvedObject.namespace // .regarding.namespace // "-"),
-                reason: (.reason // "-"),
-                obj:    "\(.involvedObject.kind // .regarding.kind // "?")/\(.involvedObject.name // .regarding.name // "?")",
-                msg:    ((.message // .note // "") | gsub("\\s+"; " ") | .[0:300]),
-                count:  (.count // .series.count // .deprecatedCount // 1),
-                ts:     $ts }
-          ]
-          | sort_by(.ts)
-        ')
-        count=$(printf '%s' "${new_warnings}" | jq 'length')
-
-        echo "::group::New Warning events since ${marker} (${count})"
-        if [ "${count}" -eq 0 ]; then
-          echo "None — no Warning events fired during the ${SETTLE_SECONDS}s steady-state window. ✅"
-        else
-          printf '%s' "${new_warnings}" | jq -r '.[] | "\(.ts)  [\(.ns)] \(.obj)  \(.reason) (x\(.count)): \(.msg)"'
-        fi
-        echo "::endgroup::"
-
-        # Always print the full Warning history (any time) for context — report-only.
-        echo "::group::All Warning events (history, report-only)"
-        "${kc[@]}" get events -A --field-selector type=Warning --sort-by=.lastTimestamp 2>/dev/null | tail -200 || echo "(none / unavailable)"
-        echo "::endgroup::"
-
-        if [ "${count}" -gt 0 ]; then
-          # Per-reason occurrence totals: sum .count rather than count event
-          # objects, since one crash-looping pod is a single Event with
-          # count=N. jq's group_by sorts its input internally, so the timestamp
-          # ordering of new_warnings never splits a reason across groups.
-          summary=$(printf '%s' "${new_warnings}" | jq -r 'group_by(.reason) | map("\(.[0].reason) ×\(map(.count) | add)") | join(", ")')
-          if [ "${FAIL_ON_WARNING}" = "true" ]; then
-            echo "::error title=New event warnings after deploy::${count} distinct Warning event(s) still firing at steady state (by reason, ×=occurrences): ${summary}. See the 'Check for new event warnings' step log."
-            exit 1
-          fi
-          echo "::warning title=New event warnings after deploy::${count} distinct Warning event(s) still firing at steady state (by reason, ×=occurrences): ${summary}. (report-only)"
-        fi
diff --git a/.github/workflows/cd.yaml b/.github/workflows/cd.yaml
index 1d88d0a24..73fc7bc0e 100644
--- a/.github/workflows/cd.yaml
+++ b/.github/workflows/cd.yaml
@@ -262,14 +262,6 @@ jobs:
           GHCR_TOKEN: ${{ secrets.GHCR_TOKEN }}
           HCLOUD_TOKEN: ${{ secrets.HCLOUD_TOKEN }}
 
-      # After a successful prod reconcile, require the live cluster to stop
-      # emitting Warning events. Pinned to the same admin@prod context the rest
-      # of the prod steps use.
-      - name: 🔎 Require no new event warnings
-        uses: ./.github/actions/check-event-warnings
-        with:
-          context: admin@prod
-
       - name: 🩺 Diagnose Flux on failure
         if: failure() && steps.reconcile.conclusion == 'failure'
         env:
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 3380fb0e4..81cca4628 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -83,12 +83,6 @@ jobs:
           reconcile: "true"
           delete: "false"
 
-      # Gate on a quiet steady state: after a successful reconcile the cluster
-      # should stop emitting Warning events. Catches crash loops / probe
-      # failures / back-off that bootstrap warnings would otherwise mask.
-      - name: 🔎 Require no new event warnings
-        uses: ./.github/actions/check-event-warnings
-
       - name: 🩺 Diagnose Flux on failure
         if: failure()
         run: |
@@ -294,14 +288,6 @@ jobs:
           GHCR_TOKEN: ${{ secrets.GHCR_TOKEN }}
           HCLOUD_TOKEN: ${{ secrets.HCLOUD_TOKEN }}
 
-      # After a successful prod reconcile, require the live cluster to stop
-      # emitting Warning events. Pinned to the same admin@prod context the rest
-      # of the prod steps use.
-      - name: 🔎 Require no new event warnings
-        uses: ./.github/actions/check-event-warnings
-        with:
-          context: admin@prod
-
       - name: 🩺 Diagnose Flux on failure
         if: failure() && steps.reconcile.outcome == 'failure'
         env: