diff --git a/.github/workflows/41-railway-setup.yml b/.github/workflows/41-railway-setup.yml
index 80069152b2..a0880a4f48 100644
--- a/.github/workflows/41-railway-setup.yml
+++ b/.github/workflows/41-railway-setup.yml
@@ -73,10 +73,45 @@ jobs:
chmod +x hosting/railway/oss/scripts/*.sh
# shellcheck source=/dev/null
source hosting/railway/oss/scripts/preview-resolve-env.sh
- hosting/railway/oss/scripts/bootstrap.sh
+
+ # Persist the full bootstrap output so the "Upload setup log" step can
+ # publish it as an artifact, regardless of live-log truncation.
+ log_file="${GITHUB_WORKSPACE:-$PWD}/railway-setup-${PR_NUMBER:-unknown}.log"
+
+ set +e
+ hosting/railway/oss/scripts/bootstrap.sh 2>&1 | tee "$log_file"
+ setup_status=${PIPESTATUS[0]}
+ set -e
+
echo "project_name=${RAILWAY_PROJECT_NAME}" >> "$GITHUB_OUTPUT"
echo "environment_name=${RAILWAY_ENVIRONMENT_NAME}" >> "$GITHUB_OUTPUT"
+ if [ "$setup_status" -ne 0 ]; then
+ {
+ echo "### Railway Preview Setup — Failed"
+ echo
+ echo "Setup log (last 100 lines)
"
+ echo
+ echo '```'
+ tail -n 100 "$log_file" 2>/dev/null
+ echo '```'
+ echo " "
+ } >> "$GITHUB_STEP_SUMMARY"
+ exit "$setup_status"
+ fi
+
+ - name: Upload setup log
+ if: always()
+ # Diagnostics only: a failed/duplicate upload must never fail the job.
+ continue-on-error: true
+ uses: actions/upload-artifact@v4
+ with:
+ name: railway-setup-log-${{ inputs.pr_number || 'unknown' }} # same fallback as the log file name
+ path: railway-setup-*.log
+ if-no-files-found: ignore
+ overwrite: true
+ retention-days: 7
+
- name: Summary
run: |
{
diff --git a/.github/workflows/43-railway-deploy.yml b/.github/workflows/43-railway-deploy.yml
index 6f2e55e403..2f25f44738 100644
--- a/.github/workflows/43-railway-deploy.yml
+++ b/.github/workflows/43-railway-deploy.yml
@@ -112,11 +112,10 @@ jobs:
# shellcheck source=/dev/null
source hosting/railway/oss/scripts/preview-resolve-env.sh
- log_file="$(mktemp)"
- cleanup() {
- rm -f "$log_file"
- }
- trap cleanup EXIT
+ # Keep the log in the workspace so the "Upload deploy log" step can
+ # publish it as an artifact. GitHub's live log can truncate streamed
+ # output, so we always persist a full copy.
+ log_file="${GITHUB_WORKSPACE:-$PWD}/railway-deploy-${PR_NUMBER:-unknown}.log"
project="$RAILWAY_PROJECT_NAME"
environment_name="$RAILWAY_ENVIRONMENT_NAME"
@@ -177,13 +176,58 @@ jobs:
echo "environment_name=${environment_name}" >> "$GITHUB_OUTPUT"
echo "railway_logs_url=${railway_logs_url}" >> "$GITHUB_OUTPUT"
- trap - EXIT
- cleanup
+ # Best-effort diagnostics; never let these change the step outcome.
+ set +e
+ # On failure, pull the tail of the key services' Railway logs into
+ # this job so the root cause (e.g. a Postgres crash-loop) is visible
+ # here instead of only in the Railway dashboard.
+ if [ "$deploy_failed" = "true" ]; then
+ # Tee into the persisted log so the uploaded artifact and the
+ # step-summary tail include the Railway service logs too, not just
+ # the (possibly truncated) live Actions log.
+ dump_railway_logs 2>&1 | tee -a "$log_file"
+ fi
+
+ status_label="Deployed"
+ [ "$deploy_failed" = "true" ] && status_label="Failed"
+ {
+ echo "### Railway Preview Deploy"
+ echo
+ echo "| Item | Value |"
+ echo "| --- | --- |"
+ echo "| PR | \`${PR_NUMBER}\` |"
+ echo "| Image tag | \`${IMAGE_TAG}\` |"
+ echo "| Status | ${status_label} |"
+ [ -n "$url" ] && echo "| Preview URL | ${url} |"
+ [ -n "$railway_logs_url" ] && echo "| Railway logs | [Open logs](${railway_logs_url}) |"
+ if [ "$deploy_failed" = "true" ]; then
+ echo
+ echo "Deploy log (last 100 lines)
"
+ echo
+ echo '```'
+ tail -n 100 "$log_file" 2>/dev/null
+ echo '```'
+ echo " "
+ fi
+ } >> "$GITHUB_STEP_SUMMARY"
+ set -e
if [ "$deploy_failed" = "true" ]; then
exit 1
fi
+ - name: Upload deploy log
+ if: always()
+ # Diagnostics only: a failed/duplicate upload must never fail the job.
+ continue-on-error: true
+ uses: actions/upload-artifact@v4
+ with:
+ name: railway-deploy-log-${{ inputs.pr_number || 'unknown' }} # same fallback as the log file name
+ path: railway-deploy-*.log
+ if-no-files-found: ignore
+ overwrite: true
+ retention-days: 7
+
- name: Post preview URL as PR comment
if: inputs.pr_number != '' && steps.deploy.outputs.preview_url != ''
uses: actions/github-script@v7
diff --git a/hosting/railway/oss/scripts/bootstrap.sh b/hosting/railway/oss/scripts/bootstrap.sh
index cec13fca25..4fce3da2ae 100755
--- a/hosting/railway/oss/scripts/bootstrap.sh
+++ b/hosting/railway/oss/scripts/bootstrap.sh
@@ -7,6 +7,8 @@ ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../../.." && pwd)"
# shellcheck source=lib.sh
source "$(dirname "${BASH_SOURCE[0]}")/lib.sh"
+install_error_trap
+
PROJECT_NAME="${RAILWAY_PROJECT_NAME:-agenta-oss-railway}"
ENV_NAME="${RAILWAY_ENVIRONMENT_NAME:-staging}"
SOURCE_COMPOSE_FILE="${RAILWAY_SOURCE_COMPOSE_FILE:-$(railway_source_compose_file "$ROOT_DIR")}"
diff --git a/hosting/railway/oss/scripts/configure.sh b/hosting/railway/oss/scripts/configure.sh
index 5285d8e6f0..5a441cfe57 100755
--- a/hosting/railway/oss/scripts/configure.sh
+++ b/hosting/railway/oss/scripts/configure.sh
@@ -5,6 +5,8 @@ set -euo pipefail
# shellcheck source=lib.sh
source "$(dirname "${BASH_SOURCE[0]}")/lib.sh"
+install_error_trap
+
PROJECT_NAME="${RAILWAY_PROJECT_NAME:-agenta-oss-railway}"
ENV_NAME="${RAILWAY_ENVIRONMENT_NAME:-staging}"
SKIP_UNSETS="${CONFIGURE_SKIP_UNSETS:-false}"
diff --git a/hosting/railway/oss/scripts/deploy-from-images.sh b/hosting/railway/oss/scripts/deploy-from-images.sh
index 81a79ff656..400869f4c2 100755
--- a/hosting/railway/oss/scripts/deploy-from-images.sh
+++ b/hosting/railway/oss/scripts/deploy-from-images.sh
@@ -8,6 +8,8 @@ TMP_DIR="$(mktemp -d)"
# shellcheck source=lib.sh
source "$(dirname "${BASH_SOURCE[0]}")/lib.sh"
+install_error_trap
+
cleanup() {
rm -rf "$TMP_DIR"
}
diff --git a/hosting/railway/oss/scripts/lib.sh b/hosting/railway/oss/scripts/lib.sh
index 73fbcb13b9..88caa39b22 100755
--- a/hosting/railway/oss/scripts/lib.sh
+++ b/hosting/railway/oss/scripts/lib.sh
@@ -91,19 +91,47 @@ require_compose_redis_image() {
printf "%s" "$image"
}
-# railway_call: Run a railway CLI command, retrying on rate-limit responses.
+ # _railway_redact: Mask secret values before they are logged. Reads stdin and
+ # writes redacted text to stdout. Masks the value of any KEY=VALUE token whose
+ # (uppercase) key contains PASSWORD/TOKEN/SECRET/KEY, plus the password segment
+ # of any scheme://[user]:password@host URL; the user may be empty or
+ # percent-encoded (redis://:pw@host). Keeps failure output safe to log and upload.
#
- # Railway returns "You are being ratelimited. Please try again later" on
- # HTTP 429. This wrapper detects that message and backs off with exponential
- # backoff before retrying.
+ # Applied only to failure/diagnostic output, never to the success path that
+ # callers parse (e.g. `variable list -k` results).
+ _railway_redact() {
+ sed -E \
+ -e 's/([A-Z0-9_]*(PASSWORD|TOKEN|SECRET|KEY)[A-Z0-9_]*[[:space:]]*=[[:space:]]*)[^[:space:]]+/\1***REDACTED***/g' \
+ -e 's#(://[A-Za-z0-9._~%-]*:)[^@[:space:]/]+@#\1***REDACTED***@#g'
+ }
+
+# railway_call: Run a railway CLI command with smart, context-aware retries.
#
-# Usage:
-# railway_call [railway args...]
+# Railway's GraphQL API (backboard.railway.com) intermittently rate-limits
+# (HTTP 429) and, far more often for us, times out write mutations such as
+# `variable set` ("operation timed out" / "error sending request"). This
+# wrapper retries the cases that are safe + useful to retry, and fails fast on
+# the rest:
#
-# Environment variables:
-# RAILWAY_RETRY_MAX Max retry attempts (default: 5)
-# RAILWAY_RETRY_DELAY Initial delay in seconds (default: 10)
+# - rate-limit -> always retried. A 429 is a clean rejection: the
+# request was not processed, so a retry cannot
+# duplicate work.
+# - transient network -> retried ONLY for idempotent commands. A timed-out
+# (timeout/5xx/reset) mutation may have already succeeded server-side, so
+# we do NOT blind-retry non-idempotent creates
+# (`init`, `add`, `environment new`, `volume add`):
+# that risks duplicate projects/services/volumes.
+# `variable set` (our actual offender) is an
+# idempotent upsert, so it is retried.
+# - anything else -> not retried. Deterministic errors ("not found",
+# "unauthorized", "No service linked") fail fast
+# instead of burning the backoff budget.
#
+# Backoff is exponential. Failure output is redacted (see _railway_redact).
+#
+# Environment variables:
+# RAILWAY_RETRY_MAX Max attempts (default: 5)
+# RAILWAY_RETRY_DELAY Initial backoff delay in seconds (default: 10)
railway_call() {
local max_attempts="${RAILWAY_RETRY_MAX:-5}"
local delay="${RAILWAY_RETRY_DELAY:-10}"
@@ -111,25 +139,136 @@ railway_call() {
local output
local exit_code
+ # Always attempt at least once: RAILWAY_RETRY_MAX < 1 (or non-numeric) would
+ # otherwise skip the loop and return success without ever running railway.
+ [ "$max_attempts" -ge 1 ] 2>/dev/null || max_attempts=1
+ # Is this a non-idempotent create? If so, an ambiguous timeout must not be
+ # blind-retried (the resource may already exist). Rate-limit retries are
+ # still safe because a 429 is rejected before any work is done.
+ local idempotent=1
+ case "$1" in
+ init | add) idempotent=0 ;;
+ environment) [ "${2:-}" = "new" ] && idempotent=0 ;;
+ volume) [ "${2:-}" = "add" ] && idempotent=0 ;;
+ esac
+
while [ "$attempt" -le "$max_attempts" ]; do
- output="$(railway "$@" 2>&1)" && exit_code=0 || exit_code=$?
-
- if printf "%s" "$output" | grep -qi "ratelimit\|rate.limit\|rate limit"; then
- if [ "$attempt" -eq "$max_attempts" ]; then
- printf "railway %s: rate-limited after %d attempts\n" "$*" "$max_attempts" >&2
- printf "%s\n" "$output" >&2
- return 1
- fi
- printf "railway %s: rate-limited, retrying in %ds (attempt %d/%d)\n" \
- "$1" "$delay" "$attempt" "$max_attempts" >&2
+ # `set +Ee` inside the subshell so a non-zero exit from `railway`
+ # neither trips errexit nor fires an inherited ERR trap. This wrapper
+ # handles retries and reports failures itself.
+ output="$(set +Ee; railway "$@" 2>&1)" && exit_code=0 || exit_code=$?
+
+ # Success: emit on stdout so callers can capture the output.
+ if [ "$exit_code" -eq 0 ]; then
+ printf "%s\n" "$output"
+ return 0
+ fi
+
+ # Classify the failure to decide whether a retry is safe + useful.
+ local kind="error"
+ if printf "%s" "$output" | grep -qiE "ratelimit|rate.?limit"; then
+ kind="rate-limit"
+ elif printf "%s" "$output" | grep -qiE "timed out|error sending request|failed to fetch|error trying to connect|connection (reset|refused|closed|error)|temporarily unavailable|service unavailable|bad gateway|gateway time-?out|broken pipe|unexpected eof|tls handshake"; then
+ kind="transient"
+ fi
+
+ local retryable=0
+ case "$kind" in
+ rate-limit) retryable=1 ;;
+ transient) [ "$idempotent" -eq 1 ] && retryable=1 ;;
+ esac
+
+ if [ "$retryable" -eq 1 ] && [ "$attempt" -lt "$max_attempts" ]; then
+ printf "railway %s: %s, retrying in %ds (attempt %d/%d)\n" \
+ "$1" "$kind" "$delay" "$attempt" "$max_attempts" >&2
sleep "$delay"
delay=$((delay * 2))
attempt=$((attempt + 1))
continue
fi
- # Not a rate-limit error. Print output and return the original exit code.
- printf "%s\n" "$output"
+ # Give up: not retryable, attempts exhausted, or an ambiguous timeout
+ # on a non-idempotent create. Surface the (redacted) railway error.
+ if [ "$retryable" -eq 1 ]; then
+ printf "railway %s: %s — giving up after %d attempts\n" \
+ "$1" "$kind" "$attempt" >&2
+ elif [ "$kind" = "transient" ] && [ "$idempotent" -eq 0 ]; then
+ printf "railway %s: %s on non-idempotent command — not retrying (may have partially succeeded)\n" \
+ "$1" "$kind" >&2
+ fi
+ [ -n "$output" ] && printf "%s\n" "$output" | _railway_redact >&2
return "$exit_code"
done
}
+
+# install_error_trap: Turn a bare "exit 1" into a diagnostic that names the
+# failing command and prints a short call stack. Call once near the top of a
+# script, after sourcing this file. Enables errtrace (set -E) so the trap also
+# fires for failures inside functions.
+#
+# railway_call disables errtrace inside its own command substitution, so
+# tolerated failures (callers using `|| true`) do not reach this trap.
+ _railway_on_error() {
+ # The same failure can unwind through several stack frames; report once.
+ [ -n "${_RAILWAY_ERR_HANDLED:-}" ] && return 0
+ _RAILWAY_ERR_HANDLED=1
+
+ local exit_code="$1" cmd annotation i
+ # Redact secrets: $BASH_COMMAND can contain secret-bearing args (e.g.
+ # `railway variable set ... AGENTA_AUTH_KEY=...`).
+ cmd="$(printf '%s' "$2" | _railway_redact)"
+ printf '\n[railway][FAIL] command failed (exit %s): %s\n' "$exit_code" "$cmd" >&2
+ for ((i = 1; i < ${#FUNCNAME[@]}; i++)); do
+ printf ' at %s (%s:%s)\n' \
+ "${FUNCNAME[i]}" "${BASH_SOURCE[i]}" "${BASH_LINENO[i - 1]}" >&2
+ done
+
+ # Surface a GitHub Actions annotation when running in CI. Workflow commands
+ # are single-line, so %, CR and LF in the message are percent-escaped.
+ if [ "${GITHUB_ACTIONS:-}" = "true" ]; then
+ annotation=${cmd//%/%25}
+ annotation=${annotation//$'\r'/%0D}
+ annotation=${annotation//$'\n'/%0A}
+ printf '::error::[railway] command failed (exit %s): %s\n' "$exit_code" "$annotation" >&2
+ fi
+ }
+
+ install_error_trap() {
+ trap '_railway_on_error "$?" "$BASH_COMMAND"' ERR
+ set -o errtrace
+ }
+
+# dump_railway_logs: Best-effort snapshot of Railway service logs, for CI
+# debugging. Uses --lines (non-streaming) and --latest (works even when a
+# deployment failed or is crash-looping), wrapped in a hard timeout so it can
+# never hang or fail the caller. Requires a linked project/environment.
+#
+# Usage: dump_railway_logs [service ...] (defaults to the core infra set)
+#
+# Environment variables:
+# RAILWAY_LOG_TAIL Lines to fetch per service (default: 50)
+# RAILWAY_LOG_TIMEOUT Per-service timeout in seconds (default: 30)
+ dump_railway_logs() {
+ # Default to the core infra set when the caller names no services.
+ local -a targets=(Postgres redis alembic api supertokens web)
+ if [ "$#" -gt 0 ]; then
+ targets=("$@")
+ fi
+
+ local tail_count="${RAILWAY_LOG_TAIL:-50}" max_wait="${RAILWAY_LOG_TIMEOUT:-30}"
+ local name captured
+
+ for name in "${targets[@]}"; do
+ printf '\n===== railway logs (last %s lines): %s =====\n' "$tail_count" "$name" >&2
+ # Fetch before redacting: the branch below must depend on the status of
+ # railway/timeout, not on the redaction filter. Logs may embed DB URIs.
+ if ! captured="$(timeout "$max_wait" railway logs --service "$name" --lines "$tail_count" --latest 2>&1)"; then
+ printf '(no logs available for service: %s)\n' "$name" >&2
+ continue
+ fi
+ printf '%s\n' "$captured" | _railway_redact >&2
+ done
+
+ # Best-effort: never report failure to the caller.
+ return 0
+ }
diff --git a/web/packages/agenta-playground/src/state/controllers/playgroundController.ts b/web/packages/agenta-playground/src/state/controllers/playgroundController.ts
index 7ae637c770..85f25427b3 100644
--- a/web/packages/agenta-playground/src/state/controllers/playgroundController.ts
+++ b/web/packages/agenta-playground/src/state/controllers/playgroundController.ts
@@ -82,6 +82,7 @@ import {
newTestcaseCountAtom,
newTestcaseDataHashAtom,
} from "../execution/selectors"
+import {pruneDanglingConnections} from "../helpers/connectionGraph"
import {extractAndLoadChatMessagesAtom} from "../helpers/extractAndLoadChatMessages"
import {normalizeTestcaseRowsForLoad} from "../helpers/testcaseRowNormalization"
import type {EntitySelection, PlaygroundNode, RunnableType} from "../types"
@@ -412,8 +413,17 @@ const changePrimaryNodeAtom = atom(null, (get, set, entity: EntitySelection) =>
label: entity.label,
}
- set(playgroundNodesAtom, [updatedNode, ...nodes.slice(1)])
- set(outputConnectionsAtom, [])
+ const nextNodes = [updatedNode, ...nodes.slice(1)]
+ set(playgroundNodesAtom, nextNodes)
+
+ // Preserve downstream connections instead of clearing them. The primary
+ // node is updated in place, so its `id` is unchanged and any chain sourced
+ // from it (e.g. app → evaluator) stays valid. Clearing unconditionally
+ // orphaned the downstream evaluator on app-revision re-selection:
+ // connectDownstreamNode then no-ops because the evaluator node is still
+ // present, so the edge was never recreated and the evaluator silently
+ // stopped running. Only drop connections whose endpoints no longer exist.
+ set(outputConnectionsAtom, pruneDanglingConnections(get(outputConnectionsAtom), nextNodes))
// Update local testset name if not connected to a remote testset
const currentTestset = get(connectedTestsetAtom)
diff --git a/web/packages/agenta-playground/src/state/helpers/connectionGraph.ts b/web/packages/agenta-playground/src/state/helpers/connectionGraph.ts
new file mode 100644
index 0000000000..ded7736f0a
--- /dev/null
+++ b/web/packages/agenta-playground/src/state/helpers/connectionGraph.ts
@@ -0,0 +1,25 @@
+/**
+ * Connection graph helpers
+ *
+ * Pure functions for reasoning about the playground DAG's output connections.
+ */
+
+import type {OutputConnection, PlaygroundNode} from "../types"
+
+/**
+ * Keep only connections whose source and target both reference existing nodes.
+ *
+ * Used when the primary node is swapped in place (its `id` is preserved), so
+ * downstream chains sourced from it stay valid and must not be wiped. Any
+ * connection pointing at a node that no longer exists is dropped.
+ */
+export function pruneDanglingConnections(
+ connections: OutputConnection[],
+ nodes: PlaygroundNode[],
+): OutputConnection[] {
+ const nodeIds = new Set(nodes.map((node) => node.id))
+ return connections.filter(
+ (connection) =>
+ nodeIds.has(connection.sourceNodeId) && nodeIds.has(connection.targetNodeId),
+ )
+}
diff --git a/web/packages/agenta-playground/tests/unit/connectionGraph.test.ts b/web/packages/agenta-playground/tests/unit/connectionGraph.test.ts
new file mode 100644
index 0000000000..b26b50d4b9
--- /dev/null
+++ b/web/packages/agenta-playground/tests/unit/connectionGraph.test.ts
@@ -0,0 +1,55 @@
+import {describe, expect, it} from "vitest"
+
+import {pruneDanglingConnections} from "../../src/state/helpers/connectionGraph"
+import type {OutputConnection, PlaygroundNode} from "../../src/state/types"
+
+ function node(id: string, depth: number): PlaygroundNode {
+ return {id, entityType: "workflow", entityId: "entity-" + id, label: id, depth}
+ }
+
+ function connection(id: string, sourceNodeId: string, targetNodeId: string): OutputConnection {
+ const endpoints = {sourceNodeId, targetNodeId}
+ return {
+ id,
+ ...endpoints,
+ sourceOutputKey: "output",
+ inputMappings: [],
+ parallel: true,
+ }
+ }
+
+ describe("pruneDanglingConnections", () => {
+ // Regression: changePrimaryNode replaces the primary node in place, keeping
+ // its node id, so an app → evaluator edge must survive an app-revision
+ // change. The controller used to clear every connection at this point,
+ // which orphaned the evaluator and made it silently stop running.
+ it("preserves the app → evaluator edge when the primary node is swapped in place", () => {
+ const nodes = [node("N0", 0), node("N1", 1)]
+ const connections = [connection("conn-1", "N0", "N1")]
+
+ expect(pruneDanglingConnections(connections, nodes)).toEqual(connections)
+ })
+
+ it("drops connections whose source node no longer exists", () => {
+ const nodes = [node("N0", 0), node("N1", 1)]
+ const connections = [
+ connection("conn-1", "N0", "N1"),
+ connection("conn-stale", "GONE", "N1"),
+ ]
+
+ expect(pruneDanglingConnections(connections, nodes)).toEqual([
+ connection("conn-1", "N0", "N1"),
+ ])
+ })
+
+ it("drops connections whose target node no longer exists", () => {
+ const nodes = [node("N0", 0)]
+ const connections = [connection("conn-1", "N0", "N1")]
+
+ expect(pruneDanglingConnections(connections, nodes)).toEqual([])
+ })
+
+ it("returns an empty array when there are no connections", () => {
+ expect(pruneDanglingConnections([], [node("N0", 0)])).toEqual([])
+ })
+ })