diff --git a/.github/workflows/41-railway-setup.yml b/.github/workflows/41-railway-setup.yml
index 80069152b2..a0880a4f48 100644
--- a/.github/workflows/41-railway-setup.yml
+++ b/.github/workflows/41-railway-setup.yml
@@ -73,10 +73,45 @@ jobs:
           chmod +x hosting/railway/oss/scripts/*.sh
           # shellcheck source=/dev/null
           source hosting/railway/oss/scripts/preview-resolve-env.sh
-          hosting/railway/oss/scripts/bootstrap.sh
+
+          # Persist the full bootstrap output so the "Upload setup log" step can
+          # publish it as an artifact, regardless of live-log truncation.
+          log_file="${GITHUB_WORKSPACE:-$PWD}/railway-setup-${PR_NUMBER:-unknown}.log"
+
+          set +e
+          hosting/railway/oss/scripts/bootstrap.sh 2>&1 | tee "$log_file"
+          setup_status=${PIPESTATUS[0]}
+          set -e
+
           echo "project_name=${RAILWAY_PROJECT_NAME}" >> "$GITHUB_OUTPUT"
           echo "environment_name=${RAILWAY_ENVIRONMENT_NAME}" >> "$GITHUB_OUTPUT"
 
+          if [ "$setup_status" -ne 0 ]; then
+            {
+              echo "### Railway Preview Setup — Failed"
+              echo
+              echo "<details><summary>Setup log (last 100 lines)</summary>"
+              echo
+              echo '```'
+              tail -n 100 "$log_file" 2>/dev/null
+              echo '```'
+              echo "</details>"
+            } >> "$GITHUB_STEP_SUMMARY"
+            exit "$setup_status"
+          fi
+
+      - name: Upload setup log
+        if: always()
+        # Diagnostics only: a failed/duplicate upload must never fail the job.
+        continue-on-error: true
+        uses: actions/upload-artifact@v4
+        with:
+          name: railway-setup-log-${{ inputs.pr_number }}
+          path: railway-setup-*.log
+          if-no-files-found: ignore
+          overwrite: true
+          retention-days: 7
+
       - name: Summary
         run: |
           {
diff --git a/.github/workflows/43-railway-deploy.yml b/.github/workflows/43-railway-deploy.yml
index 6f2e55e403..2f25f44738 100644
--- a/.github/workflows/43-railway-deploy.yml
+++ b/.github/workflows/43-railway-deploy.yml
@@ -112,11 +112,10 @@ jobs:
           # shellcheck source=/dev/null
           source hosting/railway/oss/scripts/preview-resolve-env.sh
 
-          log_file="$(mktemp)"
-          cleanup() {
-            rm -f "$log_file"
-          }
-          trap cleanup EXIT
+          # Keep the log in the workspace so the "Upload deploy log" step can
+          # publish it as an artifact. GitHub's live log can truncate streamed
+          # output, so we always persist a full copy.
+          log_file="${GITHUB_WORKSPACE:-$PWD}/railway-deploy-${PR_NUMBER:-unknown}.log"
 
           project="$RAILWAY_PROJECT_NAME"
           environment_name="$RAILWAY_ENVIRONMENT_NAME"
@@ -177,13 +176,58 @@ jobs:
           echo "environment_name=${environment_name}" >> "$GITHUB_OUTPUT"
           echo "railway_logs_url=${railway_logs_url}" >> "$GITHUB_OUTPUT"
 
-          trap - EXIT
-          cleanup
+          # Best-effort diagnostics; never let these change the step outcome.
+          set +e
+          # On failure, pull the tail of the key services' Railway logs into
+          # this job so the root cause (e.g. a Postgres crash-loop) is visible
+          # here instead of only in the Railway dashboard.
+          if [ "$deploy_failed" = "true" ]; then
+            # Tee into the persisted log so the uploaded artifact and the
+            # step-summary tail include the Railway service logs too, not just
+            # the (possibly truncated) live Actions log.
+            dump_railway_logs 2>&1 | tee -a "$log_file"
+          fi
+
+          status_label="Deployed"
+          [ "$deploy_failed" = "true" ] && status_label="Failed"
+          {
+            echo "### Railway Preview Deploy"
+            echo
+            echo "| Item | Value |"
+            echo "| --- | --- |"
+            echo "| PR | \`${PR_NUMBER}\` |"
+            echo "| Image tag | \`${IMAGE_TAG}\` |"
+            echo "| Status | ${status_label} |"
+            [ -n "$url" ] && echo "| Preview URL | ${url} |"
+            [ -n "$railway_logs_url" ] && echo "| Railway logs | [Open logs](${railway_logs_url}) |"
+            if [ "$deploy_failed" = "true" ]; then
+              echo
+              echo "<details><summary>Deploy log (last 100 lines)</summary>"
+              echo
+              echo '```'
+              tail -n 100 "$log_file" 2>/dev/null
+              echo '```'
+              echo "</details>"
+            fi
+          } >> "$GITHUB_STEP_SUMMARY"
+          set -e
 
           if [ "$deploy_failed" = "true" ]; then
             exit 1
           fi
 
+      - name: Upload deploy log
+        if: always()
+        # Diagnostics only: a failed/duplicate upload must never fail the job.
+        continue-on-error: true
+        uses: actions/upload-artifact@v4
+        with:
+          name: railway-deploy-log-${{ inputs.pr_number }}
+          path: railway-deploy-*.log
+          if-no-files-found: ignore
+          overwrite: true
+          retention-days: 7
+
       - name: Post preview URL as PR comment
         if: inputs.pr_number != '' && steps.deploy.outputs.preview_url != ''
         uses: actions/github-script@v7
diff --git a/hosting/railway/oss/scripts/bootstrap.sh b/hosting/railway/oss/scripts/bootstrap.sh
index cec13fca25..4fce3da2ae 100755
--- a/hosting/railway/oss/scripts/bootstrap.sh
+++ b/hosting/railway/oss/scripts/bootstrap.sh
@@ -7,6 +7,8 @@ ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../../.." && pwd)"
 # shellcheck source=lib.sh
 source "$(dirname "${BASH_SOURCE[0]}")/lib.sh"
 
+install_error_trap
+
 PROJECT_NAME="${RAILWAY_PROJECT_NAME:-agenta-oss-railway}"
 ENV_NAME="${RAILWAY_ENVIRONMENT_NAME:-staging}"
 SOURCE_COMPOSE_FILE="${RAILWAY_SOURCE_COMPOSE_FILE:-$(railway_source_compose_file "$ROOT_DIR")}"
diff --git a/hosting/railway/oss/scripts/configure.sh b/hosting/railway/oss/scripts/configure.sh
index 5285d8e6f0..5a441cfe57 100755
--- a/hosting/railway/oss/scripts/configure.sh
+++ b/hosting/railway/oss/scripts/configure.sh
@@ -5,6 +5,8 @@ set -euo pipefail
 # shellcheck source=lib.sh
 source "$(dirname "${BASH_SOURCE[0]}")/lib.sh"
 
+install_error_trap
+
 PROJECT_NAME="${RAILWAY_PROJECT_NAME:-agenta-oss-railway}"
 ENV_NAME="${RAILWAY_ENVIRONMENT_NAME:-staging}"
 SKIP_UNSETS="${CONFIGURE_SKIP_UNSETS:-false}"
diff --git a/hosting/railway/oss/scripts/deploy-from-images.sh b/hosting/railway/oss/scripts/deploy-from-images.sh
index 81a79ff656..400869f4c2 100755
--- a/hosting/railway/oss/scripts/deploy-from-images.sh
+++ b/hosting/railway/oss/scripts/deploy-from-images.sh
@@ -8,6 +8,8 @@ TMP_DIR="$(mktemp -d)"
 # shellcheck source=lib.sh
 source "$(dirname "${BASH_SOURCE[0]}")/lib.sh"
 
+install_error_trap
+
 cleanup() {
     rm -rf "$TMP_DIR"
 }
diff --git a/hosting/railway/oss/scripts/lib.sh b/hosting/railway/oss/scripts/lib.sh
index 73fbcb13b9..88caa39b22 100755
--- a/hosting/railway/oss/scripts/lib.sh
+++ b/hosting/railway/oss/scripts/lib.sh
@@ -91,19 +91,47 @@ require_compose_redis_image() {
     printf "%s" "$image"
 }
 
-# railway_call: Run a railway CLI command, retrying on rate-limit responses.
+# _railway_redact: Mask secret values before they are logged. Reads stdin and
+# writes redacted text to stdout. Masks the value of any KEY=VALUE token whose
+# (uppercase) key contains PASSWORD/TOKEN/SECRET/KEY, plus the password segment
+# of any scheme://user:password@host URL. Keeps diagnostic output safe to print
+# and to upload as a CI artifact even when a failing command echoes its args.
 #
-# Railway returns "You are being ratelimited. Please try again later" on
-# HTTP 429. This wrapper detects that message and backs off with exponential
-# backoff before retrying.
+# Applied only to failure/diagnostic output, never to the success path that
+# callers parse (e.g. `variable list -k` results).
+_railway_redact() {
+    sed -E \
+        -e 's/([A-Z0-9_]*(PASSWORD|TOKEN|SECRET|KEY)[A-Z0-9_]*[[:space:]]*=[[:space:]]*)[^[:space:]]+/\1***REDACTED***/g' \
+        -e 's#(://[A-Za-z0-9._~-]+:)[^@[:space:]/]+@#\1***REDACTED***@#g'
+}
+
+# railway_call: Run a railway CLI command with smart, context-aware retries.
 #
-# Usage:
-#   railway_call [railway args...]
+# Railway's GraphQL API (backboard.railway.com) intermittently rate-limits
+# (HTTP 429) and, far more often for us, times out write mutations such as
+# `variable set` ("operation timed out" / "error sending request"). This
+# wrapper retries the cases that are safe + useful to retry, and fails fast on
+# the rest:
 #
-# Environment variables:
-#   RAILWAY_RETRY_MAX    Max retry attempts (default: 5)
-#   RAILWAY_RETRY_DELAY  Initial delay in seconds (default: 10)
+#   - rate-limit        -> always retried. A 429 is a clean rejection: the
+#                          request was not processed, so a retry cannot
+#                          duplicate work.
+#   - transient network -> retried ONLY for idempotent commands. A timed-out
+#     (timeout/5xx/reset)  mutation may have already succeeded server-side, so
+#                          we do NOT blind-retry non-idempotent creates
+#                          (`init`, `add`, `environment new`, `volume add`):
+#                          that risks duplicate projects/services/volumes.
+#                          `variable set` (our actual offender) is an
+#                          idempotent upsert, so it is retried.
+#   - anything else     -> not retried. Deterministic errors ("not found",
+#                          "unauthorized", "No service linked") fail fast
+#                          instead of burning the backoff budget.
 #
+# Backoff is exponential. Failure output is redacted (see _railway_redact).
+#
+# Environment variables:
+#   RAILWAY_RETRY_MAX    Max attempts (default: 5)
+#   RAILWAY_RETRY_DELAY  Initial backoff delay in seconds (default: 10)
 railway_call() {
     local max_attempts="${RAILWAY_RETRY_MAX:-5}"
     local delay="${RAILWAY_RETRY_DELAY:-10}"
@@ -111,25 +139,133 @@ railway_call() {
     local output
     local exit_code
 
+    # Is this a non-idempotent create? If so, an ambiguous timeout must not be
+    # blind-retried (the resource may already exist). Rate-limit retries are
+    # still safe because a 429 is rejected before any work is done.
+    local idempotent=1
+    case "$1" in
+    init | add) idempotent=0 ;;
+    environment) [ "${2:-}" = "new" ] && idempotent=0 ;;
+    volume) [ "${2:-}" = "add" ] && idempotent=0 ;;
+    esac
+
     while [ "$attempt" -le "$max_attempts" ]; do
-        output="$(railway "$@" 2>&1)" && exit_code=0 || exit_code=$?
-
-        if printf "%s" "$output" | grep -qi "ratelimit\|rate.limit\|rate limit"; then
-            if [ "$attempt" -eq "$max_attempts" ]; then
-                printf "railway %s: rate-limited after %d attempts\n" "$*" "$max_attempts" >&2
-                printf "%s\n" "$output" >&2
-                return 1
-            fi
-            printf "railway %s: rate-limited, retrying in %ds (attempt %d/%d)\n" \
-                "$1" "$delay" "$attempt" "$max_attempts" >&2
+        # `set +Ee` inside the subshell so a non-zero exit from `railway`
+        # neither trips errexit nor fires an inherited ERR trap. This wrapper
+        # handles retries and reports failures itself.
+        output="$(set +Ee; railway "$@" 2>&1)" && exit_code=0 || exit_code=$?
+
+        # Success: emit on stdout so callers can capture the output.
+        if [ "$exit_code" -eq 0 ]; then
+            printf "%s\n" "$output"
+            return 0
+        fi
+
+        # Classify the failure to decide whether a retry is safe + useful.
+        local kind="error"
+        if printf "%s" "$output" | grep -qiE "ratelimit|rate.?limit"; then
+            kind="rate-limit"
+        elif printf "%s" "$output" | grep -qiE "timed out|error sending request|failed to fetch|error trying to connect|connection (reset|refused|closed|error)|temporarily unavailable|service unavailable|bad gateway|gateway time-?out|broken pipe|unexpected eof|tls handshake"; then
+            kind="transient"
+        fi
+
+        local retryable=0
+        case "$kind" in
+        rate-limit) retryable=1 ;;
+        transient) [ "$idempotent" -eq 1 ] && retryable=1 ;;
+        esac
+
+        if [ "$retryable" -eq 1 ] && [ "$attempt" -lt "$max_attempts" ]; then
+            printf "railway %s: %s, retrying in %ds (attempt %d/%d)\n" \
+                "$1" "$kind" "$delay" "$attempt" "$max_attempts" >&2
             sleep "$delay"
             delay=$((delay * 2))
             attempt=$((attempt + 1))
             continue
         fi
 
-        # Not a rate-limit error. Print output and return the original exit code.
-        printf "%s\n" "$output"
+        # Give up: not retryable, attempts exhausted, or an ambiguous timeout
+        # on a non-idempotent create. Surface the (redacted) railway error.
+        if [ "$retryable" -eq 1 ]; then
+            printf "railway %s: %s — giving up after %d attempts\n" \
+                "$1" "$kind" "$attempt" >&2
+        elif [ "$kind" = "transient" ] && [ "$idempotent" -eq 0 ]; then
+            printf "railway %s: %s on non-idempotent command — not retrying (may have partially succeeded)\n" \
+                "$1" "$kind" >&2
+        fi
+        [ -n "$output" ] && printf "%s\n" "$output" | _railway_redact >&2
         return "$exit_code"
     done
 }
+
+# install_error_trap: Turn a bare "exit 1" into a diagnostic that names the
+# failing command and prints a short call stack. Call once near the top of a
+# script, after sourcing this file. Enables errtrace (set -E) so the trap also
+# fires for failures inside functions.
+#
+# railway_call disables errtrace inside its own command substitution, so
+# tolerated failures (callers using `|| true`) do not reach this trap.
+_railway_on_error() {
+    # The same failure can unwind through several stack frames; report once.
+    [ -n "${_RAILWAY_ERR_HANDLED:-}" ] && return 0
+    _RAILWAY_ERR_HANDLED=1
+
+    local exit_code="$1"
+    # Redact secrets: $BASH_COMMAND can contain secret-bearing args (e.g.
+    # `railway variable set ... AGENTA_AUTH_KEY=...`).
+    local cmd
+    cmd="$(printf '%s' "$2" | _railway_redact)"
+
+    printf '\n[railway][FAIL] command failed (exit %s): %s\n' "$exit_code" "$cmd" >&2
+
+    local i
+    for ((i = 1; i < ${#FUNCNAME[@]}; i++)); do
+        printf ' at %s (%s:%s)\n' \
+            "${FUNCNAME[i]}" "${BASH_SOURCE[i]}" "${BASH_LINENO[i - 1]}" >&2
+    done
+
+    # Surface a GitHub Actions annotation when running in CI.
+    if [ "${GITHUB_ACTIONS:-}" = "true" ]; then
+        printf '::error::[railway] command failed (exit %s): %s\n' "$exit_code" "$cmd" >&2
+    fi
+}
+
+install_error_trap() {
+    set -E
+    trap '_railway_on_error "$?" "$BASH_COMMAND"' ERR
+}
+
+# dump_railway_logs: Best-effort snapshot of Railway service logs, for CI
+# debugging. Uses --lines (non-streaming) and --latest (works even when a
+# deployment failed or is crash-looping), wrapped in a hard timeout so it can
+# never hang or fail the caller. Requires a linked project/environment.
+#
+# Usage: dump_railway_logs [service ...] (defaults to the core infra set)
+#
+# Environment variables:
+#   RAILWAY_LOG_TAIL     Lines to fetch per service (default: 50)
+#   RAILWAY_LOG_TIMEOUT  Per-service timeout in seconds (default: 30)
+dump_railway_logs() {
+    local services=("$@")
+    if [ "${#services[@]}" -eq 0 ]; then
+        services=(Postgres redis alembic api supertokens web)
+    fi
+
+    local lines="${RAILWAY_LOG_TAIL:-50}"
+    local timeout_s="${RAILWAY_LOG_TIMEOUT:-30}"
+    local svc
+    local logs
+
+    for svc in "${services[@]}"; do
+        printf '\n===== railway logs (last %s lines): %s =====\n' "$lines" "$svc" >&2
+        # Capture first so the exit status reflects railway/timeout (not the
+        # redactor), then print redacted (service logs may embed DB URIs).
+        if logs="$(timeout "$timeout_s" railway logs --service "$svc" --lines "$lines" --latest 2>&1)"; then
+            printf '%s\n' "$logs" | _railway_redact >&2
+        else
+            printf '(no logs available for service: %s)\n' "$svc" >&2
+        fi
+    done
+
+    return 0
+}
diff --git a/web/packages/agenta-playground/src/state/controllers/playgroundController.ts b/web/packages/agenta-playground/src/state/controllers/playgroundController.ts
index 7ae637c770..85f25427b3 100644
--- a/web/packages/agenta-playground/src/state/controllers/playgroundController.ts
+++ b/web/packages/agenta-playground/src/state/controllers/playgroundController.ts
@@ -82,6 +82,7 @@ import {
     newTestcaseCountAtom,
     newTestcaseDataHashAtom,
 } from "../execution/selectors"
+import {pruneDanglingConnections} from "../helpers/connectionGraph"
 import {extractAndLoadChatMessagesAtom} from "../helpers/extractAndLoadChatMessages"
 import {normalizeTestcaseRowsForLoad} from "../helpers/testcaseRowNormalization"
 import type {EntitySelection, PlaygroundNode, RunnableType} from "../types"
@@ -412,8 +413,17 @@ const changePrimaryNodeAtom = atom(null, (get, set, entity: EntitySelection) =>
         label: entity.label,
     }
 
-    set(playgroundNodesAtom, [updatedNode, ...nodes.slice(1)])
-    set(outputConnectionsAtom, [])
+    const nextNodes = [updatedNode, ...nodes.slice(1)]
+    set(playgroundNodesAtom, nextNodes)
+
+    // Preserve downstream connections instead of clearing them. The primary
+    // node is updated in place, so its `id` is unchanged and any chain sourced
+    // from it (e.g. app → evaluator) stays valid. Clearing unconditionally
+    // orphaned the downstream evaluator on app-revision re-selection:
+    // connectDownstreamNode then no-ops because the evaluator node is still
+    // present, so the edge was never recreated and the evaluator silently
+    // stopped running. Only drop connections whose endpoints no longer exist.
+    set(outputConnectionsAtom, pruneDanglingConnections(get(outputConnectionsAtom), nextNodes))
 
     // Update local testset name if not connected to a remote testset
     const currentTestset = get(connectedTestsetAtom)
diff --git a/web/packages/agenta-playground/src/state/helpers/connectionGraph.ts b/web/packages/agenta-playground/src/state/helpers/connectionGraph.ts
new file mode 100644
index 0000000000..ded7736f0a
--- /dev/null
+++ b/web/packages/agenta-playground/src/state/helpers/connectionGraph.ts
@@ -0,0 +1,25 @@
+/**
+ * Connection graph helpers
+ *
+ * Pure functions for reasoning about the playground DAG's output connections.
+ */
+
+import type {OutputConnection, PlaygroundNode} from "../types"
+
+/**
+ * Keep only connections whose source and target both reference existing nodes.
+ *
+ * Used when the primary node is swapped in place (its `id` is preserved), so
+ * downstream chains sourced from it stay valid and must not be wiped. Any
+ * connection pointing at a node that no longer exists is dropped.
+ */
+export function pruneDanglingConnections(
+    connections: OutputConnection[],
+    nodes: PlaygroundNode[],
+): OutputConnection[] {
+    const nodeIds = new Set(nodes.map((node) => node.id))
+    return connections.filter(
+        (connection) =>
+            nodeIds.has(connection.sourceNodeId) && nodeIds.has(connection.targetNodeId),
+    )
+}
diff --git a/web/packages/agenta-playground/tests/unit/connectionGraph.test.ts b/web/packages/agenta-playground/tests/unit/connectionGraph.test.ts
new file mode 100644
index 0000000000..b26b50d4b9
--- /dev/null
+++ b/web/packages/agenta-playground/tests/unit/connectionGraph.test.ts
@@ -0,0 +1,55 @@
+import {describe, expect, it} from "vitest"
+
+import {pruneDanglingConnections} from "../../src/state/helpers/connectionGraph"
+import type {OutputConnection, PlaygroundNode} from "../../src/state/types"
+
+function node(id: string, depth: number): PlaygroundNode {
+    return {id, entityType: "workflow", entityId: `entity-${id}`, label: id, depth}
+}
+
+function connection(id: string, sourceNodeId: string, targetNodeId: string): OutputConnection {
+    return {
+        id,
+        sourceNodeId,
+        targetNodeId,
+        sourceOutputKey: "output",
+        inputMappings: [],
+        parallel: true,
+    }
+}
+
+describe("pruneDanglingConnections", () => {
+    // Regression: changePrimaryNode swaps the primary node in place (same node
+    // id), so the app → evaluator edge must survive an app-revision change.
+    // Previously the controller cleared all connections here, orphaning the
+    // evaluator so it silently stopped running.
+    it("preserves the app → evaluator edge when the primary node is swapped in place", () => {
+        const nodes = [node("N0", 0), node("N1", 1)]
+        const connections = [connection("conn-1", "N0", "N1")]
+
+        expect(pruneDanglingConnections(connections, nodes)).toEqual(connections)
+    })
+
+    it("drops connections whose source node no longer exists", () => {
+        const nodes = [node("N0", 0), node("N1", 1)]
+        const connections = [
+            connection("conn-1", "N0", "N1"),
+            connection("conn-stale", "GONE", "N1"),
+        ]
+
+        expect(pruneDanglingConnections(connections, nodes)).toEqual([
+            connection("conn-1", "N0", "N1"),
+        ])
+    })
+
+    it("drops connections whose target node no longer exists", () => {
+        const nodes = [node("N0", 0)]
+        const connections = [connection("conn-1", "N0", "N1")]
+
+        expect(pruneDanglingConnections(connections, nodes)).toEqual([])
+    })
+
+    it("returns an empty array when there are no connections", () => {
+        expect(pruneDanglingConnections([], [node("N0", 0)])).toEqual([])
+    })
+})