Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 36 additions & 1 deletion .github/workflows/41-railway-setup.yml
Original file line number Diff line number Diff line change
Expand Up @@ -73,10 +73,45 @@ jobs:
chmod +x hosting/railway/oss/scripts/*.sh
# shellcheck source=/dev/null
source hosting/railway/oss/scripts/preview-resolve-env.sh
hosting/railway/oss/scripts/bootstrap.sh

# Persist the full bootstrap output so the "Upload setup log" step can
# publish it as an artifact, regardless of live-log truncation.
log_file="${GITHUB_WORKSPACE:-$PWD}/railway-setup-${PR_NUMBER:-unknown}.log"

set +e
hosting/railway/oss/scripts/bootstrap.sh 2>&1 | tee "$log_file"
setup_status=${PIPESTATUS[0]}
set -e

echo "project_name=${RAILWAY_PROJECT_NAME}" >> "$GITHUB_OUTPUT"
echo "environment_name=${RAILWAY_ENVIRONMENT_NAME}" >> "$GITHUB_OUTPUT"

if [ "$setup_status" -ne 0 ]; then
{
echo "### Railway Preview Setup — Failed"
echo
echo "<details><summary>Setup log (last 100 lines)</summary>"
echo
echo '```'
tail -n 100 "$log_file" 2>/dev/null
echo '```'
echo "</details>"
} >> "$GITHUB_STEP_SUMMARY"
exit "$setup_status"
fi

- name: Upload setup log
if: always()
# Diagnostics only: a failed/duplicate upload must never fail the job.
continue-on-error: true
uses: actions/upload-artifact@v4
with:
name: railway-setup-log-${{ inputs.pr_number }}
path: railway-setup-*.log
if-no-files-found: ignore
overwrite: true
retention-days: 7

- name: Summary
run: |
{
Expand Down
58 changes: 51 additions & 7 deletions .github/workflows/43-railway-deploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -112,11 +112,10 @@ jobs:
# shellcheck source=/dev/null
source hosting/railway/oss/scripts/preview-resolve-env.sh

log_file="$(mktemp)"
cleanup() {
rm -f "$log_file"
}
trap cleanup EXIT
# Keep the log in the workspace so the "Upload deploy log" step can
# publish it as an artifact. GitHub's live log can truncate streamed
# output, so we always persist a full copy.
log_file="${GITHUB_WORKSPACE:-$PWD}/railway-deploy-${PR_NUMBER:-unknown}.log"

project="$RAILWAY_PROJECT_NAME"
environment_name="$RAILWAY_ENVIRONMENT_NAME"
Expand Down Expand Up @@ -177,13 +176,58 @@ jobs:
echo "environment_name=${environment_name}" >> "$GITHUB_OUTPUT"
echo "railway_logs_url=${railway_logs_url}" >> "$GITHUB_OUTPUT"

trap - EXIT
cleanup
# Best-effort diagnostics; never let these change the step outcome.
set +e
# On failure, pull the tail of the key services' Railway logs into
# this job so the root cause (e.g. a Postgres crash-loop) is visible
# here instead of only in the Railway dashboard.
if [ "$deploy_failed" = "true" ]; then
# Tee into the persisted log so the uploaded artifact and the
# step-summary tail include the Railway service logs too, not just
# the (possibly truncated) live Actions log.
dump_railway_logs 2>&1 | tee -a "$log_file"
fi

status_label="Deployed"
[ "$deploy_failed" = "true" ] && status_label="Failed"
{
echo "### Railway Preview Deploy"
echo
echo "| Item | Value |"
echo "| --- | --- |"
echo "| PR | \`${PR_NUMBER}\` |"
echo "| Image tag | \`${IMAGE_TAG}\` |"
echo "| Status | ${status_label} |"
[ -n "$url" ] && echo "| Preview URL | ${url} |"
[ -n "$railway_logs_url" ] && echo "| Railway logs | [Open logs](${railway_logs_url}) |"
if [ "$deploy_failed" = "true" ]; then
echo
echo "<details><summary>Deploy log (last 100 lines)</summary>"
echo
echo '```'
tail -n 100 "$log_file" 2>/dev/null
echo '```'
echo "</details>"
fi
} >> "$GITHUB_STEP_SUMMARY"
set -e

if [ "$deploy_failed" = "true" ]; then
exit 1
fi

- name: Upload deploy log
if: always()
# Diagnostics only: a failed/duplicate upload must never fail the job.
continue-on-error: true
uses: actions/upload-artifact@v4
with:
name: railway-deploy-log-${{ inputs.pr_number }}
path: railway-deploy-*.log
if-no-files-found: ignore
overwrite: true
retention-days: 7

- name: Post preview URL as PR comment
if: inputs.pr_number != '' && steps.deploy.outputs.preview_url != ''
uses: actions/github-script@v7
Expand Down
2 changes: 2 additions & 0 deletions hosting/railway/oss/scripts/bootstrap.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../../.." && pwd)"
# shellcheck source=lib.sh
source "$(dirname "${BASH_SOURCE[0]}")/lib.sh"

install_error_trap

PROJECT_NAME="${RAILWAY_PROJECT_NAME:-agenta-oss-railway}"
ENV_NAME="${RAILWAY_ENVIRONMENT_NAME:-staging}"
SOURCE_COMPOSE_FILE="${RAILWAY_SOURCE_COMPOSE_FILE:-$(railway_source_compose_file "$ROOT_DIR")}"
Expand Down
2 changes: 2 additions & 0 deletions hosting/railway/oss/scripts/configure.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ set -euo pipefail
# shellcheck source=lib.sh
source "$(dirname "${BASH_SOURCE[0]}")/lib.sh"

install_error_trap

PROJECT_NAME="${RAILWAY_PROJECT_NAME:-agenta-oss-railway}"
ENV_NAME="${RAILWAY_ENVIRONMENT_NAME:-staging}"
SKIP_UNSETS="${CONFIGURE_SKIP_UNSETS:-false}"
Expand Down
2 changes: 2 additions & 0 deletions hosting/railway/oss/scripts/deploy-from-images.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ TMP_DIR="$(mktemp -d)"
# shellcheck source=lib.sh
source "$(dirname "${BASH_SOURCE[0]}")/lib.sh"

install_error_trap

# cleanup: Remove the scratch directory referenced by TMP_DIR, including
# everything inside it.
cleanup() { rm -rf "$TMP_DIR"; }
Expand Down
178 changes: 157 additions & 21 deletions hosting/railway/oss/scripts/lib.sh
Original file line number Diff line number Diff line change
Expand Up @@ -91,45 +91,181 @@ require_compose_redis_image() {
printf "%s" "$image"
}

# _railway_redact: Mask secret values before they are logged. Reads stdin and
# writes redacted text to stdout.
#
# Two kinds of secrets are masked:
#   - the value of any KEY=VALUE token whose (uppercase) key contains
#     PASSWORD/TOKEN/SECRET/KEY. The value may be a bare word, or a single- or
#     double-quoted string; a quoted value is masked as a whole so that a
#     secret containing whitespace cannot leak its tail.
#   - the password segment of any scheme://user:password@host URL.
#
# This keeps diagnostic output safe to print and to upload as a CI artifact
# even when a failing command echoes its arguments.
#
# Applied only to failure/diagnostic output, never to the success path that
# callers parse (e.g. `variable list -k` results).
_railway_redact() {
    # Build the value pattern in a variable so the single-quote alternative
    # does not need awkward shell quoting inside the sed script.
    local sq="'"
    local value_re="(\"[^\"]*\"|${sq}[^${sq}]*${sq}|[^[:space:]]+)"

    sed -E \
        -e "s/([A-Z0-9_]*(PASSWORD|TOKEN|SECRET|KEY)[A-Z0-9_]*[[:space:]]*=[[:space:]]*)${value_re}/\\1***REDACTED***/g" \
        -e 's#(://[A-Za-z0-9._~-]+:)[^@[:space:]/]+@#\1***REDACTED***@#g'
}

# railway_call: Run a railway CLI command with smart, context-aware retries.
#
# Usage:
#   railway_call [railway args...]
#
# Railway's GraphQL API (backboard.railway.com) intermittently rate-limits
# (HTTP 429) and, far more often for us, times out write mutations such as
# `variable set` ("operation timed out" / "error sending request"). This
# wrapper retries the cases that are safe + useful to retry, and fails fast on
# the rest:
#
#   - rate-limit        -> always retried. A 429 is a clean rejection: the
#                          request was not processed, so a retry cannot
#                          duplicate work.
#   - transient network -> retried ONLY for idempotent commands. A timed-out
#     (timeout/5xx/reset)  mutation may have already succeeded server-side, so
#                          we do NOT blind-retry non-idempotent creates
#                          (`init`, `add`, `environment new`, `volume add`):
#                          that risks duplicate projects/services/volumes.
#                          `variable set` (our actual offender) is an
#                          idempotent upsert, so it is retried.
#   - anything else     -> not retried. Deterministic errors ("not found",
#                          "unauthorized", "No service linked") fail fast
#                          instead of burning the backoff budget.
#
# Backoff is exponential. On success the command's output is printed on stdout
# so callers can capture it. On failure the output is redacted (see
# _railway_redact), printed on stderr, and railway's exit code is returned.
#
# Environment variables:
#   RAILWAY_RETRY_MAX    Max attempts (default: 5; values below 1 or
#                        non-numeric values still run the command once / use
#                        the default, they never skip the command)
#   RAILWAY_RETRY_DELAY  Initial backoff delay in seconds (default: 10)
railway_call() {
    local max_attempts="${RAILWAY_RETRY_MAX:-5}"
    local delay="${RAILWAY_RETRY_DELAY:-10}"
    local attempt=1
    local output=""
    local exit_code=0
    local kind
    local retryable

    # Sanitize the knobs. A non-numeric or zero attempt budget would otherwise
    # make the loop condition false/erroneous, and the function would return
    # success without ever invoking railway.
    case "$max_attempts" in
        '' | *[!0-9]*) max_attempts=5 ;;
    esac
    if [ "$max_attempts" -lt 1 ]; then
        max_attempts=1
    fi
    case "$delay" in
        '' | *[!0-9]*) delay=10 ;;
    esac

    # Is this a non-idempotent create? If so, an ambiguous timeout must not be
    # blind-retried (the resource may already exist). Rate-limit retries are
    # still safe because a 429 is rejected before any work is done.
    local idempotent=1
    case "${1:-}" in
        init | add)
            idempotent=0
            ;;
        environment)
            if [ "${2:-}" = "new" ]; then idempotent=0; fi
            ;;
        volume)
            if [ "${2:-}" = "add" ]; then idempotent=0; fi
            ;;
    esac

    while [ "$attempt" -le "$max_attempts" ]; do
        # `set +Ee` inside the subshell so a non-zero exit from `railway`
        # neither trips errexit nor fires an inherited ERR trap. This wrapper
        # handles retries and reports failures itself.
        output="$(set +Ee; railway "$@" 2>&1)" && exit_code=0 || exit_code=$?

        # Success: emit on stdout so callers can capture the output.
        if [ "$exit_code" -eq 0 ]; then
            printf "%s\n" "$output"
            return 0
        fi

        # Classify the failure to decide whether a retry is safe + useful.
        # A here-string (not `printf | grep -q`) is used so that `pipefail`
        # plus an early grep exit can never turn a match into a non-match.
        kind="error"
        if grep -qiE "ratelimit|rate.?limit" <<<"$output"; then
            kind="rate-limit"
        elif grep -qiE "timed out|error sending request|failed to fetch|error trying to connect|connection (reset|refused|closed|error)|temporarily unavailable|service unavailable|bad gateway|gateway time-?out|broken pipe|unexpected eof|tls handshake" <<<"$output"; then
            kind="transient"
        fi

        retryable=0
        case "$kind" in
            rate-limit)
                retryable=1
                ;;
            transient)
                if [ "$idempotent" -eq 1 ]; then retryable=1; fi
                ;;
        esac

        if [ "$retryable" -eq 1 ] && [ "$attempt" -lt "$max_attempts" ]; then
            # Only the subcommand ($1) is logged here: the remaining args may
            # carry secrets and this message is not passed through redaction.
            printf "railway %s: %s, retrying in %ds (attempt %d/%d)\n" \
                "${1:-}" "$kind" "$delay" "$attempt" "$max_attempts" >&2
            sleep "$delay"
            delay=$((delay * 2))
            attempt=$((attempt + 1))
            continue
        fi

        # Give up: not retryable, attempts exhausted, or an ambiguous timeout
        # on a non-idempotent create. Surface the (redacted) railway error.
        if [ "$retryable" -eq 1 ]; then
            printf "railway %s: %s — giving up after %d attempts\n" \
                "${1:-}" "$kind" "$attempt" >&2
        elif [ "$kind" = "transient" ] && [ "$idempotent" -eq 0 ]; then
            printf "railway %s: %s on non-idempotent command — not retrying (may have partially succeeded)\n" \
                "${1:-}" "$kind" >&2
        fi
        if [ -n "$output" ]; then
            printf "%s\n" "$output" | _railway_redact >&2
        fi
        return "$exit_code"
    done
}

# install_error_trap / _railway_on_error: Turn a bare "exit 1" into a
# diagnostic that names the failing command and prints a short call stack.
# Call install_error_trap once near the top of a script, after sourcing this
# file. It enables errtrace (set -E) so the trap also fires for failures
# inside functions.
#
# railway_call disables errtrace inside its own command substitution, so
# tolerated failures (callers using `|| true`) do not reach this trap.
#
# _railway_on_error is the ERR handler itself.
#   $1  exit status of the failed command
#   $2  text of the failed command ($BASH_COMMAND)
# All output goes to stderr; the handler always returns 0.
_railway_on_error() {
    # The same failure can unwind through several stack frames; report once.
    if [ -n "${_RAILWAY_ERR_HANDLED:-}" ]; then
        return 0
    fi
    _RAILWAY_ERR_HANDLED=1

    # Defaults keep the handler itself from dying under `set -u` if it is
    # ever invoked without arguments.
    local exit_code="${1:-1}"
    # Redact secrets: $BASH_COMMAND can contain secret-bearing args (e.g.
    # `railway variable set ... AGENTA_AUTH_KEY=...`).
    local cmd
    cmd="$(printf '%s' "${2:-}" | _railway_redact)"

    printf '\n[railway][FAIL] command failed (exit %s): %s\n' "$exit_code" "$cmd" >&2

    # Frame 0 is this handler; start at 1 to list the callers.
    local i
    for ((i = 1; i < ${#FUNCNAME[@]}; i++)); do
        printf ' at %s (%s:%s)\n' \
            "${FUNCNAME[i]}" "${BASH_SOURCE[i]}" "${BASH_LINENO[i - 1]}" >&2
    done

    # Surface a GitHub Actions annotation when running in CI.
    if [ "${GITHUB_ACTIONS:-}" = "true" ]; then
        # Workflow-command data must be a single line with '%', CR and LF
        # percent-encoded; a raw multi-line command would truncate the
        # annotation and spill the remainder into the log.
        local annotation="$cmd"
        annotation="${annotation//'%'/%25}"
        annotation="${annotation//$'\r'/%0D}"
        annotation="${annotation//$'\n'/%0A}"
        printf '::error::[railway] command failed (exit %s): %s\n' "$exit_code" "$annotation" >&2
    fi

    return 0
}

# install_error_trap: Register _railway_on_error as the ERR handler for the
# calling script and switch on errtrace so functions, command substitutions
# and subshells inherit it.
install_error_trap() {
    set -o errtrace
    trap '_railway_on_error "$?" "$BASH_COMMAND"' ERR
}

# dump_railway_logs: Print a best-effort tail of Railway service logs to
# stderr, for CI debugging.
#
# Each service is queried with `railway logs --lines N --latest` (a bounded,
# non-streaming fetch that also works when the deployment failed or is
# crash-looping) under a hard `timeout`, so one slow or missing service can
# neither hang nor fail the caller. Log text is passed through
# _railway_redact before printing because service logs may embed DB URIs.
# Requires a linked project/environment. Always returns 0.
#
# Usage: dump_railway_logs [service ...]   (defaults to the core infra set)
#
# Environment variables:
#   RAILWAY_LOG_TAIL     Lines to fetch per service (default: 50)
#   RAILWAY_LOG_TIMEOUT  Per-service timeout in seconds (default: 30)
dump_railway_logs() {
    local -a targets
    if [ "$#" -gt 0 ]; then
        targets=("$@")
    else
        targets=(Postgres redis alembic api supertokens web)
    fi

    local tail_count="${RAILWAY_LOG_TAIL:-50}"
    local per_service_timeout="${RAILWAY_LOG_TIMEOUT:-30}"
    local name
    local captured

    for name in "${targets[@]}"; do
        printf '\n===== railway logs (last %s lines): %s =====\n' "$tail_count" "$name" >&2

        # Capture before redacting: the status tested here must be that of
        # railway/timeout, not of the redaction filter.
        if ! captured="$(timeout "$per_service_timeout" railway logs --service "$name" --lines "$tail_count" --latest 2>&1)"; then
            printf '(no logs available for service: %s)\n' "$name" >&2
            continue
        fi

        printf '%s\n' "$captured" | _railway_redact >&2
    done

    return 0
}
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ import {
newTestcaseCountAtom,
newTestcaseDataHashAtom,
} from "../execution/selectors"
import {pruneDanglingConnections} from "../helpers/connectionGraph"
import {extractAndLoadChatMessagesAtom} from "../helpers/extractAndLoadChatMessages"
import {normalizeTestcaseRowsForLoad} from "../helpers/testcaseRowNormalization"
import type {EntitySelection, PlaygroundNode, RunnableType} from "../types"
Expand Down Expand Up @@ -412,8 +413,17 @@ const changePrimaryNodeAtom = atom(null, (get, set, entity: EntitySelection) =>
label: entity.label,
}

set(playgroundNodesAtom, [updatedNode, ...nodes.slice(1)])
set(outputConnectionsAtom, [])
const nextNodes = [updatedNode, ...nodes.slice(1)]
set(playgroundNodesAtom, nextNodes)

// Preserve downstream connections instead of clearing them. The primary
// node is updated in place, so its `id` is unchanged and any chain sourced
// from it (e.g. app → evaluator) stays valid. Clearing unconditionally
// orphaned the downstream evaluator on app-revision re-selection:
// connectDownstreamNode then no-ops because the evaluator node is still
// present, so the edge was never recreated and the evaluator silently
// stopped running. Only drop connections whose endpoints no longer exist.
set(outputConnectionsAtom, pruneDanglingConnections(get(outputConnectionsAtom), nextNodes))

// Update local testset name if not connected to a remote testset
const currentTestset = get(connectedTestsetAtom)
Expand Down
Loading
Loading