ykstorm · ykstorm · Jun 22, 2026 · Jun 19, 2026 · Jun 20, 2026 · Jun 20, 2026
diff --git a/Makefile b/Makefile
@@ -1,69 +1,39 @@
-.PHONY: up down smoke lint rollout-status help
+.PHONY: up down smoke lint rollout-status demo-image help
 
 KIND_CLUSTER := stackup
-HELM_CHART := helm/buyerchat
+HELM_CHART := helm/demo
 NAMESPACE := app
+DEMO_IMAGE := stackup-demo:v1
+ROLLOUT := demo
 
 help:
 	@echo "stackup Makefile"
 	@echo ""
-	@echo "  make up             Full bring-up: create kind cluster + install all platform components + buyerchat"
+	@echo "  make up             Full bring-up: scripts/bootstrap.sh (ordered, each step waited)"
 	@echo "  make down           Tear down: delete kind cluster (clean)"
-	@echo "  make smoke          Run smoke tests (requires cluster up)"
+	@echo "  make demo-image     Build the demo workload image + side-load it into kind"
+	@echo "  make smoke          Run smoke tests (helm render + validate; no cluster needed)"
 	@echo "  make lint           Lint all YAML files + Helm charts"
-	@echo "  make rollout-status Watch the buyerchat Argo Rollout canary progress"
+	@echo "  make rollout-status Watch the demo Argo Rollout canary progress"
 	@echo ""
-	@echo "Prerequisites: docker, kind, helm >=3.15, kubectl, git"
-
+	@echo "Prerequisites: docker, kind, helm >=3.15, kubectl, git, bash"
+
+# `up` is a thin wrapper over scripts/bootstrap.sh. The script owns the
+# ordering + per-step `kubectl wait` gates (kind -> Calico -> namespace ->
+# sealed-secrets -> SealedSecrets -> ingress/cert-manager/prometheus ->
+# Argo Rollouts/ArgoCD -> demo workload -> app-of-apps). Keeping the
+# orchestration in one place (not split between this target and the
+# script) is why the target is a one-liner.
 up:
-	@echo "=== Creating kind cluster ==="
-	kind create cluster --name $(KIND_CLUSTER) --config kind/cluster.yaml
-
-	@echo "=== Installing Calico CNI ==="
-	kubectl apply -f kind/calico/
-
-	@echo "=== Waiting for CNI ==="
-	@kubectl wait --for=condition=Ready pods -n calico-system -l k8s-app=calico-node --timeout=120s || true
-
-	@echo "=== Installing platform Helm charts ==="
-	@for chart in infra/ingress-nginx infra/cert-manager infra/sealed-secrets infra/kube-prometheus-stack; do \
-		echo "  Installing $$chart..."; \
-		helm upgrade --install --create-namespace --namespace $$(basename $$chart) $$chart $$chart --timeout 120s --wait --debug 2>&1 | tail -3 || true; \
-	done
-
-	@echo "=== Installing Argo Rollouts + ArgoCD ==="
-	@# Wrapper charts (Chart.yaml dependency on the upstream chart) — pull
-	@# the pinned dependency, then install. argo-rollouts first so the
-	@# Rollout CRDs exist before buyerchat renders a Rollout; argocd last.
-	@for chart in infra/argo-rollouts infra/argocd; do \
-		echo "  Installing $$chart..."; \
-		helm dependency build $$chart >/dev/null 2>&1 || true; \
-		helm upgrade --install --create-namespace --namespace $$(basename $$chart) $$chart $$chart --timeout 300s --wait --debug 2>&1 | tail -3 || true; \
-	done
+	@bash scripts/bootstrap.sh
 
-	@echo "=== Installing buyerchat Helm chart ==="
-	helm upgrade --install buyerchat $(HELM_CHART) \
-		--namespace $(NAMESPACE) --create-namespace \
-		--values $(HELM_CHART)/values.dev.yaml \
-		--timeout 180s --wait
-
-	@echo "=== Registering the ArgoCD app-of-apps root ==="
-	@# From here on ArgoCD reconciles every component from git (automated
-	@# sync + prune + self-heal). The helm installs above bootstrap the
-	@# cluster on a clean machine; root-app.yaml is the GitOps takeover.
-	kubectl apply -f argocd/root-app.yaml
-
-	@echo ""
-	@echo "=== Cluster ready ==="
-	@kubectl get pods -A --no-headers | grep -v Running | grep -v Completed && echo "All pods running ✓" || true
-	@echo ""
-	@echo "Add to /etc/hosts:"
-	@echo "  127.0.0.1 buyerchat.local.stackup.dev"
-	@echo "  127.0.0.1 grafana.local.stackup.dev"
-	@echo "  127.0.0.1 argocd.local.stackup.dev"
-	@echo "  127.0.0.1 prometheus.local.stackup.dev"
-	@echo ""
-	@echo "Then: curl https://buyerchat.local.stackup.dev/api/healthcheck"
+# Build + side-load the demo image without a full bring-up. Handy when
+# iterating on the workload. DEMO_BUILD_ARGS (empty by default) is passed to
+# `docker build`, so a "bad" image for the rollback demo is one command:
+#   make demo-image DEMO_IMAGE=stackup-demo:v2 DEMO_BUILD_ARGS="--build-arg FAILURE_RATE=0.3"
+demo-image:
+	docker build $(DEMO_BUILD_ARGS) -t $(DEMO_IMAGE) apps/demo
+	kind load docker-image $(DEMO_IMAGE) --name $(KIND_CLUSTER)
 
 down:
 	@echo "=== Deleting kind cluster ==="
@@ -82,8 +52,12 @@ lint:
 
 	@echo ""
 	@echo "=== Helm lint ==="
-	@helm lint $(HELM_CHART) --quiet && echo "✓ helm lint passed" || echo "✗ helm lint failed"
-	@helm template buyerchat $(HELM_CHART) > /dev/null 2>&1 && echo "✓ helm template passed" || echo "✗ helm template failed"
+	@# Lint + render every chart and report each result; rc remembers any
+	@# failure so the target exits non-zero after all charts were checked.
+	@rc=0; for chart in helm/demo helm/buyerchat; do \
+		if helm lint $$chart --quiet; then echo "✓ helm lint $$chart passed"; else echo "✗ helm lint $$chart failed"; rc=1; fi; \
+		if helm template $$(basename $$chart) $$chart > /dev/null 2>&1; then echo "✓ helm template $$chart passed"; else echo "✗ helm template $$chart failed"; rc=1; fi; \
+	done; exit $$rc
 
 rollout-status:
-	kubectl argo rollouts get rollout buyerchat -n $(NAMESPACE) --watch
+	kubectl argo rollouts get rollout $(ROLLOUT) -n $(NAMESPACE) --watch
diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
 # Stackup
 
-**Kubernetes on your laptop. ArgoCD + Argo Rollouts + Prometheus + Grafana. `make up` in 10 minutes. Free.**
+**Kubernetes on your laptop. ArgoCD + Argo Rollouts + Prometheus + Grafana. `make up` in ~12–15 minutes. Free.**
 
 [![CI](https://github.com/ykstorm/stackup/actions/workflows/ci.yml/badge.svg)](https://github.com/ykstorm/stackup/actions/workflows/ci.yml)
 [![License](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](LICENSE)
@@ -13,23 +13,23 @@ Managed Kubernetes costs $200+/month minimum on cloud providers. Stackup runs th
 
 What "full production stack" means: a real ArgoCD app-of-apps with 6 child applications, Argo Rollouts canary progressive delivery, Prometheus + Grafana observability, cert-manager TLS, Sealed Secrets encrypted in git, Calico NetworkPolicy enforcement, and Pod Security Standards `restricted` on every workload namespace.
 
-The buyerchat workload deliberately runs degraded (no DB). That's intentional. The cluster is the demo — not the app.
+The bootstrapped canary subject is a small `demo` service (Express + prom-client). The cluster is the point — not the app.
 
 ---
 
 ## What's in the box
 
 | Layer | Component | What it does |
 |---|---|---|
-| **Cluster** | kind on Docker | 3-node K8s in containers |
+| **Cluster** | kind on Docker | single-node K8s in containers |
 | **CNI** | Calico | NetworkPolicy enforcement |
 | **GitOps** | ArgoCD (app-of-apps) | One root app manages 6 children; automated sync + prune + self-heal |
-| **Progressive delivery** | Argo Rollouts | Canary 25→50→75→100%, analysis gate at 25% with auto-rollback |
+| **Progressive delivery** | Argo Rollouts | Canary 25→50→75→100%, success-rate analysis gate at 25% with auto-rollback |
 | **Ingress** | ingress-nginx | TLS termination, hostPort 80/443 |
 | **TLS** | cert-manager | Self-signed ClusterIssuer (swap to ACME in one line for prod) |
 | **Secrets** | Sealed Secrets | Encrypted secrets in git, decrypted in-cluster |
-| **Metrics** | kube-prometheus-stack | Prometheus + Alertmanager + Grafana, RED dashboards pre-imported |
-| **Workload demo** | buyerchat Helm chart | Next.js app — demonstrates the cluster, not a production app |
+| **Metrics** | kube-prometheus-stack | Prometheus + Alertmanager + Grafana |
+| **Workload demo** | demo Helm chart (helm/demo) | Express service that exports `http_requests_total` — the canary subject |
 | **Hardening** | PSS `restricted` + NetworkPolicy `default-deny` | Zero-trust on workload namespaces |
 
 ### Roadmap (not installed yet)
@@ -41,53 +41,50 @@ The buyerchat workload deliberately runs degraded (no DB). That's intentional. T
 
 ---
 
-## 10-minute quickstart
+## Quickstart
+
+**Prerequisites:** Docker, `kind`, `kubectl`, `helm`. Give Docker **at least 6 GB of memory** (Docker Desktop → Settings → Resources). The full stack — Calico, kube-prometheus-stack, ArgoCD, and Argo Rollouts on one node — will start to crash-loop its controllers below ~4 GB.
 
 ```bash
 git clone https://github.com/ykstorm/stackup && cd stackup
 make up
 ```
 
-Add to `/etc/hosts` (Windows: `C:\Windows\System32\drivers\etc\hosts`):
+The ingress hosts use `localtest.me`, which resolves to `127.0.0.1` — no `/etc/hosts` editing. Open:
 
-```
-127.0.0.1 buyerchat.local.stackup.dev
-127.0.0.1 grafana.local.stackup.dev
-127.0.0.1 argocd.local.stackup.dev
-127.0.0.1 prometheus.local.stackup.dev
-```
+- **[https://grafana.localtest.me](https://grafana.localtest.me)** — RED metrics from Prometheus (logs/traces are roadmap)
+- **[https://argocd.localtest.me](https://argocd.localtest.me)** — GitOps tree of 6 child apps
 
-Then open:
+The `demo` workload has no ingress. Reach it by port-forward:
 
-- **[https://buyerchat.local.stackup.dev](https://buyerchat.local.stackup.dev)** — workload, returns 503 degraded (no DB — expected)
-- **[https://grafana.local.stackup.dev](https://grafana.local.stackup.dev)** — RED metrics from Prometheus (logs/traces are roadmap)
-- **[https://argocd.local.stackup.dev](https://argocd.local.stackup.dev)** — GitOps tree of 6 child apps
+```bash
+kubectl -n app port-forward svc/demo 3000:3000
+curl localhost:3000/metrics   # shows http_requests_total
+```
 
 ---
 
 ## What it actually shows you
 
-Push a commit that bumps `helm/buyerchat/values.yaml` image.tag. ArgoCD notices and syncs. Argo Rollouts applies the new Rollout revision. Watch it advance:
+Push a commit that bumps `helm/demo/values.yaml` image.tag. ArgoCD notices and syncs. Argo Rollouts applies the new Rollout revision. Watch it advance:
 
 ```bash
 make rollout-status
-# same as: kubectl argo rollouts get rollout buyerchat -n app --watch
+# same as: kubectl argo rollouts get rollout demo -n app --watch
 ```
 
-The canary shifts 25% of traffic to the new version, pauses, then runs an analysis step: an `AnalysisTemplate` queries Prometheus three times over 90 seconds. If the success condition holds, the rollout advances to 50%, then 75%, then 100%. If the analysis fails, Argo Rollouts aborts and rolls back to the previous revision. This is the canary pattern teams run in production, on your laptop, for free.
+The canary shifts 25% of traffic to the new version, pauses, then runs an analysis step. The `AnalysisTemplate` queries Prometheus for the 2xx HTTP success-rate ratio over a 2-minute window — `sum(rate(http_requests_total{code=~"2.."}[2m])) / sum(rate(http_requests_total[2m]))`. If the result holds at or above 0.95, the rollout advances to 50%, then 75%, then 100%. If it drops below, Argo Rollouts aborts and rolls back to the previous revision. This is the canary pattern teams run in production, on your laptop, for free.
 
-The current analysis query is a conservative liveness check (is the canary up and being scraped). Once the buyerchat image exports request counters on `/api/metrics`, swap it for a real success-rate ratio — the template carries a `TODO` marking the one line to change.
+The `demo` image exports `http_requests_total` directly (Express + prom-client), so the gate runs against real request data. Set `failureRate` on the chart to push a deliberately bad canary and watch the rollback fire.
 
 ---
 
 ## Architecture
 
 ```mermaid
 graph TD
-    Dev[Developer machine] -->|kind create cluster| Kind[kind cluster<br/>3 Docker nodes]
+    Dev[Developer machine] -->|kind create cluster| Kind[kind cluster<br/>single node]
     Kind --> CP[Control plane]
-    Kind --> W1[Worker 1]
-    Kind --> W2[Worker 2]
     CP --> Argo[ArgoCD]
     Argo --> Apps[6 child apps]
     Apps --> Rollout[Argo Rollouts CRD]
@@ -106,11 +103,11 @@ A static documentation site (overview, getting started, architecture, GitOps + c
 
 ```bash
 make help     # Show all targets
-make up             # Full bring-up: create cluster + install platform + buyerchat
+make up             # Full bring-up: create cluster + install platform + demo
 make down           # Tear down kind cluster (clean)
-make smoke          # Run smoke tests (requires cluster up)
+make smoke          # Run smoke tests (helm render + validate; no cluster needed)
 make lint           # Lint all YAML + Helm charts
-make rollout-status # Watch the buyerchat Argo Rollout canary progress
+make rollout-status # Watch the demo Argo Rollout canary progress
 ```
 
 ---
@@ -120,7 +117,7 @@ make rollout-status # Watch the buyerchat Argo Rollout canary progress
 - No real LoadBalancer service type (kind doesn't ship one). We use hostPort. For real LB, deploy to a cloud cluster.
 - Storage is local-path PVs by default. Re-creating the cluster wipes them. Add Longhorn or OpenEBS if you need persistence across teardowns.
 - Single-tenant workload namespace. Multi-tenant needs additional NetworkPolicy and RBAC work (PRs welcome).
-- The buyerchat workload runs degraded (no DB). That's intentional — the cluster is the demo, not the app.
+- The `demo` workload is a stand-in for your real service — it exists to drive the canary, not to be a product. (A legacy `buyerchat` chart still lives in `helm/buyerchat` as an example; it is not what `make up` deploys.)
 
 ## License
 

diff --git a/apps/demo/.dockerignore b/apps/demo/.dockerignore
@@ -0,0 +1,4 @@
+node_modules
+npm-debug.log
+test
+.dockerignore
diff --git a/apps/demo/Dockerfile b/apps/demo/Dockerfile
@@ -0,0 +1,35 @@
+# Demo workload image — the stackup canary subject.
+#
+# Build (from repo root):
+#   docker build -t stackup-demo:v1 apps/demo
+#
+# Load into the kind cluster (image is never pushed to a registry; kind
+# side-loads it so ImagePullPolicy=IfNotPresent resolves locally):
+#   kind load docker-image stackup-demo:v1 --name stackup
+#
+# A "bad" build for the rollback demo just bakes a higher failure rate:
+#   docker build -t stackup-demo:v2 --build-arg FAILURE_RATE=0.3 apps/demo
+#   kind load docker-image stackup-demo:v2 --name stackup
+FROM node:20-alpine
+
+WORKDIR /app
+
+# Manifests first: the production-deps layer is reused until package*.json changes.
+COPY package*.json ./
+RUN npm install --omit=dev
+
+COPY server.js ./
+
+# ARG is build-time only; ENV bakes its value (default 0) into the image. Override
+# per-build for the rollback demo, or at runtime via the FAILURE_RATE env var.
+ARG FAILURE_RATE=0
+ENV FAILURE_RATE=${FAILURE_RATE}
+ENV PORT=3000
+ENV SERVICE_NAME=demo
+
+# Run as a non-root UID so the pod satisfies restricted Pod Security
+# Standards (runAsNonRoot + runAsUser in the chart's securityContext).
+USER 1001
+
+EXPOSE 3000
+CMD ["node", "server.js"]