Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 29 additions & 57 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,69 +1,39 @@
.PHONY: up down smoke lint rollout-status help
.PHONY: up down smoke lint rollout-status demo-image help

KIND_CLUSTER := stackup
HELM_CHART := helm/buyerchat
HELM_CHART := helm/demo
NAMESPACE := app
DEMO_IMAGE := stackup-demo:v1
ROLLOUT := demo

help:
@echo "stackup Makefile"
@echo ""
@echo " make up Full bring-up: create kind cluster + install all platform components + buyerchat"
@echo " make up Full bring-up: scripts/bootstrap.sh (ordered, each step waited)"
@echo " make down Tear down: delete kind cluster (clean)"
@echo " make smoke Run smoke tests (requires cluster up)"
@echo " make demo-image Build the demo workload image + side-load it into kind"
@echo " make smoke Run smoke tests (helm render + validate; no cluster needed)"
@echo " make lint Lint all YAML files + Helm charts"
@echo " make rollout-status Watch the buyerchat Argo Rollout canary progress"
@echo " make rollout-status Watch the demo Argo Rollout canary progress"
@echo ""
@echo "Prerequisites: docker, kind, helm >=3.15, kubectl, git"

@echo "Prerequisites: docker, kind, helm >=3.15, kubectl, git, bash"

# `up` is a thin wrapper over scripts/bootstrap.sh. The script owns the
# ordering + per-step `kubectl wait` gates (kind -> Calico -> namespace ->
# sealed-secrets -> SealedSecrets -> ingress/cert-manager/prometheus ->
# Argo Rollouts/ArgoCD -> demo workload -> app-of-apps). Keeping the
# orchestration in one place (not split between this target and the
# script) is why the target is a one-liner.
up:
@echo "=== Creating kind cluster ==="
kind create cluster --name $(KIND_CLUSTER) --config kind/cluster.yaml

@echo "=== Installing Calico CNI ==="
kubectl apply -f kind/calico/

@echo "=== Waiting for CNI ==="
@kubectl wait --for=condition=Ready pods -n calico-system -l k8s-app=calico-node --timeout=120s || true

@echo "=== Installing platform Helm charts ==="
@for chart in infra/ingress-nginx infra/cert-manager infra/sealed-secrets infra/kube-prometheus-stack; do \
echo " Installing $$chart..."; \
helm upgrade --install --create-namespace --namespace $$(basename $$chart) $$chart $$chart --timeout 120s --wait --debug 2>&1 | tail -3 || true; \
done

@echo "=== Installing Argo Rollouts + ArgoCD ==="
@# Wrapper charts (Chart.yaml dependency on the upstream chart) — pull
@# the pinned dependency, then install. argo-rollouts first so the
@# Rollout CRDs exist before buyerchat renders a Rollout; argocd last.
@for chart in infra/argo-rollouts infra/argocd; do \
echo " Installing $$chart..."; \
helm dependency build $$chart >/dev/null 2>&1 || true; \
helm upgrade --install --create-namespace --namespace $$(basename $$chart) $$chart $$chart --timeout 300s --wait --debug 2>&1 | tail -3 || true; \
done
@bash scripts/bootstrap.sh

@echo "=== Installing buyerchat Helm chart ==="
helm upgrade --install buyerchat $(HELM_CHART) \
--namespace $(NAMESPACE) --create-namespace \
--values $(HELM_CHART)/values.dev.yaml \
--timeout 180s --wait

@echo "=== Registering the ArgoCD app-of-apps root ==="
@# From here on ArgoCD reconciles every component from git (automated
@# sync + prune + self-heal). The helm installs above bootstrap the
@# cluster on a clean machine; root-app.yaml is the GitOps takeover.
kubectl apply -f argocd/root-app.yaml

@echo ""
@echo "=== Cluster ready ==="
@kubectl get pods -A --no-headers | grep -v Running | grep -v Completed && echo "All pods running ✓" || true
@echo ""
@echo "Add to /etc/hosts:"
@echo " 127.0.0.1 buyerchat.local.stackup.dev"
@echo " 127.0.0.1 grafana.local.stackup.dev"
@echo " 127.0.0.1 argocd.local.stackup.dev"
@echo " 127.0.0.1 prometheus.local.stackup.dev"
@echo ""
@echo "Then: curl https://buyerchat.local.stackup.dev/api/healthcheck"
# Build the demo workload image and side-load it into the kind cluster
# without a full bring-up. Handy when iterating on the workload, or to
# stage a "bad" image for the rollback demo:
#   make demo-image DEMO_IMAGE=stackup-demo:v2 DEMO_BUILD_ARGS="--build-arg FAILURE_RATE=0.3"
# (FAILURE_RATE is a build argument of apps/demo/Dockerfile.)
#
# DEMO_BUILD_ARGS carries extra `docker build` flags. It is empty by
# default, so a plain `make demo-image` runs the same commands as before.
DEMO_BUILD_ARGS ?=

demo-image:
	docker build -t $(DEMO_IMAGE) $(DEMO_BUILD_ARGS) apps/demo
	kind load docker-image $(DEMO_IMAGE) --name $(KIND_CLUSTER)

down:
@echo "=== Deleting kind cluster ==="
Expand All @@ -82,8 +52,10 @@ lint:

@echo ""
@echo "=== Helm lint ==="
@helm lint $(HELM_CHART) --quiet && echo "✓ helm lint passed" || echo "✗ helm lint failed"
@helm template buyerchat $(HELM_CHART) > /dev/null 2>&1 && echo "✓ helm template passed" || echo "✗ helm template failed"
# Lint and render both charts. Every result is still printed, but the
# recipe now exits non-zero when any check failed, so `make lint` can no
# longer succeed with a broken chart.
	@failed=0; \
	for chart in helm/demo helm/buyerchat; do \
		if helm lint $$chart --quiet; then \
			echo "✓ helm lint $$chart passed"; \
		else \
			echo "✗ helm lint $$chart failed"; failed=1; \
		fi; \
		if helm template $$(basename $$chart) $$chart > /dev/null 2>&1; then \
			echo "✓ helm template $$chart passed"; \
		else \
			echo "✗ helm template $$chart failed"; failed=1; \
		fi; \
	done; \
	exit $$failed

rollout-status:
kubectl argo rollouts get rollout buyerchat -n $(NAMESPACE) --watch
kubectl argo rollouts get rollout $(ROLLOUT) -n $(NAMESPACE) --watch
53 changes: 25 additions & 28 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Stackup

**Kubernetes on your laptop. ArgoCD + Argo Rollouts + Prometheus + Grafana. `make up` in 10 minutes. Free.**
**Kubernetes on your laptop. ArgoCD + Argo Rollouts + Prometheus + Grafana. `make up` in ~12–15 minutes. Free.**

[![CI](https://github.com/ykstorm/stackup/actions/workflows/ci.yml/badge.svg)](https://github.com/ykstorm/stackup/actions/workflows/ci.yml)
[![License](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](LICENSE)
Expand All @@ -13,23 +13,23 @@ Managed Kubernetes costs $200+/month minimum on cloud providers. Stackup runs th

What "full production stack" means: a real ArgoCD app-of-apps with 6 child applications, Argo Rollouts canary progressive delivery, Prometheus + Grafana observability, cert-manager TLS, Sealed Secrets encrypted in git, Calico NetworkPolicy enforcement, and Pod Security Standards `restricted` on every workload namespace.

The buyerchat workload deliberately runs degraded (no DB). That's intentional. The cluster is the demo — not the app.
The bootstrapped canary subject is a small `demo` service (Express + prom-client). The cluster is the point — not the app.

---

## What's in the box

| Layer | Component | What it does |
|---|---|---|
| **Cluster** | kind on Docker | 3-node K8s in containers |
| **Cluster** | kind on Docker | single-node K8s in containers |
| **CNI** | Calico | NetworkPolicy enforcement |
| **GitOps** | ArgoCD (app-of-apps) | One root app manages 6 children; automated sync + prune + self-heal |
| **Progressive delivery** | Argo Rollouts | Canary 25→50→75→100%, analysis gate at 25% with auto-rollback |
| **Progressive delivery** | Argo Rollouts | Canary 25→50→75→100%, success-rate analysis gate at 25% with auto-rollback |
| **Ingress** | ingress-nginx | TLS termination, hostPort 80/443 |
| **TLS** | cert-manager | Self-signed ClusterIssuer (swap to ACME in one line for prod) |
| **Secrets** | Sealed Secrets | Encrypted secrets in git, decrypted in-cluster |
| **Metrics** | kube-prometheus-stack | Prometheus + Alertmanager + Grafana, RED dashboards pre-imported |
| **Workload demo** | buyerchat Helm chart | Next.js app — demonstrates the cluster, not a production app |
| **Metrics** | kube-prometheus-stack | Prometheus + Alertmanager + Grafana |
| **Workload demo** | demo Helm chart (helm/demo) | Express service that exports `http_requests_total` — the canary subject |
| **Hardening** | PSS `restricted` + NetworkPolicy `default-deny` | Zero-trust on workload namespaces |

### Roadmap (not installed yet)
Expand All @@ -41,53 +41,50 @@ The buyerchat workload deliberately runs degraded (no DB). That's intentional. T

---

## 10-minute quickstart
## Quickstart

**Prerequisites:** Docker, `kind`, `kubectl`, `helm`. Give Docker **at least 6 GB of memory** (Docker Desktop → Settings → Resources). The full stack — Calico, kube-prometheus-stack, ArgoCD, and Argo Rollouts on one node — will start to crash-loop its controllers below ~4 GB.

```bash
git clone https://github.com/ykstorm/stackup && cd stackup
make up
```

Add to `/etc/hosts` (Windows: `C:\Windows\System32\drivers\etc\hosts`):
The ingress hosts use `localtest.me`, a public wildcard DNS name whose subdomains all resolve to `127.0.0.1`, so no `/etc/hosts` editing is needed. Open:

```
127.0.0.1 buyerchat.local.stackup.dev
127.0.0.1 grafana.local.stackup.dev
127.0.0.1 argocd.local.stackup.dev
127.0.0.1 prometheus.local.stackup.dev
```
- **[https://grafana.localtest.me](https://grafana.localtest.me)** — RED metrics from Prometheus (logs/traces are roadmap)
- **[https://argocd.localtest.me](https://argocd.localtest.me)** — GitOps tree of 6 child apps

Then open:
The `demo` workload has no ingress. Reach it by port-forward:

- **[https://buyerchat.local.stackup.dev](https://buyerchat.local.stackup.dev)** — workload, returns 503 degraded (no DB — expected)
- **[https://grafana.local.stackup.dev](https://grafana.local.stackup.dev)** — RED metrics from Prometheus (logs/traces are roadmap)
- **[https://argocd.local.stackup.dev](https://argocd.local.stackup.dev)** — GitOps tree of 6 child apps
```bash
kubectl -n app port-forward svc/demo 3000:3000
curl localhost:3000/metrics # shows http_requests_total
```

---

## What it actually shows you

Push a commit that bumps `helm/buyerchat/values.yaml` image.tag. ArgoCD notices and syncs. Argo Rollouts applies the new Rollout revision. Watch it advance:
Push a commit that bumps `helm/demo/values.yaml` image.tag. ArgoCD notices and syncs. Argo Rollouts applies the new Rollout revision. Watch it advance:

```bash
make rollout-status
# same as: kubectl argo rollouts get rollout buyerchat -n app --watch
# same as: kubectl argo rollouts get rollout demo -n app --watch
```

The canary shifts 25% of traffic to the new version, pauses, then runs an analysis step: an `AnalysisTemplate` queries Prometheus three times over 90 seconds. If the success condition holds, the rollout advances to 50%, then 75%, then 100%. If the analysis fails, Argo Rollouts aborts and rolls back to the previous revision. This is the canary pattern teams run in production, on your laptop, for free.
The canary shifts 25% of traffic to the new version, pauses, then runs an analysis step. The `AnalysisTemplate` queries Prometheus for the 2xx HTTP success-rate ratio over a 2-minute window — `sum(rate(http_requests_total{code=~"2.."}[2m])) / sum(rate(http_requests_total[2m]))`. If the result holds at or above 0.95, the rollout advances to 50%, then 75%, then 100%. If it drops below, Argo Rollouts aborts and rolls back to the previous revision. This is the canary pattern teams run in production, on your laptop, for free.

The current analysis query is a conservative liveness check (is the canary up and being scraped). Once the buyerchat image exports request counters on `/api/metrics`, swap it for a real success-rate ratio — the template carries a `TODO` marking the one line to change.
The `demo` image exports `http_requests_total` directly (Express + prom-client), so the gate runs against real request data. Set `failureRate` on the chart to push a deliberately bad canary and watch the rollback fire.

---

## Architecture

```mermaid
graph TD
Dev[Developer machine] -->|kind create cluster| Kind[kind cluster<br/>3 Docker nodes]
Dev[Developer machine] -->|kind create cluster| Kind[kind cluster<br/>single node]
Kind --> CP[Control plane]
Kind --> W1[Worker 1]
Kind --> W2[Worker 2]
CP --> Argo[ArgoCD]
Argo --> Apps[6 child apps]
Apps --> Rollout[Argo Rollouts CRD]
Expand All @@ -106,11 +103,11 @@ A static documentation site (overview, getting started, architecture, GitOps + c

```bash
make help # Show all targets
make up # Full bring-up: create cluster + install platform + buyerchat
make up # Full bring-up: create cluster + install platform + demo
make down # Tear down kind cluster (clean)
make smoke # Run smoke tests (helm render + validate; no cluster needed)
make lint # Lint all YAML + Helm charts
make rollout-status # Watch the buyerchat Argo Rollout canary progress
make rollout-status # Watch the demo Argo Rollout canary progress
```

---
Expand All @@ -120,7 +117,7 @@ make rollout-status # Watch the buyerchat Argo Rollout canary progress
- No real LoadBalancer service type (kind doesn't ship one). We use hostPort. For real LB, deploy to a cloud cluster.
- Storage is local-path PVs by default. Re-creating the cluster wipes them. Add Longhorn or OpenEBS if you need persistence across teardowns.
- Single-tenant workload namespace. Multi-tenant needs additional NetworkPolicy and RBAC work (PRs welcome).
- The buyerchat workload runs degraded (no DB). That's intentional — the cluster is the demo, not the app.
- The `demo` workload is a stand-in for your real service — it exists to drive the canary, not to be a product. (A legacy `buyerchat` chart still lives in `helm/buyerchat` as an example; it is not what `make up` deploys.)

## License

Expand Down
4 changes: 4 additions & 0 deletions apps/demo/.dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
node_modules
npm-debug.log
test
.dockerignore
35 changes: 35 additions & 0 deletions apps/demo/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# stackup demo workload: the service the canary rollout is exercised against.
#
# Build from the repository root:
#   docker build -t stackup-demo:v1 apps/demo
#
# The image is never pushed to a registry. Side-load it into the kind
# cluster so that ImagePullPolicy=IfNotPresent resolves it locally:
#   kind load docker-image stackup-demo:v1 --name stackup
#
# For the rollback demo, bake a higher failure rate into a second tag:
#   docker build -t stackup-demo:v2 --build-arg FAILURE_RATE=0.3 apps/demo
#   kind load docker-image stackup-demo:v2 --name stackup
FROM node:20-alpine

WORKDIR /app

# The package manifests are copied and installed before the source, so the
# dependency layer is reused when only server.js changes.
COPY package*.json ./
RUN npm install --omit=dev

COPY server.js ./

# FAILURE_RATE is a build argument (default 0) promoted to an environment
# variable: set it per build for the rollback demo, or override it when the
# container starts. PORT and SERVICE_NAME are fixed defaults.
ARG FAILURE_RATE=0
ENV FAILURE_RATE=${FAILURE_RATE} \
    PORT=3000 \
    SERVICE_NAME=demo

# Run as a numeric non-root UID so the pod satisfies the restricted Pod
# Security Standards (runAsNonRoot + runAsUser in the chart's
# securityContext).
USER 1001

EXPOSE 3000
CMD ["node", "server.js"]
Loading