diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index c9df806..df7118e 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -67,6 +67,52 @@ jobs:
       - name: Validate raw manifests
         run: find manifests/ -name '*.yaml' -print0 | xargs -0 kubeconform -strict -summary -ignore-missing-schemas
 
+  gitops:
+    name: GitOps (rollout + app-of-apps render)
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        # Name the shell explicitly: GitHub then starts bash with
+        # `-eo pipefail`, whereas the implicit default is plain `bash -e`.
+        # Without pipefail a failed `helm template` is masked by the exit
+        # status of kubeconform in the `helm template | kubeconform` steps.
+        shell: bash
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install kubeconform + helm
+        run: |
+          curl -fsSL -o /tmp/kubeconform.tar.gz https://github.com/yannh/kubeconform/releases/latest/download/kubeconform-linux-amd64.tar.gz
+          tar -xzf /tmp/kubeconform.tar.gz -C /tmp/
+          chmod +x /tmp/kubeconform && sudo mv /tmp/kubeconform /usr/local/bin/kubeconform
+          curl -fsSL -o /tmp/helm.tar.gz https://get.helm.sh/helm-v3.15.0-linux-amd64.tar.gz
+          tar -xzf /tmp/helm.tar.gz -C /tmp/
+          chmod +x /tmp/linux-amd64/helm && sudo mv /tmp/linux-amd64/helm /usr/local/bin/helm
+
+      - name: Lint + render infra wrapper charts (argo-rollouts, argocd)
+        # These pin the upstream chart via Chart.yaml dependencies; build
+        # the dependency before lint/template. -ignore-missing-schemas
+        # because the upstream charts ship CRDs (Rollout, Application).
+        run: |
+          # Both wrapper charts pin upstream charts from the argo-helm repo;
+          # register it so `helm dependency build` can resolve them.
+          helm repo add argo https://argoproj.github.io/argo-helm
+          helm repo update
+          for chart in infra/argo-rollouts infra/argocd; do
+            helm dependency build "$chart"
+            helm lint "$chart"
+            helm template "$chart" | kubeconform -strict -summary -ignore-missing-schemas
+          done
+
+      - name: Render rollout-enabled buyerchat + validate
+        # values.dev.yaml sets rollout.enabled=true, so this renders the
+        # Argo Rollout + AnalysisTemplate path (CRD-typed → needs
+        # -ignore-missing-schemas).
+        run: helm template helm/buyerchat -f helm/buyerchat/values.dev.yaml | kubeconform -strict -summary -ignore-missing-schemas
+
+      - name: Validate app-of-apps manifests
+        run: kubeconform -strict -summary -ignore-missing-schemas argocd/root-app.yaml argocd/apps/*.yaml
+
   build-docs:
     name: Build docs site
     runs-on: ubuntu-latest
diff --git a/.gitignore b/.gitignore
index 76d94bd..f6a4b76 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,4 +5,10 @@ dist/
 .DS_Store
 coverage/
 *.tmp
-kubeconform-cache/
\ No newline at end of file
+kubeconform-cache/
+
+# Vendored Helm chart dependencies (pulled by `helm dependency build`).
+# Chart.lock is committed to pin exact versions; the .tgz archives are
+# build artifacts and re-fetched in CI.
+infra/*/charts/
+helm/*/charts/
diff --git a/Makefile b/Makefile
index 2a37764..1e67d8c 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-.PHONY: up down smoke lint help
+.PHONY: up down smoke lint rollout-status help
 
 KIND_CLUSTER := stackup
 HELM_CHART := helm/buyerchat
@@ -7,10 +7,11 @@ NAMESPACE := app
 help:
 	@echo "stackup Makefile"
 	@echo ""
-	@echo " make up Full bring-up: create kind cluster + install all platform components + buyerchat"
-	@echo " make down Tear down: delete kind cluster (clean)"
-	@echo " make smoke Run smoke tests (requires cluster up)"
-	@echo " make lint Lint all YAML files + Helm charts"
+	@echo " make up Full bring-up: create kind cluster + install all platform components + buyerchat"
+	@echo " make down Tear down: delete kind cluster (clean)"
+	@echo " make smoke Run smoke tests (requires cluster up)"
+	@echo " make lint Lint all YAML files + Helm charts"
+	@echo " make rollout-status Watch the buyerchat Argo Rollout canary progress"
 	@echo ""
 	@echo "Prerequisites: docker, kind, helm >=3.15, kubectl, git"
 
@@ -30,12 +31,33 @@ up:
 		helm upgrade --install --create-namespace --namespace $$(basename $$chart) $$chart $$chart --timeout 120s --wait --debug 2>&1 | tail -3 || true; \
 	done
 
+	@echo "=== Installing Argo Rollouts + ArgoCD ==="
+	@# Wrapper charts (Chart.yaml dependency on the upstream chart) — pull
+	@# the pinned dependency, then install. argo-rollouts first so the
+	@# Rollout CRDs exist before buyerchat renders a Rollout; argocd last.
+	@# The argo-helm repo is registered first, as the CI job does before its
+	@# own `helm dependency build`. Release name and namespace are both the
+	@# directory basename: a path such as infra/argocd is not a valid name.
+	@helm repo add argo https://argoproj.github.io/argo-helm --force-update >/dev/null || echo " ! could not register the argo Helm repo" >&2
+	@for chart in infra/argo-rollouts infra/argocd; do \
+		name=$$(basename $$chart); \
+		echo " Installing $$chart..."; \
+		helm dependency build $$chart >/dev/null || echo " ! helm dependency build failed for $$chart" >&2; \
+		helm upgrade --install --create-namespace --namespace $$name $$name $$chart --timeout 300s --wait --debug 2>&1 | tail -3 || true; \
+	done
+
 	@echo "=== Installing buyerchat Helm chart ==="
 	helm upgrade --install buyerchat $(HELM_CHART) \
 		--namespace $(NAMESPACE) --create-namespace \
 		--values $(HELM_CHART)/values.dev.yaml \
 		--timeout 180s --wait
 
+	@echo "=== Registering the ArgoCD app-of-apps root ==="
+	@# From here on ArgoCD reconciles every component from git (automated
+	@# sync + prune + self-heal). The helm installs above bootstrap the
+	@# cluster on a clean machine; root-app.yaml is the GitOps takeover.
+	kubectl apply -f argocd/root-app.yaml
+
 	@echo ""
 	@echo "=== Cluster ready ==="
 	@kubectl get pods -A --no-headers | grep -v Running | grep -v Completed && echo "All pods running ✓" || true
@@ -66,4 +88,7 @@ lint:
 	@echo ""
 	@echo "=== Helm lint ==="
 	@helm lint $(HELM_CHART) --quiet && echo "✓ helm lint passed" || echo "✗ helm lint failed"
-	@helm template buyerchat $(HELM_CHART) > /dev/null 2>&1 && echo "✓ helm template passed" || echo "✗ helm template failed"
\ No newline at end of file
+	@helm template buyerchat $(HELM_CHART) > /dev/null 2>&1 && echo "✓ helm template passed" || echo "✗ helm template failed"
+
+rollout-status:
+	kubectl argo rollouts get rollout buyerchat -n $(NAMESPACE) --watch
diff --git a/README.md b/README.md
index 98ee2d4..716a495 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@ Managed Kubernetes costs $200+/month minimum on cloud providers. 
Stackup runs the full production stack on kind, on your laptop, for free. -What "full production stack" means: a real ArgoCD app-of-apps with 8 child applications, Argo Rollouts canary progressive delivery, Prometheus + Loki + Tempo observability, cert-manager TLS, Sealed Secrets encrypted in git, Calico NetworkPolicy enforcement, and Pod Security Standards `restricted` on every workload namespace. +What "full production stack" means: a real ArgoCD app-of-apps with 6 child applications, Argo Rollouts canary progressive delivery, Prometheus + Grafana observability, cert-manager TLS, Sealed Secrets encrypted in git, Calico NetworkPolicy enforcement, and Pod Security Standards `restricted` on every workload namespace. The buyerchat workload deliberately runs degraded (no DB). That's intentional. The cluster is the demo — not the app. @@ -23,8 +23,8 @@ The buyerchat workload deliberately runs degraded (no DB). That's intentional. T |---|---|---| | **Cluster** | kind on Docker | 3-node K8s in containers | | **CNI** | Calico | NetworkPolicy enforcement | -| **GitOps** | ArgoCD (app-of-apps) | One root app manages 8 children; automated sync + prune + self-heal | -| **Progressive delivery** | Argo Rollouts | Canary 25→50→75→100%, auto-rollback on error spike | +| **GitOps** | ArgoCD (app-of-apps) | One root app manages 6 children; automated sync + prune + self-heal | +| **Progressive delivery** | Argo Rollouts | Canary 25→50→75→100%, analysis gate at 25% with auto-rollback | | **Ingress** | ingress-nginx | TLS termination, hostPort 80/443 | | **TLS** | cert-manager | Self-signed ClusterIssuer (swap to ACME in one line for prod) | | **Secrets** | Sealed Secrets | Encrypted secrets in git, decrypted in-cluster | @@ -56,19 +56,22 @@ Then open: - **[https://buyerchat.local.stackup.dev](https://buyerchat.local.stackup.dev)** — workload, returns 503 degraded (no DB — expected) - **[https://grafana.local.stackup.dev](https://grafana.local.stackup.dev)** — RED metrics + Loki logs 
+ Tempo traces -- **[https://argocd.local.stackup.dev](https://argocd.local.stackup.dev)** — GitOps tree of 8 child apps +- **[https://argocd.local.stackup.dev](https://argocd.local.stackup.dev)** — GitOps tree of 6 child apps --- ## What it actually shows you -Push a commit that bumps `helm/buyerchat/values.yaml` image.tag. ArgoCD notices. Argo Rollouts applies the new Rollout resource. Watch: +Push a commit that bumps `helm/buyerchat/values.yaml` image.tag. ArgoCD notices and syncs. Argo Rollouts applies the new Rollout revision. Watch it advance: ```bash -kubectl argo rollouts get rollout buyerchat -n app --watch +make rollout-status +# same as: kubectl argo rollouts get rollout buyerchat -n app --watch ``` -The canary scales to 25% replicas. Prometheus watches error rate for 60 seconds. If clean, advances to 50%. Then 75%. Then 100%. If error rate spikes, automatic rollback. This is the pattern Lyft and Netflix run in production. Running on your laptop. Free. +The canary moves to a 25% weight (no traffic router is configured, so Argo Rollouts approximates the weight with pod counts), pauses, then runs an analysis step: an `AnalysisTemplate` queries Prometheus three times at 30-second intervals. If the success condition holds, the rollout advances to 50%, then 75%, then 100%. If the analysis fails, Argo Rollouts aborts and rolls back to the previous revision. This is the canary pattern teams run in production, on your laptop, for free. + +The current analysis query is a conservative liveness check (is the canary up and being scraped). Once the buyerchat image exports request counters on `/api/metrics`, swap it for a real success-rate ratio — the template carries a `TODO` marking the one line to change. 
--- @@ -81,7 +84,7 @@ graph TD Kind --> W1[Worker 1] Kind --> W2[Worker 2] CP --> Argo[ArgoCD] - Argo --> Apps[8 child apps] + Argo --> Apps[6 child apps] Apps --> Rollout[Argo Rollouts CRD] Rollout --> Pods[Canary pods] Pods --> Prom[Prometheus] @@ -100,10 +103,11 @@ For full topology + sequence diagrams, see [docs/architecture.md](docs/architect ```bash make help # Show all targets -make up # Full bring-up: create cluster + install platform + buyerchat -make down # Tear down kind cluster (clean) -make smoke # Run smoke tests (requires cluster up) -make lint # Lint all YAML + Helm charts +make up # Full bring-up: create cluster + install platform + buyerchat +make down # Tear down kind cluster (clean) +make smoke # Run smoke tests (requires cluster up) +make lint # Lint all YAML + Helm charts +make rollout-status # Watch the buyerchat Argo Rollout canary progress ``` --- diff --git a/argocd/apps/argo-rollouts.yaml b/argocd/apps/argo-rollouts.yaml new file mode 100644 index 0000000..bcf807a --- /dev/null +++ b/argocd/apps/argo-rollouts.yaml @@ -0,0 +1,27 @@ +# Child app: argo-rollouts. +# +# Points at this repo's self-contained wrapper chart at +# infra/argo-rollouts (Chart.yaml pins the upstream chart as a +# dependency). ArgoCD builds the dependency and renders it in place. +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: argo-rollouts + namespace: argocd + labels: + app.kubernetes.io/part-of: stackup +spec: + project: default + source: + repoURL: https://github.com/ykstorm/stackup + path: infra/argo-rollouts + targetRevision: main + destination: + server: https://kubernetes.default.svc + namespace: argo-rollouts + syncPolicy: + automated: + prune: true + selfHeal: true + syncOptions: + - CreateNamespace=true diff --git a/argocd/apps/buyerchat.yaml b/argocd/apps/buyerchat.yaml new file mode 100644 index 0000000..0a2ed2a --- /dev/null +++ b/argocd/apps/buyerchat.yaml @@ -0,0 +1,32 @@ +# Child app: buyerchat. 
+# +# Points at this repo's helm/buyerchat chart with the dev values file, +# which sets rollout.enabled=true — so ArgoCD renders the Argo Rollout + +# AnalysisTemplate (not the plain Deployment). Installed into the +# `app` namespace (matches the Makefile NAMESPACE and the documented +# `kubectl argo rollouts get rollout buyerchat -n app` command). +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: buyerchat + namespace: argocd + labels: + app.kubernetes.io/part-of: stackup +spec: + project: default + source: + repoURL: https://github.com/ykstorm/stackup + path: helm/buyerchat + targetRevision: main + helm: + valueFiles: + - values.dev.yaml + destination: + server: https://kubernetes.default.svc + namespace: app + syncPolicy: + automated: + prune: true + selfHeal: true + syncOptions: + - CreateNamespace=true diff --git a/argocd/apps/cert-manager.yaml b/argocd/apps/cert-manager.yaml new file mode 100644 index 0000000..9f87a51 --- /dev/null +++ b/argocd/apps/cert-manager.yaml @@ -0,0 +1,33 @@ +# Child app: cert-manager. +# +# Upstream chart (pinned) with installCRDs=true, matching the documented +# install in infra/cert-manager/README.md. The selfsigned ClusterIssuer +# (infra/cert-manager/clusterissuer-selfsigned.yaml) is applied out of +# band by bring-up — it is a cert-manager CRD object, not part of this +# chart's render. 
+apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: cert-manager + namespace: argocd + labels: + app.kubernetes.io/part-of: stackup +spec: + project: default + source: + repoURL: https://charts.jetstack.io + chart: cert-manager + targetRevision: v1.20.2 + helm: + parameters: + - name: installCRDs + value: "true" + destination: + server: https://kubernetes.default.svc + namespace: cert-manager + syncPolicy: + automated: + prune: true + selfHeal: true + syncOptions: + - CreateNamespace=true diff --git a/argocd/apps/ingress-nginx.yaml b/argocd/apps/ingress-nginx.yaml new file mode 100644 index 0000000..534d8bd --- /dev/null +++ b/argocd/apps/ingress-nginx.yaml @@ -0,0 +1,34 @@ +# Child app: ingress-nginx. +# +# Multi-source Application: the upstream chart (pinned) plus this repo's +# values-only overlay at infra/ingress-nginx/values.yaml, referenced via +# the $values source ref. This mirrors the documented install in +# infra/ingress-nginx/README.md. +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: ingress-nginx + namespace: argocd + labels: + app.kubernetes.io/part-of: stackup +spec: + project: default + sources: + - repoURL: https://kubernetes.github.io/ingress-nginx + chart: ingress-nginx + targetRevision: 4.15.1 + helm: + valueFiles: + - $values/infra/ingress-nginx/values.yaml + - repoURL: https://github.com/ykstorm/stackup + targetRevision: main + ref: values + destination: + server: https://kubernetes.default.svc + namespace: ingress-nginx + syncPolicy: + automated: + prune: true + selfHeal: true + syncOptions: + - CreateNamespace=true diff --git a/argocd/apps/kube-prometheus-stack.yaml b/argocd/apps/kube-prometheus-stack.yaml new file mode 100644 index 0000000..b62d9d2 --- /dev/null +++ b/argocd/apps/kube-prometheus-stack.yaml @@ -0,0 +1,37 @@ +# Child app: kube-prometheus-stack. +# +# Multi-source: the upstream chart (pinned) plus this repo's values-only +# overlay at infra/kube-prometheus-stack/values.yaml. 
The release name +# `kps` is load-bearing — ServiceMonitors elsewhere carry `release: kps` +# (see infra/kube-prometheus-stack/README.md). The AnalysisTemplate's +# Prometheus address also depends on this release name. +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: kube-prometheus-stack + namespace: argocd + labels: + app.kubernetes.io/part-of: stackup +spec: + project: default + sources: + - repoURL: https://prometheus-community.github.io/helm-charts + chart: kube-prometheus-stack + targetRevision: 84.5.0 + helm: + releaseName: kps + valueFiles: + - $values/infra/kube-prometheus-stack/values.yaml + - repoURL: https://github.com/ykstorm/stackup + targetRevision: main + ref: values + destination: + server: https://kubernetes.default.svc + namespace: monitoring + syncPolicy: + automated: + prune: true + selfHeal: true + syncOptions: + - CreateNamespace=true + - ServerSideApply=true diff --git a/argocd/apps/sealed-secrets.yaml b/argocd/apps/sealed-secrets.yaml new file mode 100644 index 0000000..953f8bc --- /dev/null +++ b/argocd/apps/sealed-secrets.yaml @@ -0,0 +1,27 @@ +# Child app: sealed-secrets. +# +# Upstream chart (pinned) installed into kube-system. The release name +# `sealed-secrets` and namespace `kube-system` are load-bearing — the +# repo's kubeseal invocations assume them (see infra/sealed-secrets/README.md). 
+apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: sealed-secrets + namespace: argocd + labels: + app.kubernetes.io/part-of: stackup +spec: + project: default + source: + repoURL: https://bitnami-labs.github.io/sealed-secrets + chart: sealed-secrets + targetRevision: 2.18.6 + helm: + releaseName: sealed-secrets + destination: + server: https://kubernetes.default.svc + namespace: kube-system + syncPolicy: + automated: + prune: true + selfHeal: true diff --git a/argocd/root-app.yaml b/argocd/root-app.yaml new file mode 100644 index 0000000..dbbc494 --- /dev/null +++ b/argocd/root-app.yaml @@ -0,0 +1,35 @@ +# app-of-apps root. +# +# A single Application that points at argocd/apps/ in this repo. Every +# file there is itself an Application (one per platform component), so +# ArgoCD reconciling `root` pulls in the whole tree. Apply it once after +# ArgoCD is up: +# +# kubectl apply -f argocd/root-app.yaml +# +# automated sync keeps the children in step with main; prune removes +# resources dropped from git; selfHeal reverts out-of-band cluster edits. +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: root + namespace: argocd + labels: + app.kubernetes.io/part-of: stackup +spec: + project: default + source: + repoURL: https://github.com/ykstorm/stackup + path: argocd/apps + targetRevision: main + directory: + recurse: false + destination: + server: https://kubernetes.default.svc + namespace: argocd + syncPolicy: + automated: + prune: true + selfHeal: true + syncOptions: + - CreateNamespace=true diff --git a/docs/gitops.md b/docs/gitops.md new file mode 100644 index 0000000..2c81c9f --- /dev/null +++ b/docs/gitops.md @@ -0,0 +1,91 @@ +# GitOps and progressive delivery + +This cluster is reconciled by ArgoCD from git, and the buyerchat workload +ships through an Argo Rollouts canary. This page describes both. 
+ +## App-of-apps + +ArgoCD runs in the `argocd` namespace (installed from `infra/argocd`, a +wrapper chart that pins the upstream `argo/argo-cd` chart). The entry +point is one root `Application`: + +```sh +kubectl apply -f argocd/root-app.yaml +``` + +`root` points at `argocd/apps/` in this repo. Every file there is itself +an `Application`, one per platform component, so reconciling `root` pulls +in the whole tree. The six children are: + +| Application | Source | Namespace | +|---|---|---| +| `ingress-nginx` | upstream chart + `infra/ingress-nginx/values.yaml` | `ingress-nginx` | +| `cert-manager` | upstream chart (`installCRDs=true`) | `cert-manager` | +| `sealed-secrets` | upstream chart | `kube-system` | +| `kube-prometheus-stack` | upstream chart + `infra/kube-prometheus-stack/values.yaml` | `monitoring` | +| `argo-rollouts` | `infra/argo-rollouts` wrapper chart | `argo-rollouts` | +| `buyerchat` | `helm/buyerchat` with `values.dev.yaml` | `app` | + +Each child and the root run `syncPolicy.automated` with `prune: true` and +`selfHeal: true`: git is the source of truth. A resource deleted from git +is pruned from the cluster; an out-of-band cluster edit is reverted on the +next sync. + +The values-only overlays (ingress-nginx, kube-prometheus-stack) use an +ArgoCD multi-source `Application`: one source is the pinned upstream chart, +the second is this repo, referenced as `$values` so the chart reads the +in-repo values file. The two wrapper charts (argo-rollouts, argocd) are +self-contained — their `Chart.yaml` pins the upstream chart as a +dependency, so a single in-repo path renders them. + +## Bootstrap vs. takeover + +`make up` (and `scripts/up.ps1`) install the platform charts directly with +`helm upgrade --install` to get a clean machine to a working state, then +apply `argocd/root-app.yaml`. From that point ArgoCD owns the components +and reconciles them from `main`. The direct helm installs are the +bootstrap; the root app is the handoff. 
+ +## Canary + +With `rollout.enabled: true` (set in `helm/buyerchat/values.dev.yaml`) the +buyerchat chart renders an Argo `Rollout` instead of a `Deployment`. The +pod template is identical — same security context, tcpSocket probes, +volumes, and NetworkPolicies carry over unchanged. Only the rollout +strategy differs: + +``` +setWeight 25 → pause 30s → analysis → setWeight 50 → pause 30s + → setWeight 75 → pause 30s → setWeight 100 +``` + +Watch a rollout advance: + +```sh +make rollout-status +# kubectl argo rollouts get rollout buyerchat -n app --watch +``` + +### The analysis gate + +The `analysis` step (after the 25% weight) runs the +`buyerchat-success-rate` `AnalysisTemplate`. It queries Prometheus (the +`kps` release, at `http://kps-kube-prometheus-stack-prometheus.monitoring:9090`) +three times at 30s intervals. If the success condition fails more than +once, the Rollout aborts and reverts to the previous revision. + +The query today is a conservative liveness check — the fraction of +buyerchat targets Prometheus reports as scrapeable. The buyerchat image +runs degraded (no DB) and its exported metric names are not confirmable +from this repo, so the template does not claim a request-success-rate gate +it cannot compute. A `TODO` in `helm/buyerchat/templates/analysis-template.yaml` +marks the one query to replace once the image exports request counters on +`/api/metrics`. + +## Maintenance note + +The buyerchat pod template lives in both `deployment.yaml` and +`rollout.yaml` (one renders, the other is suppressed by +`rollout.enabled`). A change to the pod spec must be made in both files. +Extracting the shared template into a `_helpers.tpl` named template would +remove the duplication — a worthwhile follow-up. 
diff --git a/helm/buyerchat/templates/analysis-template.yaml b/helm/buyerchat/templates/analysis-template.yaml new file mode 100644 index 0000000..e6d785c --- /dev/null +++ b/helm/buyerchat/templates/analysis-template.yaml @@ -0,0 +1,36 @@ +{{- if .Values.rollout.enabled }} +# AnalysisTemplate for the buyerchat canary. The Rollout's `analysis` step +# (after the 25% weight) runs this; if successCondition fails more than +# failureLimit times, the Rollout aborts and rolls back. +# +# Prometheus address: the kube-prometheus-stack release `kps` exposes its +# Prometheus at this service (see infra/kube-prometheus-stack/README.md). +# +# TODO: replace the `up`-based query with a real success-rate metric once +# buyerchat exports request counters on /api/metrics. The showcase app +# runs degraded (no DB) and its exported metric names are not confirmable +# from this repo, so this template uses a conservative liveness query — +# the fraction of buyerchat targets Prometheus reports as scrapeable. It +# evaluates a real signal (is the canary up and being scraped?) without +# claiming a request-success-rate gate that cannot actually be computed. +apiVersion: argoproj.io/v1alpha1 +kind: AnalysisTemplate +metadata: + name: {{ include "buyerchat.fullname" . }}-success-rate + labels: + {{- include "buyerchat.labels" . | nindent 4 }} +spec: + metrics: + - name: success-rate + interval: {{ .Values.rollout.analysis.interval }} + count: {{ .Values.rollout.analysis.count }} + successCondition: result[0] >= {{ .Values.rollout.analysis.successThreshold }} + failureLimit: {{ .Values.rollout.analysis.failureLimit }} + provider: + prometheus: + address: {{ .Values.rollout.analysis.prometheusAddress }} + query: >- + sum(up{job="{{ include "buyerchat.fullname" . }}"}) + / + count(up{job="{{ include "buyerchat.fullname" . 
}}"}) +{{- end }} diff --git a/helm/buyerchat/templates/deployment.yaml b/helm/buyerchat/templates/deployment.yaml index 46625d8..49c727d 100644 --- a/helm/buyerchat/templates/deployment.yaml +++ b/helm/buyerchat/templates/deployment.yaml @@ -1,3 +1,7 @@ +{{- if not .Values.rollout.enabled }} +# Plain Deployment — the default path. When .Values.rollout.enabled is +# true (values.dev.yaml), rollout.yaml renders an Argo Rollout instead and +# this file produces nothing, so exactly one workload object exists. apiVersion: apps/v1 kind: Deployment metadata: @@ -76,3 +80,4 @@ spec: emptyDir: {} - name: nextjs-cache emptyDir: {} +{{- end }} diff --git a/helm/buyerchat/templates/rollout.yaml b/helm/buyerchat/templates/rollout.yaml new file mode 100644 index 0000000..81e74b3 --- /dev/null +++ b/helm/buyerchat/templates/rollout.yaml @@ -0,0 +1,100 @@ +{{- if .Values.rollout.enabled }} +# Argo Rollout — the canary equivalent of deployment.yaml. Exactly one of +# Deployment/Rollout renders: this file is gated on .Values.rollout.enabled, +# deployment.yaml on `not .Values.rollout.enabled`. +# +# MAINTENANCE: the pod template below is a verbatim copy of the one in +# deployment.yaml (same containers, securityContext, volumeMounts, the +# tcpSocket probes + their rationale, volumes, automountServiceAccountToken). +# Any change to the buyerchat pod spec must be made in BOTH files until the +# template is extracted into a shared _helpers.tpl named template. +apiVersion: argoproj.io/v1alpha1 +kind: Rollout +metadata: + name: {{ include "buyerchat.fullname" . }} + labels: + {{- include "buyerchat.labels" . | nindent 4 }} +spec: + replicas: {{ .Values.replicaCount }} + revisionHistoryLimit: 3 + strategy: + canary: + steps: + - setWeight: 25 + - pause: { duration: 30s } + - analysis: + templates: + - templateName: {{ include "buyerchat.fullname" . 
}}-success-rate + - setWeight: 50 + - pause: { duration: 30s } + - setWeight: 75 + - pause: { duration: 30s } + - setWeight: 100 + selector: + matchLabels: + {{- include "buyerchat.selectorLabels" . | nindent 6 }} + template: + metadata: + labels: + {{- include "buyerchat.labels" . | nindent 8 }} + spec: + automountServiceAccountToken: false + securityContext: + {{- toYaml .Values.podSecurityContext | nindent 8 }} + containers: + - name: buyerchat + image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + ports: + - name: http + containerPort: 3000 + protocol: TCP + envFrom: + - secretRef: + name: buyerchat-env + securityContext: + {{- toYaml .Values.containerSecurityContext | nindent 12 }} + volumeMounts: + # Restricted PSS + readOnlyRootFilesystem requires every + # writable path to be an explicit volume. /tmp + the Next.js + # standalone build's runtime cache directory are the two + # paths the image is known to write to. + - name: tmp + mountPath: /tmp + - name: nextjs-cache + mountPath: /app/.next/cache + resources: + {{- toYaml .Values.resources | nindent 12 }} + # tcpSocket startupProbe — NOT httpGet. /api/healthcheck + # returns 503 in degraded mode, and kubelet HTTP probes treat + # any non-2xx/3xx as failure regardless of probe type + # (startup/readiness/liveness), so HTTP probing the degraded + # healthcheck would CrashLoopBackOff the pod. TCP probes only + # check port availability; Next.js binds :3000 within ~1s of + # boot, the probe passes, and the pod becomes Ready. See + # docs/diagnostics/p3-week1/progress.md (Day-2 entry) for the + # incident that produced this rule. 
+ startupProbe: + tcpSocket: + port: http + failureThreshold: {{ .Values.probes.startup.failureThreshold }} + periodSeconds: {{ .Values.probes.startup.periodSeconds }} + timeoutSeconds: {{ .Values.probes.startup.timeoutSeconds }} + livenessProbe: + tcpSocket: + port: http + periodSeconds: {{ .Values.probes.liveness.periodSeconds }} + timeoutSeconds: {{ .Values.probes.liveness.timeoutSeconds }} + failureThreshold: {{ .Values.probes.liveness.failureThreshold }} + readinessProbe: + tcpSocket: + port: http + periodSeconds: {{ .Values.probes.readiness.periodSeconds }} + timeoutSeconds: {{ .Values.probes.readiness.timeoutSeconds }} + failureThreshold: {{ .Values.probes.readiness.failureThreshold }} + volumes: + - name: tmp + emptyDir: {} + - name: nextjs-cache + emptyDir: {} +{{- end }} diff --git a/helm/buyerchat/values.dev.yaml b/helm/buyerchat/values.dev.yaml index ff241bf..6381a7f 100644 --- a/helm/buyerchat/values.dev.yaml +++ b/helm/buyerchat/values.dev.yaml @@ -11,6 +11,12 @@ replicaCount: 1 +# Render an Argo Rollout (canary) instead of a plain Deployment on the +# kind cluster, so `kubectl argo rollouts get rollout buyerchat -n app` +# and the documented canary flow work end to end. +rollout: + enabled: true + resources: requests: cpu: "50m" diff --git a/helm/buyerchat/values.yaml b/helm/buyerchat/values.yaml index 475989c..649dd41 100644 --- a/helm/buyerchat/values.yaml +++ b/helm/buyerchat/values.yaml @@ -10,9 +10,27 @@ image: replicaCount: 2 +# Progressive delivery. When rollout.enabled is false (the default), the +# chart renders a plain Deployment using the `strategy` block below. When +# true (set in values.dev.yaml for the kind cluster), the chart renders an +# Argo Rollout with a canary strategy + AnalysisTemplate instead, and the +# Deployment is suppressed — exactly one workload object exists either way. +rollout: + enabled: false + analysis: + # The kube-prometheus-stack `kps` release exposes Prometheus here. 
+ prometheusAddress: http://kps-kube-prometheus-stack-prometheus.monitoring:9090 + interval: 30s + count: 3 + failureLimit: 1 + # successCondition is `result[0] >= successThreshold`. With the + # current up-based query (see analysis-template.yaml TODO), 0.95 means + # at least 95% of buyerchat targets must be scrapeable. + successThreshold: "0.95" + # Deployment update strategy. RollingUpdate is fine here (no hostPort # contention — the workload runs behind a Service, not directly on -# host ports). +# host ports). Only used when rollout.enabled is false. strategy: type: RollingUpdate rollingUpdate: diff --git a/infra/argo-rollouts/Chart.lock b/infra/argo-rollouts/Chart.lock new file mode 100644 index 0000000..887569a --- /dev/null +++ b/infra/argo-rollouts/Chart.lock @@ -0,0 +1,6 @@ +dependencies: +- name: argo-rollouts + repository: https://argoproj.github.io/argo-helm + version: 2.41.0 +digest: sha256:bec9b087d4f7f56f397f2fde9f215c83057859311596ed467549a3670b3862ce +generated: "2026-06-15T05:37:44.858244+05:30" diff --git a/infra/argo-rollouts/Chart.yaml b/infra/argo-rollouts/Chart.yaml new file mode 100644 index 0000000..1b12890 --- /dev/null +++ b/infra/argo-rollouts/Chart.yaml @@ -0,0 +1,30 @@ +# Wrapper chart for Argo Rollouts. +# +# The other infra/* overlays (ingress-nginx, cert-manager, sealed-secrets, +# kube-prometheus-stack) are values-only and installed by name from the +# upstream repo (see each dir's README). This one carries a Chart.yaml +# dependencies block instead, for two reasons: +# 1. `helm lint infra/argo-rollouts` is a CI gate, and lint needs a +# Chart.yaml to run. +# 2. The ArgoCD app-of-apps points an Application at this directory; a +# self-contained chart renders without an out-of-band `helm repo add`. +# The upstream READMEs already anticipated this "Chart.yaml shim" path. +apiVersion: v2 +name: argo-rollouts +description: |- + Argo Rollouts controller for the showcase cluster. 
Installs the canary + controller that drives the buyerchat Rollout's progressive-delivery + steps and evaluates its AnalysisTemplate. +type: application +version: 0.1.0 +appVersion: "1.9.0" +home: https://github.com/ykstorm/stackup +sources: + - https://github.com/ykstorm/stackup +maintainers: + - name: ykstorm +dependencies: + # Pinned, not floated. Bump deliberately and re-run `helm dependency build`. + - name: argo-rollouts + version: 2.41.0 + repository: https://argoproj.github.io/argo-helm diff --git a/infra/argo-rollouts/README.md b/infra/argo-rollouts/README.md new file mode 100644 index 0000000..576afc0 --- /dev/null +++ b/infra/argo-rollouts/README.md @@ -0,0 +1,29 @@ +# argo-rollouts (P3 Day 6) + +Installs the Argo Rollouts controller (namespace `argo-rollouts`). The +controller watches `Rollout` objects and drives the buyerchat canary +through its weight steps, pausing on each `analysis` step to evaluate the +`AnalysisTemplate` against Prometheus before advancing. + +Unlike the older `infra/*` overlays, this directory is a self-contained +wrapper chart: `Chart.yaml` pins the upstream `argo/argo-rollouts` chart +(`2.41.0`) as a dependency, so the ArgoCD app-of-apps can point an +`Application` straight at it and `helm lint` runs in CI. + +## Install / upgrade + +```sh +helm dependency build infra/argo-rollouts +helm upgrade --install argo-rollouts infra/argo-rollouts \ + --namespace argo-rollouts --create-namespace \ + --wait --timeout 5m +``` + +## Verify + +```sh +kubectl get pods -n argo-rollouts +# expect: argo-rollouts controller 1/1 Running + +kubectl argo rollouts version +``` diff --git a/infra/argo-rollouts/values.yaml b/infra/argo-rollouts/values.yaml new file mode 100644 index 0000000..8963c1a --- /dev/null +++ b/infra/argo-rollouts/values.yaml @@ -0,0 +1,26 @@ +# Argo Rollouts values for the showcase cluster. +# +# Values for the upstream chart are nested under its dependency alias +# `argo-rollouts` (the dependency name in Chart.yaml). 
+# +# Kept minimal: the controller is all the buyerchat canary needs. The +# kubectl-plugin dashboard is a separate client-side tool (installed on +# the operator's laptop, not in-cluster) and is not enabled here. + +argo-rollouts: + # Controller only. The dashboard pod is off by default upstream; we + # keep it off — the `kubectl argo rollouts` plugin reads the Rollout + # CRs directly and needs no in-cluster dashboard. + dashboard: + enabled: false + + controller: + # Single replica is enough on a single-node kind cluster. + replicas: 1 + resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 200m + memory: 256Mi diff --git a/infra/argocd/Chart.lock b/infra/argocd/Chart.lock new file mode 100644 index 0000000..7b8bed2 --- /dev/null +++ b/infra/argocd/Chart.lock @@ -0,0 +1,6 @@ +dependencies: +- name: argo-cd + repository: https://argoproj.github.io/argo-helm + version: 9.5.21 +digest: sha256:b9601d0b4b40434d1f292315e08b941ab6785cf1491604b75aa801ba98e4afe3 +generated: "2026-06-15T05:38:16.8262934+05:30" diff --git a/infra/argocd/Chart.yaml b/infra/argocd/Chart.yaml new file mode 100644 index 0000000..d8b220c --- /dev/null +++ b/infra/argocd/Chart.yaml @@ -0,0 +1,24 @@ +# Wrapper chart for ArgoCD. +# +# Same shim pattern as infra/argo-rollouts: a Chart.yaml dependencies +# block pinning the upstream chart, so `helm lint infra/argocd` runs in +# CI and the app-of-apps can render this directory without a separate +# `helm repo add`. +apiVersion: v2 +name: argocd +description: |- + ArgoCD for the showcase cluster. Runs the GitOps control plane that + reconciles the app-of-apps tree under argocd/ against this repo. +type: application +version: 0.1.0 +appVersion: "3.4.3" +home: https://github.com/ykstorm/stackup +sources: + - https://github.com/ykstorm/stackup +maintainers: + - name: ykstorm +dependencies: + # Pinned, not floated. Bump deliberately and re-run `helm dependency build`. 
+ - name: argo-cd + version: 9.5.21 + repository: https://argoproj.github.io/argo-helm diff --git a/infra/argocd/README.md b/infra/argocd/README.md new file mode 100644 index 0000000..f1b8c53 --- /dev/null +++ b/infra/argocd/README.md @@ -0,0 +1,40 @@ +# argocd (P3 Day 6) + +Installs ArgoCD (namespace `argocd`) — the GitOps control plane. Once up, +`kubectl apply -f argocd/root-app.yaml` registers the app-of-apps root, +which reconciles one child `Application` per platform component (see +`argocd/apps/`) against this repo with automated sync, prune, and +self-heal. + +This directory is a self-contained wrapper chart: `Chart.yaml` pins the +upstream `argo/argo-cd` chart (`9.5.21`) as a dependency, so `helm lint` +runs in CI and the bring-up renders without an out-of-band `helm repo add`. + +## Install / upgrade + +```sh +helm dependency build infra/argocd +helm upgrade --install argocd infra/argocd \ + --namespace argocd --create-namespace \ + --wait --timeout 5m + +kubectl apply -f argocd/root-app.yaml +``` + +## Access + +The server is reachable at https://argocd.local.stackup.dev (TLS is +terminated at the ingress with a certificate issued by cert-manager's selfsigned ClusterIssuer; use `curl -k` to accept the +self-signed chain). The initial admin password: + +```sh +kubectl -n argocd get secret argocd-initial-admin-secret \ + -o jsonpath='{.data.password}' | base64 -d +``` + +## Why insecure server behind the ingress + +The ingress terminates TLS. Running the ArgoCD server in insecure (plain +HTTP) mode behind it (`configs.params."server.insecure": true`) avoids a +second TLS hop where the server would serve its own cert under the +ingress cert. This is the standard ingress-fronted ArgoCD posture. diff --git a/infra/argocd/values.yaml b/infra/argocd/values.yaml new file mode 100644 index 0000000..9145938 --- /dev/null +++ b/infra/argocd/values.yaml @@ -0,0 +1,49 @@ +# ArgoCD values for the showcase cluster.
+# +# Values for the upstream chart are nested under its dependency alias +# `argo-cd` (the dependency name in Chart.yaml). +# +# Kept minimal: server + repo-server + application-controller, with an +# ingress on argocd.local.stackup.dev. TLS is terminated at the ingress +# by cert-manager (the selfsigned ClusterIssuer), and the server runs in +# insecure mode behind it so there is no double-TLS hop. + +argo-cd: + configs: + params: + # The ingress terminates TLS via cert-manager. Running the server + # insecure (plain HTTP) behind it avoids the ArgoCD server serving its + # own self-signed cert on top of the ingress cert (a redirect loop + # / handshake mismatch). Standard pattern for ingress-fronted ArgoCD. + server.insecure: true + + server: + # Single replica on a single-node kind cluster. + replicas: 1 + resources: + requests: + cpu: 50m + memory: 128Mi + limits: + cpu: 250m + memory: 256Mi + + ingress: + enabled: true + ingressClassName: nginx + # The Makefile already prints this host in its /etc/hosts hint. + hostname: argocd.local.stackup.dev + annotations: + cert-manager.io/cluster-issuer: selfsigned + tls: true + # The chart names the TLS secret `argocd-server-tls` when tls: true and + # no explicit secret is given; cert-manager fills it via the + # selfsigned issuer above. + + # Trim the optional sub-components — not needed for the app-of-apps demo. + dex: + enabled: false + notifications: + enabled: false + applicationSet: + enabled: false diff --git a/scripts/up.ps1 b/scripts/up.ps1 index 1f0958b..a3120be 100644 --- a/scripts/up.ps1 +++ b/scripts/up.ps1 @@ -174,19 +174,22 @@ Write-OK "Image side-loaded; ImagePullPolicy=IfNotPresent will not hit the regis # helm/buyerchat/. Only 00-namespace.yaml remains here, because the # chart doesn't manage the Namespace (PSS labels are an # infrastructure-level concern, not a workload concern, and the chart # may eventually be installed into multiple-tenant namespaces).
The -# helm install steps for the buyerchat workload are operator-driven -# until Day 6 wires ArgoCD to take over. +# may eventually be installed into multiple-tenant namespaces). +# +# GitOps takeover (Day 6): ArgoCD and Argo Rollouts are now installed +# from infra/argocd and infra/argo-rollouts, and argocd/root-app.yaml +# registers the app-of-apps that reconciles every component from git. +# The bootstrap helm installs below get the cluster to a state where +# ArgoCD can take over; from then on git is the source of truth. Write-Step "Namespace - kubectl apply -f $ManifestsDir" & kubectl apply -f $ManifestsDir if ($LASTEXITCODE -ne 0) { throw "kubectl apply on buyerchat manifests failed" } Write-OK 'buyerchat namespace applied (with restricted-PSS labels)' # --------------------------------------------------------------------- -# Day-3 follow-up instructions (helm installs are operator-driven -# until Day 6 GitOps takeover) +# Follow-up instructions: bootstrap the platform, then hand off to GitOps # --------------------------------------------------------------------- -Write-Step 'Day-3 follow-ups (run after this script completes)' +Write-Step 'Follow-ups (run after this script completes)' Write-Host '' Write-Host ' Foundation infra (~3 min total):' -ForegroundColor White Write-Host '' @@ -204,15 +207,33 @@ Write-Host ' kubectl apply -f infra/cert-manager/clusterissuer-selfsigned.yam Write-Host ' helm upgrade --install sealed-secrets sealed-secrets/sealed-secrets \' Write-Host ' -n kube-system --wait' Write-Host '' +Write-Host ' Argo Rollouts + ArgoCD (the GitOps control plane):' -ForegroundColor White +Write-Host '' +Write-Host ' # Wrapper charts under infra/ pin the upstream chart as a' +Write-Host ' # dependency, so build it first, then install.' 
+Write-Host ' helm dependency build infra/argo-rollouts' +Write-Host ' helm upgrade --install argo-rollouts infra/argo-rollouts \' +Write-Host ' -n argo-rollouts --create-namespace --wait' +Write-Host ' helm dependency build infra/argocd' +Write-Host ' helm upgrade --install argocd infra/argocd \' +Write-Host ' -n argocd --create-namespace --wait' +Write-Host '' Write-Host ' Re-seal buyerchat-env (controller key is per-cluster — see' -ForegroundColor White Write-Host ' infra/sealed-secrets/README.md):' -ForegroundColor White Write-Host '' Write-Host ' # Recipe in helm/buyerchat/templates/sealed-secret.yaml leading comment.' Write-Host '' -Write-Host ' Workload:' -ForegroundColor White +Write-Host ' Workload + GitOps takeover:' -ForegroundColor White Write-Host '' Write-Host ' helm upgrade --install buyerchat ./helm/buyerchat \' -Write-Host ' -f helm/buyerchat/values.dev.yaml -n buyerchat --wait' +Write-Host ' -f helm/buyerchat/values.dev.yaml -n app --wait' +Write-Host ' # Register the app-of-apps root; ArgoCD reconciles every' +Write-Host ' # component from git from here on.' +Write-Host ' kubectl apply -f argocd/root-app.yaml' +Write-Host '' +Write-Host ' Watch the canary:' -ForegroundColor White +Write-Host '' +Write-Host ' kubectl argo rollouts get rollout buyerchat -n app --watch' Write-Host '' Write-Host ' Smoke test:' -ForegroundColor White Write-Host ''