diff --git a/helm/docs/monitoring-infrastructure.md b/helm/docs/monitoring-infrastructure.md index d346d131..df683b9f 100644 --- a/helm/docs/monitoring-infrastructure.md +++ b/helm/docs/monitoring-infrastructure.md @@ -8,14 +8,20 @@ how they were validated. The chart provides two categories of monitoring integration: -1. **Prometheus `prometheus.io/*` annotations** on all Services (always enabled). - These allow standard Prometheus installations using `kubernetes_sd_configs` - to auto-discover and scrape CloudZero Agent metrics without any CRDs. +1. **Prometheus `prometheus.io/*` annotations** on all Services (enabled by + default, controlled by `components.monitoring.scrapeAnnotations`). These + allow standard Prometheus installations using `kubernetes_sd_configs` to + auto-discover and scrape CloudZero Agent metrics without any CRDs. 2. **Prometheus Operator CRDs** (opt-in via `components.monitoring.enabled`). When enabled, the chart creates `ServiceMonitor` and `PrometheusRule` resources that the Prometheus Operator automatically picks up. +When both are active simultaneously, Prometheus deployments that honor both +annotation-based discovery and ServiceMonitors may scrape each target twice. +Set `components.monitoring.scrapeAnnotations: false` to disable the annotations +when using ServiceMonitors. + These resources are designed to be useful regardless of the customer's monitoring stack. 
The `ServiceMonitor` and `PrometheusRule` CRDs are the standard interoperability format understood by the Prometheus Operator, but @@ -32,6 +38,10 @@ components: # false = never install CRDs (default while feature is being validated) enabled: false + # true (default) = keep prometheus.io/* annotations on Services + # false = remove redundant annotations from Services + scrapeAnnotations: true + # Override namespace for CRDs (default: same as agent namespace) namespace: "" @@ -288,11 +298,15 @@ Validated using multiple test scenarios on the `bach` cluster: Tested via `helm template` with all three modes: -| `components.monitoring.enabled` | ServiceMonitors | PrometheusRules | `prometheus.io/*` annotations | -| ------------------------------- | --------------- | --------------- | ----------------------------- | -| `null` (no CRDs in cluster) | 0 | 0 | 3 (always) | -| `true` | 4 | 1 | 3 (always) | -| `false` | 0 | 0 | 3 (always) | +| `components.monitoring.enabled` | ServiceMonitors | PrometheusRules | `prometheus.io/*` annotations | +| ------------------------------- | --------------- | --------------- | ----------------------------------------- | +| `null` (no CRDs in cluster) | 0 | 0 | 3 | +| `true` | 4 | 1 | 3 | +| `false` | 0 | 0 | 3 | + + Annotation count assumes `components.monitoring.scrapeAnnotations: true` +(default). Set to `false` to omit annotations, e.g. when `enabled` is `true` or +`"auto"` to avoid duplicate scraping. 
### Test Suite diff --git a/helm/templates/agent-service.yaml b/helm/templates/agent-service.yaml index eff24e69..ed3037ad 100644 --- a/helm/templates/agent-service.yaml +++ b/helm/templates/agent-service.yaml @@ -11,11 +11,15 @@ metadata: .Values.commonMetaLabels ) ) | nindent 2 }} + {{- $promAnnotations := dict -}} + {{- if not (eq .Values.components.monitoring.scrapeAnnotations false) -}} + {{- $promAnnotations = dict "prometheus.io/scrape" "true" "prometheus.io/port" "9090" "prometheus.io/path" "/metrics" -}} + {{- end -}} {{- include "cloudzero-agent.generateAnnotations" (dict "root" . "annotations" (list .Values.defaults.annotations - (dict "prometheus.io/scrape" "true" "prometheus.io/port" "9090" "prometheus.io/path" "/metrics") + $promAnnotations ) ) | nindent 2 }} spec: diff --git a/helm/templates/aggregator-service.yaml b/helm/templates/aggregator-service.yaml index be32b42b..8fac4514 100644 --- a/helm/templates/aggregator-service.yaml +++ b/helm/templates/aggregator-service.yaml @@ -12,12 +12,16 @@ metadata: .Values.components.aggregator.labels ) ) | nindent 2 }} + {{- $promAnnotations := dict -}} + {{- if not (eq .Values.components.monitoring.scrapeAnnotations false) -}} + {{- $promAnnotations = dict "prometheus.io/scrape" "true" "prometheus.io/port" (.Values.aggregator.collector.port | quote) "prometheus.io/path" "/metrics" -}} + {{- end -}} {{- include "cloudzero-agent.generateAnnotations" (dict "root" . 
"annotations" (list .Values.defaults.annotations .Values.components.aggregator.annotations - (dict "prometheus.io/scrape" "true" "prometheus.io/port" (.Values.aggregator.collector.port | quote) "prometheus.io/path" "/metrics") + $promAnnotations ) ) | nindent 2 }} spec: diff --git a/helm/templates/webhook-service.yaml b/helm/templates/webhook-service.yaml index b3703de4..c208a36d 100644 --- a/helm/templates/webhook-service.yaml +++ b/helm/templates/webhook-service.yaml @@ -11,13 +11,17 @@ metadata: .Values.components.webhookServer.labels ) ) | nindent 2 }} + {{- $promAnnotations := dict -}} + {{- if not (eq .Values.components.monitoring.scrapeAnnotations false) -}} + {{- $promAnnotations = dict "prometheus.io/scrape" "true" "prometheus.io/port" "8443" "prometheus.io/path" "/metrics" "prometheus.io/scheme" "https" -}} + {{- end -}} {{- include "cloudzero-agent.generateAnnotations" (dict "root" . "annotations" (list .Values.defaults.annotations .Values.components.webhookServer.annotations (dict "nginx.ingress.kubernetes.io/ssl-redirect" "false") - (dict "prometheus.io/scrape" "true" "prometheus.io/port" "8443" "prometheus.io/path" "/metrics" "prometheus.io/scheme" "https") + $promAnnotations ) ) | nindent 2 }} namespace: {{ .Release.Namespace }} diff --git a/helm/tests/defaults_service_test.yaml b/helm/tests/defaults_service_test.yaml index 9aa083d5..7e7f4a59 100644 --- a/helm/tests/defaults_service_test.yaml +++ b/helm/tests/defaults_service_test.yaml @@ -2,15 +2,18 @@ # # This test validates that Service resources properly inherit # defaults.labels and defaults.annotations from the chart's defaults section. +# Also tests monitoring.scrapeAnnotations controls prometheus.io/* annotations. # # Services only support metadata-level defaults (labels and annotations). # PodSpec defaults (affinity, tolerations, etc.) do not apply to Services. 
# # Templates tested: +# - agent-service.yaml # - aggregator-service.yaml # - webhook-service.yaml suite: defaults.* properties apply to Service resources templates: + - agent-service.yaml - aggregator-service.yaml - webhook-service.yaml tests: @@ -91,3 +94,54 @@ tests: - equal: path: metadata.annotations.test-defaults-annotation value: sentinel-value-annotation + + # ============================================================================ + # monitoring.scrapeAnnotations tests + # ============================================================================ + - it: should include prometheus.io annotations on agent-service by default + template: agent-service.yaml + asserts: + - equal: + path: metadata.annotations["prometheus.io/scrape"] + value: "true" + + - it: should omit prometheus.io annotations on agent-service when scrapeAnnotations is false + template: agent-service.yaml + set: + components.monitoring.scrapeAnnotations: false + asserts: + - isNull: + path: metadata.annotations["prometheus.io/scrape"] + + - it: should include prometheus.io annotations on aggregator-service by default + template: aggregator-service.yaml + asserts: + - equal: + path: metadata.annotations["prometheus.io/scrape"] + value: "true" + + - it: should omit prometheus.io annotations on aggregator-service when scrapeAnnotations is false + template: aggregator-service.yaml + set: + components.monitoring.scrapeAnnotations: false + asserts: + - isNull: + path: metadata.annotations["prometheus.io/scrape"] + + - it: should include prometheus.io annotations on webhook-service by default + template: webhook-service.yaml + set: + insightsController.enabled: true + asserts: + - equal: + path: metadata.annotations["prometheus.io/scrape"] + value: "true" + + - it: should omit prometheus.io annotations on webhook-service when scrapeAnnotations is false + template: webhook-service.yaml + set: + insightsController.enabled: true + components.monitoring.scrapeAnnotations: false + asserts: + - 
isNull: + path: metadata.annotations["prometheus.io/scrape"] diff --git a/helm/values.schema.json b/helm/values.schema.json index 0f8afc47..b23a586e 100644 --- a/helm/values.schema.json +++ b/helm/values.schema.json @@ -6309,6 +6309,10 @@ } ] }, + "scrapeAnnotations": { + "default": true, + "type": "boolean" + }, "sharedSecret": { "default": false, "type": "boolean" diff --git a/helm/values.yaml b/helm/values.yaml index 47e1ad8b..3d75bf4c 100644 --- a/helm/values.yaml +++ b/helm/values.yaml @@ -859,11 +859,27 @@ components: # # To opt in now, set to "auto" or true. # - # Regardless of this setting, prometheus.io/* annotations are always added to - # Services for customers using standard Prometheus service discovery. + # By default, prometheus.io/* annotations are added to Services for customers + # using standard Prometheus service discovery. Set monitoring.scrapeAnnotations: + # false to disable them when using Prometheus Operator ServiceMonitors to avoid + # Prometheus scraping each target twice. monitoring: enabled: null + # Controls whether prometheus.io/* annotations are added to Services. + # + # Background: When monitoring.enabled is true, the chart creates + # ServiceMonitor CRDs that instruct the Prometheus Operator to scrape + # CloudZero Agent metrics. In that setup, the prometheus.io/* annotations on + # Services become redundant, and in clusters where both annotation-based + # and CRD-based discovery are active, the same metrics could be scraped twice. + # + # - true (default): Keep the prometheus.io/* annotations set on Services. + # This value ensures backward compatibility. + # + # - false: Remove the redundant prometheus.io/* annotations from Services. + scrapeAnnotations: true + # Namespace override for PrometheusRule and ServiceMonitor CRDs. # null (default) = same namespace as the agent installation. 
# Some Prometheus Operator deployments require CRDs to be in a specific diff --git a/tests/helm/template/alloy.yaml b/tests/helm/template/alloy.yaml index 8ed6edc7..0778c99c 100644 --- a/tests/helm/template/alloy.yaml +++ b/tests/helm/template/alloy.yaml @@ -1104,6 +1104,7 @@ data: enabled: null labels: {} namespace: null + scrapeAnnotations: true sharedSecret: false prometheus: image: