From 20a7bf75925228308f1e1c9a3a57bca3291611f7 Mon Sep 17 00:00:00 2001 From: Andrea Zagarella Date: Tue, 26 May 2026 00:36:48 +0200 Subject: [PATCH] Add monitoring.scrapeAnnotations to control prometheus.io/* Service annotations When monitoring.enabled is true (or "auto" with CRDs installed), the chart creates ServiceMonitor CRDs that instruct the Prometheus Operator to scrape CloudZero Agent metrics. In that configuration, the prometheus.io/* annotations on Services become redundant. In clusters where both annotation-based and CRD-based discovery are active simultaneously, metrics may be scraped twice, generating unnecessary load on the metrics pipeline. Adds components.monitoring.scrapeAnnotations (default: true) to address this. When set to false, prometheus.io/* annotations are omitted from the agent, aggregator, and webhook Services, eliminating the risk of double scraping while keeping ServiceMonitor-based discovery intact. Updates monitoring-infrastructure.md to document the new flag and the double scraping scenario, and adds helm unit tests covering both true and false states for all three affected Services. The default value of true preserves full backward compatibility for users relying on annotation-based Prometheus discovery without the Operator. This approach is purely additive: users without ServiceMonitors are unaffected, while Operator users gain a clean way to eliminate redundant scrape targets. A future consideration would be auto-disabling annotations when monitoring.enabled is true, though that would be a breaking change requiring a major version bump. 
--- helm/docs/monitoring-infrastructure.md | 30 ++++++++++---- helm/templates/agent-service.yaml | 6 ++- helm/templates/aggregator-service.yaml | 6 ++- helm/templates/webhook-service.yaml | 6 ++- helm/tests/defaults_service_test.yaml | 54 ++++++++++++++++++++++++++ helm/values.schema.json | 4 ++ helm/values.yaml | 20 +++++++++- tests/helm/template/alloy.yaml | 1 + 8 files changed, 114 insertions(+), 13 deletions(-) diff --git a/helm/docs/monitoring-infrastructure.md b/helm/docs/monitoring-infrastructure.md index d346d131d..df683b9f4 100644 --- a/helm/docs/monitoring-infrastructure.md +++ b/helm/docs/monitoring-infrastructure.md @@ -8,14 +8,20 @@ how they were validated. The chart provides two categories of monitoring integration: -1. **Prometheus `prometheus.io/*` annotations** on all Services (always enabled). - These allow standard Prometheus installations using `kubernetes_sd_configs` - to auto-discover and scrape CloudZero Agent metrics without any CRDs. +1. **Prometheus `prometheus.io/*` annotations** on all Services (enabled by + default, controlled by `components.monitoring.scrapeAnnotations`). These + allow standard Prometheus installations using `kubernetes_sd_configs` to + auto-discover and scrape CloudZero Agent metrics without any CRDs. 2. **Prometheus Operator CRDs** (opt-in via `components.monitoring.enabled`). When enabled, the chart creates `ServiceMonitor` and `PrometheusRule` resources that the Prometheus Operator automatically picks up. +When both are active simultaneously, Prometheus deployments that honor both +annotation-based discovery and ServiceMonitors may scrape each target twice. +Set `components.monitoring.scrapeAnnotations: false` to disable the annotations +when using ServiceMonitors. + These resources are designed to be useful regardless of the customer's monitoring stack. 
The `ServiceMonitor` and `PrometheusRule` CRDs are the standard interoperability format understood by the Prometheus Operator, but @@ -32,6 +38,10 @@ components: # false = never install CRDs (default while feature is being validated) enabled: false + # true (default) = keep prometheus.io/* annotations on Services + # false = remove redundant annotations from Services + scrapeAnnotations: true + # Override namespace for CRDs (default: same as agent namespace) namespace: "" @@ -288,11 +298,15 @@ Validated using multiple test scenarios on the `bach` cluster: Tested via `helm template` with all three modes: -| `components.monitoring.enabled` | ServiceMonitors | PrometheusRules | `prometheus.io/*` annotations | -| ------------------------------- | --------------- | --------------- | ----------------------------- | -| `null` (no CRDs in cluster) | 0 | 0 | 3 (always) | -| `true` | 4 | 1 | 3 (always) | -| `false` | 0 | 0 | 3 (always) | +| `components.monitoring.enabled` | ServiceMonitors | PrometheusRules | `prometheus.io/*` annotations | +| ------------------------------- | --------------- | --------------- | ----------------------------------------- | +| `null` (no CRDs in cluster) | 0 | 0 | 3 | +| `true` | 4 | 1 | 3 | +| `false` | 0 | 0 | 3 | + + Annotation count assumes `components.monitoring.scrapeAnnotations: true` +(default). Set to `false` to omit annotations, e.g. when `enabled` is `true` or +`"auto"` to avoid duplicate scraping. 
### Test Suite diff --git a/helm/templates/agent-service.yaml b/helm/templates/agent-service.yaml index eff24e69f..ed3037ade 100644 --- a/helm/templates/agent-service.yaml +++ b/helm/templates/agent-service.yaml @@ -11,11 +11,15 @@ metadata: .Values.commonMetaLabels ) ) | nindent 2 }} + {{- $promAnnotations := dict -}} + {{- if not (eq .Values.components.monitoring.scrapeAnnotations false) -}} + {{- $promAnnotations = dict "prometheus.io/scrape" "true" "prometheus.io/port" "9090" "prometheus.io/path" "/metrics" -}} + {{- end -}} {{- include "cloudzero-agent.generateAnnotations" (dict "root" . "annotations" (list .Values.defaults.annotations - (dict "prometheus.io/scrape" "true" "prometheus.io/port" "9090" "prometheus.io/path" "/metrics") + $promAnnotations ) ) | nindent 2 }} spec: diff --git a/helm/templates/aggregator-service.yaml b/helm/templates/aggregator-service.yaml index be32b42ba..8fac45148 100644 --- a/helm/templates/aggregator-service.yaml +++ b/helm/templates/aggregator-service.yaml @@ -12,12 +12,16 @@ metadata: .Values.components.aggregator.labels ) ) | nindent 2 }} + {{- $promAnnotations := dict -}} + {{- if not (eq .Values.components.monitoring.scrapeAnnotations false) -}} + {{- $promAnnotations = dict "prometheus.io/scrape" "true" "prometheus.io/port" (.Values.aggregator.collector.port | quote) "prometheus.io/path" "/metrics" -}} + {{- end -}} {{- include "cloudzero-agent.generateAnnotations" (dict "root" . 
"annotations" (list .Values.defaults.annotations .Values.components.aggregator.annotations - (dict "prometheus.io/scrape" "true" "prometheus.io/port" (.Values.aggregator.collector.port | quote) "prometheus.io/path" "/metrics") + $promAnnotations ) ) | nindent 2 }} spec: diff --git a/helm/templates/webhook-service.yaml b/helm/templates/webhook-service.yaml index b3703de4d..c208a36d0 100644 --- a/helm/templates/webhook-service.yaml +++ b/helm/templates/webhook-service.yaml @@ -11,13 +11,17 @@ metadata: .Values.components.webhookServer.labels ) ) | nindent 2 }} + {{- $promAnnotations := dict -}} + {{- if not (eq .Values.components.monitoring.scrapeAnnotations false) -}} + {{- $promAnnotations = dict "prometheus.io/scrape" "true" "prometheus.io/port" "8443" "prometheus.io/path" "/metrics" "prometheus.io/scheme" "https" -}} + {{- end -}} {{- include "cloudzero-agent.generateAnnotations" (dict "root" . "annotations" (list .Values.defaults.annotations .Values.components.webhookServer.annotations (dict "nginx.ingress.kubernetes.io/ssl-redirect" "false") - (dict "prometheus.io/scrape" "true" "prometheus.io/port" "8443" "prometheus.io/path" "/metrics" "prometheus.io/scheme" "https") + $promAnnotations ) ) | nindent 2 }} namespace: {{ .Release.Namespace }} diff --git a/helm/tests/defaults_service_test.yaml b/helm/tests/defaults_service_test.yaml index 9aa083d58..7e7f4a59a 100644 --- a/helm/tests/defaults_service_test.yaml +++ b/helm/tests/defaults_service_test.yaml @@ -2,15 +2,18 @@ # # This test validates that Service resources properly inherit # defaults.labels and defaults.annotations from the chart's defaults section. +# Also tests monitoring.scrapeAnnotations controls prometheus.io/* annotations. # # Services only support metadata-level defaults (labels and annotations). # PodSpec defaults (affinity, tolerations, etc.) do not apply to Services. 
# # Templates tested: +# - agent-service.yaml # - aggregator-service.yaml # - webhook-service.yaml suite: defaults.* properties apply to Service resources templates: + - agent-service.yaml - aggregator-service.yaml - webhook-service.yaml tests: @@ -91,3 +94,54 @@ tests: - equal: path: metadata.annotations.test-defaults-annotation value: sentinel-value-annotation + + # ============================================================================ + # monitoring.scrapeAnnotations tests + # ============================================================================ + - it: should include prometheus.io annotations on agent-service by default + template: agent-service.yaml + asserts: + - equal: + path: metadata.annotations["prometheus.io/scrape"] + value: "true" + + - it: should omit prometheus.io annotations on agent-service when scrapeAnnotations is false + template: agent-service.yaml + set: + components.monitoring.scrapeAnnotations: false + asserts: + - isNull: + path: metadata.annotations["prometheus.io/scrape"] + + - it: should include prometheus.io annotations on aggregator-service by default + template: aggregator-service.yaml + asserts: + - equal: + path: metadata.annotations["prometheus.io/scrape"] + value: "true" + + - it: should omit prometheus.io annotations on aggregator-service when scrapeAnnotations is false + template: aggregator-service.yaml + set: + components.monitoring.scrapeAnnotations: false + asserts: + - isNull: + path: metadata.annotations["prometheus.io/scrape"] + + - it: should include prometheus.io annotations on webhook-service by default + template: webhook-service.yaml + set: + insightsController.enabled: true + asserts: + - equal: + path: metadata.annotations["prometheus.io/scrape"] + value: "true" + + - it: should omit prometheus.io annotations on webhook-service when scrapeAnnotations is false + template: webhook-service.yaml + set: + insightsController.enabled: true + components.monitoring.scrapeAnnotations: false + asserts: + - 
isNull: + path: metadata.annotations["prometheus.io/scrape"] diff --git a/helm/values.schema.json b/helm/values.schema.json index 0f8afc47a..b23a586e7 100644 --- a/helm/values.schema.json +++ b/helm/values.schema.json @@ -6309,6 +6309,10 @@ } ] }, + "scrapeAnnotations": { + "default": true, + "type": "boolean" + }, "sharedSecret": { "default": false, "type": "boolean" diff --git a/helm/values.yaml b/helm/values.yaml index 47e1ad8b7..3d75bf4cd 100644 --- a/helm/values.yaml +++ b/helm/values.yaml @@ -859,11 +859,27 @@ components: # # To opt in now, set to "auto" or true. # - # Regardless of this setting, prometheus.io/* annotations are always added to - # Services for customers using standard Prometheus service discovery. + # By default, prometheus.io/* annotations are added to Services for customers + # using standard Prometheus service discovery. Set monitoring.scrapeAnnotations: + # false to disable them when using Prometheus Operator ServiceMonitors to avoid + # Prometheus scraping each target twice. monitoring: enabled: null + # Controls whether prometheus.io/* annotations are added to Services. + # + # Background: When monitoring.enabled is true, the chart creates + # ServiceMonitor CRDs that instruct the Prometheus Operator to scrape + # CloudZero Agent metrics. In that setup, the prometheus.io/* annotations on + # Services become redundant, and in clusters where both annotation-based + # and CRD-based discovery are active, the same metrics could be scraped twice. + # + # - true (default): Keep the prometheus.io/* annotations set on Services. + # This value ensures backward compatibility. + # + # - false: Remove the redundant prometheus.io/* annotations from Services. + scrapeAnnotations: true + # Namespace override for PrometheusRule and ServiceMonitor CRDs. # null (default) = same namespace as the agent installation. 
# Some Prometheus Operator deployments require CRDs to be in a specific diff --git a/tests/helm/template/alloy.yaml b/tests/helm/template/alloy.yaml index 8ed6edc7f..0778c99c2 100644 --- a/tests/helm/template/alloy.yaml +++ b/tests/helm/template/alloy.yaml @@ -1104,6 +1104,7 @@ data: enabled: null labels: {} namespace: null + scrapeAnnotations: true sharedSecret: false prometheus: image: