diff --git a/.github/workflows/crds.yaml b/.github/workflows/crds.yaml index 376cfcf0a..83ac09ac6 100644 --- a/.github/workflows/crds.yaml +++ b/.github/workflows/crds.yaml @@ -16,14 +16,14 @@ jobs: name: docs steps: - name: Check out operator code - uses: actions/checkout@v5 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: repository: VictoriaMetrics/operator token: ${{ secrets.VM_BOT_GH_TOKEN }} path: __vm-operator-repo - name: Check out VM code - uses: actions/checkout@v5 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: repository: VictoriaMetrics/helm-charts ref: master @@ -31,7 +31,7 @@ jobs: path: __vm-charts-repo - name: Import GPG key - uses: crazy-max/ghaction-import-gpg@v6 + uses: crazy-max/ghaction-import-gpg@2dc316deee8e90f13e1a351ab510b4d5bc0c82cd # v7.0.0 id: import-gpg with: gpg_private_key: ${{ secrets.VM_BOT_GPG_PRIVATE_KEY }} @@ -52,7 +52,7 @@ jobs: working-directory: __vm-charts-repo - name: Create Pull Request - uses: peter-evans/create-pull-request@v7 + uses: peter-evans/create-pull-request@c0f553fe549906ede9cf27b5156039d195d2ece0 # v8.1.0 with: add-paths: charts commit-message: Automatic update operator crds from ${{ github.repository }}@${{ steps.update.outputs.SHORT_SHA }} diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 728003172..a47c2a063 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -20,12 +20,12 @@ jobs: url: https://docs.victoriametrics.com/operator steps: - name: Checkout operator repo - uses: actions/checkout@v5 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: path: __vm-operator - name: Checkout docs repo - uses: actions/checkout@v5 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: repository: VictoriaMetrics/vmdocs ref: main @@ -33,7 +33,7 @@ jobs: path: __vm-docs - name: Setup Go - uses: actions/setup-go@v6 + uses: 
actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0 with: go-version-file: '__vm-operator/go.mod' check-latest: true @@ -42,7 +42,7 @@ jobs: - name: Import GPG key id: import-gpg - uses: crazy-max/ghaction-import-gpg@v6 + uses: crazy-max/ghaction-import-gpg@2dc316deee8e90f13e1a351ab510b4d5bc0c82cd # v7.0.0 with: gpg_private_key: ${{ secrets.VM_BOT_GPG_PRIVATE_KEY }} passphrase: ${{ secrets.VM_BOT_PASSPHRASE }} diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 70bac6e19..2cc4e4ad9 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -60,14 +60,14 @@ jobs: SAVED=$((AFTER-BEFORE)) echo "Saved $(formatByteCount $SAVED)" - name: Checkout code - uses: actions/checkout@v5 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: Prepare binary cache - uses: actions/cache@v4 + uses: actions/cache@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4 with: path: ./bin key: binary - name: Setup Go - uses: actions/setup-go@v6 + uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0 with: go-version-file: "go.mod" check-latest: true @@ -75,7 +75,7 @@ jobs: id: go - name: Run Trivy vulnerability scanner in repo mode - uses: aquasecurity/trivy-action@master + uses: aquasecurity/trivy-action@57a97c7e7821a5776cebc9bb87c984fa69cba8f1 # v0.35.0 with: scan-type: "fs" ignore-unfixed: true @@ -83,7 +83,7 @@ jobs: output: "trivy-results.sarif" - name: Upload Trivy scan results to GitHub Security tab - uses: github/codeql-action/upload-sarif@v4 + uses: github/codeql-action/upload-sarif@c6f931105cb2c34c8f901cc885ba1e2e259cf745 # v4.34.0 with: sarif_file: "trivy-results.sarif" @@ -119,7 +119,7 @@ jobs: git fetch origin ${{ github.base_ref || 'master' }} BASE_REF=origin/${{ github.base_ref || 'master' }} TAG=${TAG} make test-e2e - name: Publish Test Report - uses: mikepenz/action-junit-report@v6 + uses: mikepenz/action-junit-report@49b2ca06f62aa7ef83ae6769a2179271e160d8e4 # v6.3.1 if: 
success() || failure() with: report_paths: 'report.xml' @@ -128,7 +128,7 @@ jobs: run: make allure-report - name: Archive Allure report if: github.event.pull_request.draft == false && failure() - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 with: name: allure-report path: ./allure-report diff --git a/.github/workflows/operatorhub.yaml b/.github/workflows/operatorhub.yaml index f120c99a9..7e7c2c72b 100644 --- a/.github/workflows/operatorhub.yaml +++ b/.github/workflows/operatorhub.yaml @@ -15,6 +15,8 @@ jobs: update: name: Publish new OperatorHub release runs-on: ubuntu-latest + env: + CHANNEL: ${{ startsWith(github.event.workflow_run.head_branch, 'release') && 'stable' || 'beta' }} if: ${{ (github.event.workflow_run.conclusion == 'success' && ! contains(github.event.workflow_run.head_branch, '-')) || github.event_name == 'workflow_dispatch' && github.event.workflow_run.release != 'prereleased' }} strategy: matrix: @@ -33,7 +35,7 @@ jobs: GH_TOKEN: ${{ secrets.VM_BOT_GH_TOKEN }} - name: Check out OperatorHub operators repo fork - uses: actions/checkout@v5 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: repository: ${{ matrix.repo.upstream }} ref: main @@ -41,7 +43,7 @@ jobs: path: __operatorhub-repo - name: Import GPG key - uses: crazy-max/ghaction-import-gpg@v6 + uses: crazy-max/ghaction-import-gpg@2dc316deee8e90f13e1a351ab510b4d5bc0c82cd # v7.0.0 id: import-gpg with: gpg_private_key: ${{ secrets.VM_BOT_GPG_PRIVATE_KEY }} @@ -50,7 +52,7 @@ jobs: git_commit_gpgsign: true workdir: __operatorhub-repo - - uses: dawidd6/action-download-artifact@v11 + - uses: dawidd6/action-download-artifact@1f8785ff7a5130826f848e7f72725c85d241860f # v18 with: name: olm workflow: release.yaml @@ -58,14 +60,6 @@ jobs: run_id: ${{ github.event.workflow_run.id }} path: bundle - - name: Install opm - run: | - OPM_VERSION=v1.65.0 - curl -fsSLO 
https://github.com/operator-framework/operator-registry/releases/download/${OPM_VERSION}/linux-amd64-opm - curl -fsSLO https://github.com/operator-framework/operator-registry/releases/download/${OPM_VERSION}/checksums.txt - grep ' linux-amd64-opm$' checksums.txt | sha256sum -c - - install -m 0755 linux-amd64-opm /usr/local/bin/opm - - name: Add operatorhub bundle id: update run: | @@ -78,8 +72,12 @@ jobs: export OPERATOR_DIR=__operatorhub-repo/operators/${OPERATOR_NAME} mkdir -p ${OPERATOR_DIR} - NEW_VERSION=$(ls bundle | head -1) - export OLD_VERSION=$(find ${OPERATOR_DIR}/* ! -path "*/catalog-templates" -maxdepth 0 -type d -exec basename {} \; | sort -V -r | head -1) + export NEW_VERSION=$(ls bundle | head -1) + export OLD_VERSION=$( + { find ${OPERATOR_DIR}/* ! -path "*/catalog-templates" -maxdepth 0 -type d -exec basename {} \; + echo "${NEW_VERSION}" + } | sort -V | grep -B1 "^${NEW_VERSION}$" | grep -v "^${NEW_VERSION}$" + ) export OLD_ENTRY="${OPERATOR_NAME}.v${OLD_VERSION}" if [ ! -z $OLD_VERSION ]; then @@ -89,14 +87,16 @@ jobs: mv bundle/${NEW_VERSION} ${OPERATOR_DIR}/ if [ -f ${OPERATOR_DIR}/Makefile ]; then - opm render ${OPERATOR_DIR}/${NEW_VERSION} --output=yaml \ - | yq 'select(.schema == "olm.bundle")' > /tmp/new-bundle.yaml - + yq -n '.catalog_templates = []' > ${OPERATOR_DIR}/${NEW_VERSION}/release-config.yaml for TEMPLATE in ${OPERATOR_DIR}/catalog-templates/*.yaml; do - PREV_HEAD=$(yq '.entries[] | select(.schema == "olm.channel") | .entries[-1].name' "${TEMPLATE}") - NEW_VERSION="${NEW_VERSION}" PREV_HEAD="${PREV_HEAD}" \ - yq -i '(.entries[] | select(.schema == "olm.channel") | .entries) += [{"name": "victoriametrics-operator.v" + strenv(NEW_VERSION), "replaces": strenv(PREV_HEAD)}]' "${TEMPLATE}" - yq -i '.entries += [load("/tmp/new-bundle.yaml")]' "${TEMPLATE}" + export TPL=$(basename ${TEMPLATE}) + if [ "$CHANNEL" = "stable" ] && ! 
yq -e '.entries[] | select(.schema == "olm.channel" and .name == "stable")' "$TEMPLATE" > /dev/null 2>&1 && [ -n "$OLD_VERSION" ]; then + yq -i '.catalog_templates += [{"template_name": strenv(TPL), "channels": ["stable","beta"], "replaces": strenv(OLD_ENTRY)}]' ${OPERATOR_DIR}/${NEW_VERSION}/release-config.yaml + elif [ -n "$OLD_VERSION" ]; then + yq -i '.catalog_templates += [{"template_name": strenv(TPL), "channels": [strenv(CHANNEL)], "replaces": strenv(OLD_ENTRY)}]' ${OPERATOR_DIR}/${NEW_VERSION}/release-config.yaml + else + yq -i '.catalog_templates += [{"template_name": strenv(TPL), "channels": [strenv(CHANNEL)]}]' ${OPERATOR_DIR}/${NEW_VERSION}/release-config.yaml + fi done fi @@ -104,7 +104,7 @@ jobs: - name: Create Pull Request if: ${{ steps.update.outputs.VERSION != '' }} - uses: peter-evans/create-pull-request@v7 + uses: peter-evans/create-pull-request@c0f553fe549906ede9cf27b5156039d195d2ece0 # v8.1.0 with: add-paths: operators/victoriametrics-operator commit-message: 'victoriametrics-operator: ${{ steps.update.outputs.VERSION }}' diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 7f27e0853..2848cc1bb 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -14,14 +14,14 @@ jobs: pages: write steps: - name: Checkout code - uses: actions/checkout@v5 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: Prepare binary cache - uses: actions/cache@v4 + uses: actions/cache@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4 with: path: ./bin key: binary - name: Setup Go - uses: actions/setup-go@v6 + uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0 with: go-version-file: 'go.mod' check-latest: true @@ -42,7 +42,7 @@ jobs: gh release upload ${{github.event.release.tag_name}} ./dist/install-no-webhook.yaml#install-no-webhook.yaml --clobber || echo "fix me NOT enough security permissions" gh release upload ${{github.event.release.tag_name}} 
./dist/install-with-webhook.yaml#install-with-webhook.yaml --clobber || echo "fix me NOT enough security permissions" gh release upload ${{github.event.release.tag_name}} ./config/crd/overlay/crd.yaml#crd.yaml --clobber || echo "fix me NOT enough security permissions" - - uses: actions/upload-artifact@v5 + - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 with: name: olm path: bundle diff --git a/.github/workflows/sandbox.yaml b/.github/workflows/sandbox.yaml index 551e28313..5a4d772a8 100644 --- a/.github/workflows/sandbox.yaml +++ b/.github/workflows/sandbox.yaml @@ -25,7 +25,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout operator - uses: actions/checkout@v5 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: repository: VictoriaMetrics/operator ref: ${{ github.event.inputs.branch }} @@ -40,7 +40,7 @@ jobs: TAG=$IMAGE_TAG make docker-push - name: Checkout ops - uses: actions/checkout@v5 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: repository: VictoriaMetrics/ops ref: main @@ -48,7 +48,7 @@ jobs: path: __vm-ops-repo - name: Import GPG key - uses: crazy-max/ghaction-import-gpg@v6 + uses: crazy-max/ghaction-import-gpg@2dc316deee8e90f13e1a351ab510b4d5bc0c82cd # v7.0.0 id: import-gpg with: gpg_private_key: ${{ secrets.VM_BOT_GPG_PRIVATE_KEY }} @@ -66,7 +66,7 @@ jobs: working-directory: __vm-ops-repo - name: Create Pull Request - uses: peter-evans/create-pull-request@v7 + uses: peter-evans/create-pull-request@c0f553fe549906ede9cf27b5156039d195d2ece0 # v8.1.0 with: add-paths: ${{ steps.update.outputs.OPERATOR_PATH }} commit-message: Automatic update operator version on sandbox from ${{ github.repository }}@${{ env.IMAGE_TAG }} diff --git a/.github/workflows/upgrade-tests.yaml b/.github/workflows/upgrade-tests.yaml index 13795c1e3..02727a8fa 100644 --- a/.github/workflows/upgrade-tests.yaml +++ b/.github/workflows/upgrade-tests.yaml @@ -26,14 +26,14 @@ jobs: 
SAVED=$((AFTER-BEFORE)) echo "Saved $(formatByteCount $SAVED)" - name: Checkout code - uses: actions/checkout@v6 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: Prepare binary cache - uses: actions/cache@v5 + uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5 with: path: ./bin key: binary - name: Setup Go - uses: actions/setup-go@v6 + uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0 with: go-version-file: "go.mod" check-latest: true @@ -55,7 +55,7 @@ jobs: sudo apt install -y libgpgme-dev TAG=${TAG} make test-e2e-upgrade - name: Publish Test Report - uses: mikepenz/action-junit-report@v6 + uses: mikepenz/action-junit-report@bccf2e31636835cf0874589931c4116687171386 # v6.4.0 if: success() || failure() with: report_paths: 'report.xml' @@ -64,7 +64,7 @@ jobs: run: make allure-report - name: Archive Allure report if: failure() - uses: actions/upload-artifact@v7 + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: allure-report path: ./allure-report diff --git a/Makefile b/Makefile index ef3832589..6fb9ba378 100644 --- a/Makefile +++ b/Makefile @@ -9,6 +9,7 @@ TAG ?= $(shell echo $$(git describe --long --all | tr '/' '-')$$( \ git diff-index --quiet HEAD -- || echo '-dirty-'$$( \ git diff-index -u HEAD -- ':!config' ':!docs' | openssl sha1 | cut -d' ' -f2 | cut -c 1-8))) OPERATOR_IMAGE ?= $(REGISTRY)/$(ORG)/$(REPO):$(TAG) +CONFIG_RELOADER_IMAGE ?= $(REGISTRY)/$(ORG)/$(REPO):config-reloader-$(TAG) VERSION ?= $(if $(findstring $(TAG),$(TAG:v%=%)),0.0.0,$(TAG:v%=%)) DATEINFO_TAG ?= $(shell date -u +'%Y%m%d-%H%M%S') NAMESPACE ?= vm @@ -162,7 +163,7 @@ test: manifests generate fmt vet envtest ## Run tests. # Utilize Kind or modify the e2e tests to load the image locally, enabling compatibility with other vendors. .PHONY: test-e2e # Run the e2e tests against a Kind k8s instance that is spun up. 
test-e2e: load-kind ginkgo crust-gather mirrord - env CGO_ENABLED=1 OPERATOR_IMAGE=$(OPERATOR_IMAGE) REPORTS_DIR=$(shell pwd) CRUST_GATHER_BIN=$(CRUST_GATHER_BIN) $(MIRRORD_BIN) exec -f ./mirrord.json -- $(GINKGO_BIN) \ + env CGO_ENABLED=1 OPERATOR_IMAGE=$(OPERATOR_IMAGE) CONFIG_RELOADER_IMAGE=$(CONFIG_RELOADER_IMAGE) REPORTS_DIR=$(shell pwd) CRUST_GATHER_BIN=$(CRUST_GATHER_BIN) $(MIRRORD_BIN) exec -f ./mirrord.json -- $(GINKGO_BIN) \ -ldflags="-linkmode=external" \ --output-interceptor-mode=none \ -procs=$(E2E_TESTS_CONCURRENCY) \ @@ -210,6 +211,10 @@ docker-build: ## Build docker image with the manager. ${DOCKER_BUILD_ARGS} \ -t $(REGISTRY)/$(ORG)/$(REPO):$(TAG) . +.PHONY: docker-build-config-reloader +docker-build-config-reloader: ## Build docker image with config-reloader. + TAG=config-reloader-$(TAG) COMPONENT=config-reloader ROOT=./cmd/config-reloader $(MAKE) docker-build + build-operator: ROOT=./cmd build-operator: build @@ -318,13 +323,18 @@ undeploy: kustomize ## Undeploy controller from the K8s cluster specified in ~/. $(KUSTOMIZE) build $(OVERLAY) | $(KUBECTL) delete $(if $(NAMESPACE),-n $(NAMESPACE),) --ignore-not-found=$(ignore-not-found) -f - # builds image and loads it into kind. 
-load-kind: docker-build kind +ensure-kind-cluster: kind if [ "`$(KIND) get clusters`" != "kind" ]; then \ $(KIND) create cluster --config=./kind.yaml; \ else \ $(KUBECTL) cluster-info --context kind-kind; \ - fi; \ - $(KIND) load docker-image $(REGISTRY)/$(ORG)/$(REPO):$(TAG); \ + fi + +load-kind: docker-build docker-build-config-reloader ensure-kind-cluster + if [ "$(CONTAINER_TOOL)" != "podman" ]; then \ + $(KIND) load docker-image $(REGISTRY)/$(ORG)/$(REPO):$(TAG); \ + $(KIND) load docker-image $(CONFIG_RELOADER_IMAGE); \ + fi deploy-kind: OVERLAY=config/base-with-webhook deploy-kind: load-kind deploy @@ -458,7 +468,7 @@ $(MIRRORD_BIN): $(LOCALBIN) .PHONY: allure-report allure-report: - npx allure awesome --single-file ./allure-results -o ./allure-report + @[ -d ./allure-results ] && npx allure awesome --single-file ./allure-results -o ./allure-report || echo "allure-results dir not found, skipping report generation" # go-install-tool will 'go install' any package with custom target and name of binary, if it doesn't exist # $1 - target path with name of binary (ideally with version) diff --git a/api/operator/v1/cluster_types_test.go b/api/operator/v1/cluster_types_test.go index 0b3e53871..1f2f3e42d 100644 --- a/api/operator/v1/cluster_types_test.go +++ b/api/operator/v1/cluster_types_test.go @@ -9,10 +9,11 @@ import ( vmv1beta1 "github.com/VictoriaMetrics/operator/api/operator/v1beta1" ) +//nolint:dupl func TestVTCluster_AvailableStorageNodeIDs(t *testing.T) { - f := func(cr *VTCluster, requestsType string, want []int32) { + f := func(cr *VTCluster, kind vmv1beta1.ClusterComponent, want []int32) { t.Helper() - assert.Equal(t, want, cr.AvailableStorageNodeIDs(requestsType)) + assert.Equal(t, want, cr.AvailableStorageNodeIDs(kind)) } cr := &VTCluster{ @@ -28,10 +29,10 @@ func TestVTCluster_AvailableStorageNodeIDs(t *testing.T) { } // select excludes maintenance nodes - f(cr, "select", []int32{0, 2, 4}) + f(cr, vmv1beta1.ClusterComponentSelect, []int32{0, 2, 4}) 
// insert excludes maintenance nodes - f(cr, "insert", []int32{1, 2, 3}) + f(cr, vmv1beta1.ClusterComponentInsert, []int32{1, 2, 3}) // no maintenance nodes f(&VTCluster{ @@ -40,13 +41,14 @@ func TestVTCluster_AvailableStorageNodeIDs(t *testing.T) { CommonAppsParams: vmv1beta1.CommonAppsParams{ReplicaCount: ptr.To(int32(3))}, }, }, - }, "select", []int32{0, 1, 2}) + }, vmv1beta1.ClusterComponentSelect, []int32{0, 1, 2}) } +//nolint:dupl func TestVLCluster_AvailableStorageNodeIDs(t *testing.T) { - f := func(cr *VLCluster, requestsType string, want []int32) { + f := func(cr *VLCluster, kind vmv1beta1.ClusterComponent, want []int32) { t.Helper() - assert.Equal(t, want, cr.AvailableStorageNodeIDs(requestsType)) + assert.Equal(t, want, cr.AvailableStorageNodeIDs(kind)) } cr := &VLCluster{ @@ -62,10 +64,10 @@ func TestVLCluster_AvailableStorageNodeIDs(t *testing.T) { } // select excludes maintenance nodes - f(cr, "select", []int32{0, 2, 4}) + f(cr, vmv1beta1.ClusterComponentSelect, []int32{0, 2, 4}) // insert excludes maintenance nodes - f(cr, "insert", []int32{1, 2, 3}) + f(cr, vmv1beta1.ClusterComponentInsert, []int32{1, 2, 3}) // no maintenance nodes f(&VLCluster{ @@ -74,5 +76,5 @@ func TestVLCluster_AvailableStorageNodeIDs(t *testing.T) { CommonAppsParams: vmv1beta1.CommonAppsParams{ReplicaCount: ptr.To(int32(3))}, }, }, - }, "select", []int32{0, 1, 2}) + }, vmv1beta1.ClusterComponentSelect, []int32{0, 1, 2}) } diff --git a/api/operator/v1/vlcluster_types.go b/api/operator/v1/vlcluster_types.go index 339918e38..702ca8e33 100644 --- a/api/operator/v1/vlcluster_types.go +++ b/api/operator/v1/vlcluster_types.go @@ -776,21 +776,27 @@ func (cr *VLCluster) Validate() error { } // AvailableStorageNodeIDs returns ids of the storage nodes for the provided component -func (cr *VLCluster) AvailableStorageNodeIDs(requestsType string) []int32 { +func (cr *VLCluster) AvailableStorageNodeIDs(kind vmv1beta1.ClusterComponent) []int32 { var result []int32 - if cr.Spec.VLStorage == nil 
|| cr.Spec.VLStorage.ReplicaCount == nil { + if cr.Spec.VLStorage == nil || (cr.Spec.VLStorage.ReplicaCount == nil && cr.Spec.VLStorage.HPA == nil) { return result } maintenanceNodes := sets.New[int32]() - switch requestsType { - case "select": + switch kind { + case vmv1beta1.ClusterComponentSelect: maintenanceNodes.Insert(cr.Spec.VLStorage.MaintenanceSelectNodeIDs...) - case "insert": + case vmv1beta1.ClusterComponentInsert: maintenanceNodes.Insert(cr.Spec.VLStorage.MaintenanceInsertNodeIDs...) default: - panic("BUG unsupported requestsType: " + requestsType) + panic("BUG unsupported kind: " + string(kind)) + } + var replicaCount int32 + if cr.Spec.VLStorage.ReplicaCount != nil { + replicaCount = *cr.Spec.VLStorage.ReplicaCount + } else if cr.Spec.VLStorage.HPA != nil { + replicaCount = cr.Spec.VLStorage.HPA.GetMinReplicas() } - for i := int32(0); i < *cr.Spec.VLStorage.ReplicaCount; i++ { + for i := int32(0); i < replicaCount; i++ { if maintenanceNodes.Has(i) { continue } diff --git a/api/operator/v1/vmanomaly_types.go b/api/operator/v1/vmanomaly_types.go index 5dc3a09db..bfe29e9a8 100644 --- a/api/operator/v1/vmanomaly_types.go +++ b/api/operator/v1/vmanomaly_types.go @@ -115,6 +115,10 @@ type VMAnomalyWritersSpec struct { // Metrics to save the output (in metric names or labels) // +optional MetricFormat VMAnomalyVMWriterMetricFormatSpec `json:"metricFormat,omitempty" yaml:"metric_format,omitempty"` + // ConnectionRetryAttempts defines the number of attempts to retry the connection in case of failure + // +optional + // +kubebuilder:validation:Minimum=1 + ConnectionRetryAttempts int `json:"connectionRetryAttempts,omitempty" yaml:"connection_retry_attempts,omitempty"` // +optional VMAnomalyHTTPClientSpec `json:",inline,omitempty" yaml:",inline,omitempty"` } @@ -161,9 +165,12 @@ type VMAnomalyReadersSpec struct { QueryFromLastSeenTimestamp bool `json:"queryFromLastSeenTimestamp,omitempty" yaml:"query_from_last_seen_timestamp,omitempty"` // It allows overriding 
the default -search.latencyOffsetflag of VictoriaMetrics LatencyOffset string `json:"latencyOffset,omitempty" yaml:"latency_offset,omitempty"` + // Offset adds a time shift to the query window for all queries, e.g. to account for delayed data ingestion + // +optional + Offset string `json:"offset,omitempty" yaml:"offset,omitempty"` // Optional argoverrides how search.maxPointsPerTimeseries flagimpacts vmanomaly on splitting long fitWindow queries into smaller sub-intervals MaxPointsPerQuery int `json:"maxPointsPerQuery,omitempty" yaml:"max_points_per_query,omitempty"` - // Optional argumentspecifies the IANA timezone to account for local shifts, like DST, in models sensitive to seasonal patterns + // Optional argument specifies the IANA timezone to account for local shifts, like DST, in models sensitive to seasonal patterns Timezone string `json:"tz,omitempty" yaml:"tz,omitempty"` // Optional argumentallows defining valid data ranges for input of all the queries in queries DataRange []string `json:"dataRange,omitempty" yaml:"data_range,omitempty"` @@ -180,6 +187,11 @@ type VMAnomalyStatus struct { ParsingSpecError string `json:"-" yaml:"-"` } +// SetLastSpec implements objectWithLastAppliedState interface +func (cr *VMAnomaly) SetLastSpec(prevSpec VMAnomalySpec) { + cr.ParsedLastAppliedSpec = &prevSpec +} + // GetStatusMetadata returns metadata for object status func (cr *VMAnomaly) GetStatusMetadata() *vmv1beta1.StatusMetadata { return &cr.Status.StatusMetadata @@ -205,9 +217,8 @@ type VMAnomaly struct { Spec VMAnomalySpec `json:"spec,omitempty"` // ParsedLastAppliedSpec contains last-applied configuration spec - ParsedLastAppliedSpec *VMAnomalySpec `json:"-" yaml:"-"` - - Status VMAnomalyStatus `json:"status,omitempty"` + ParsedLastAppliedSpec *VMAnomalySpec `json:"-" yaml:"-"` + Status VMAnomalyStatus `json:"status,omitempty"` } // VMAnomalyMonitoringSpec defines configuration for VMAnomaly monitoring @@ -218,7 +229,7 @@ type VMAnomalyMonitoringSpec struct { } 
// VMAnomalyMonitoringPullSpec defines pull monitoring configuration -// which is enabled by default and served at POD_IP:8490/metrics +// which is enabled by default and served at POD_IP:8080/metrics type VMAnomalyMonitoringPullSpec struct { // Port defines a port for metrics scrape Port string `json:"port"` @@ -253,16 +264,14 @@ type VMAnomalyServerSpec struct { // MaxConcurrentTasks defines maximum number of concurrent anomaly detection tasks // +optional // +kubebuilder:validation:Minimum=1 - // +kubebuilder:validation:Maximum=20 MaxConcurrentTasks int `json:"maxConcurrentTasks,omitempty" yaml:"max_concurrent_tasks,omitempty"` // UIDefaultState defines default query state for anomaly UI // +optional UIDefaultState string `json:"uiDefaultState,omitempty" yaml:"ui_default_state,omitempty"` -} - -// SetLastSpec implements objectWithLastAppliedState interface -func (cr *VMAnomaly) SetLastSpec(prevSpec VMAnomalySpec) { - cr.ParsedLastAppliedSpec = &prevSpec + // UseReaderConnectionSettings when set to true, anomaly UI reuses connection settings + // (credentials, TLS, etc.) 
from the reader configuration to connect to datasources + // +optional + UseReaderConnectionSettings bool `json:"useReaderConnectionSettings,omitempty" yaml:"use_reader_connection_settings,omitempty"` } // AsOwner returns owner references with current object as owner @@ -304,11 +313,7 @@ func (cr *VMAnomaly) GetStatus() *VMAnomalyStatus { // DefaultStatusFields implements reconcile.ObjectWithDeepCopyAndStatus interface func (cr *VMAnomaly) DefaultStatusFields(vs *VMAnomalyStatus) { - var shardCnt int32 - if cr.IsSharded() { - shardCnt = *cr.Spec.ShardCount - } - vs.Shards = shardCnt + vs.Shards = cr.GetShardCount() } // UnmarshalJSON implements json.Unmarshaler interface @@ -407,7 +412,10 @@ func (cr *VMAnomaly) GetServiceScrape() *vmv1beta1.VMServiceScrapeSpec { // Port returns port for accessing anomaly UI func (cr *VMAnomaly) Port() string { - return cr.Spec.Port + if cr == nil || cr.Spec.Server == nil || len(cr.Spec.Server.Port) == 0 { + return "8490" + } + return cr.Spec.Server.Port } // GetVolumeName returns volume name for persistent storage @@ -438,7 +446,10 @@ func (cr *VMAnomaly) ProbeScheme() string { // ProbePort implements build.probeCRD interface func (cr *VMAnomaly) ProbePort() string { - return cr.Port() + if cr == nil || cr.Spec.Monitoring == nil || cr.Spec.Monitoring.Pull == nil || len(cr.Spec.Monitoring.Pull.Port) == 0 { + return "8080" + } + return cr.Spec.Monitoring.Pull.Port } // ProbeNeedLiveness implements build.probeCRD interface diff --git a/api/operator/v1/vtcluster_types.go b/api/operator/v1/vtcluster_types.go index 023649f88..b106399ca 100644 --- a/api/operator/v1/vtcluster_types.go +++ b/api/operator/v1/vtcluster_types.go @@ -687,21 +687,27 @@ func (cr *VTCluster) Validate() error { } // AvailableStorageNodeIDs returns ids of the storage nodes for the provided component -func (cr *VTCluster) AvailableStorageNodeIDs(requestsType string) []int32 { +func (cr *VTCluster) AvailableStorageNodeIDs(kind vmv1beta1.ClusterComponent) []int32 { 
var result []int32 - if cr.Spec.Storage == nil || cr.Spec.Storage.ReplicaCount == nil { + if cr.Spec.Storage == nil || (cr.Spec.Storage.ReplicaCount == nil && cr.Spec.Storage.HPA == nil) { return result } maintenanceNodes := sets.New[int32]() - switch requestsType { - case "select": + switch kind { + case vmv1beta1.ClusterComponentSelect: maintenanceNodes.Insert(cr.Spec.Storage.MaintenanceSelectNodeIDs...) - case "insert": + case vmv1beta1.ClusterComponentInsert: maintenanceNodes.Insert(cr.Spec.Storage.MaintenanceInsertNodeIDs...) default: - panic("BUG unsupported requestsType: " + requestsType) + panic("BUG unsupported kind: " + string(kind)) + } + var replicaCount int32 + if cr.Spec.Storage.ReplicaCount != nil { + replicaCount = *cr.Spec.Storage.ReplicaCount + } else if cr.Spec.Storage.HPA != nil { + replicaCount = cr.Spec.Storage.HPA.GetMinReplicas() } - for i := int32(0); i < *cr.Spec.Storage.ReplicaCount; i++ { + for i := int32(0); i < replicaCount; i++ { if maintenanceNodes.Has(i) { continue } diff --git a/api/operator/v1beta1/vmcluster_types.go b/api/operator/v1beta1/vmcluster_types.go index d4b2b1dad..f404f15a5 100644 --- a/api/operator/v1beta1/vmcluster_types.go +++ b/api/operator/v1beta1/vmcluster_types.go @@ -715,21 +715,27 @@ func (cr *VMCluster) Validate() error { } // AvailableStorageNodeIDs returns ids of the storage nodes for the provided component -func (cr *VMCluster) AvailableStorageNodeIDs(requestsType string) []int32 { +func (cr *VMCluster) AvailableStorageNodeIDs(kind ClusterComponent) []int32 { var result []int32 - if cr.Spec.VMStorage == nil || cr.Spec.VMStorage.ReplicaCount == nil { + if cr.Spec.VMStorage == nil || (cr.Spec.VMStorage.ReplicaCount == nil && cr.Spec.VMStorage.HPA == nil) { return result } maintenanceNodes := sets.New[int32]() - switch requestsType { - case "select": + switch kind { + case ClusterComponentSelect: maintenanceNodes.Insert(cr.Spec.VMStorage.MaintenanceSelectNodeIDs...) 
- case "insert": + case ClusterComponentInsert: maintenanceNodes.Insert(cr.Spec.VMStorage.MaintenanceInsertNodeIDs...) default: - panic("BUG unsupported requestsType: " + requestsType) + panic("BUG unsupported kind: " + string(kind)) + } + var replicaCount int32 + if cr.Spec.VMStorage.ReplicaCount != nil { + replicaCount = *cr.Spec.VMStorage.ReplicaCount + } else if cr.Spec.VMStorage.HPA != nil { + replicaCount = cr.Spec.VMStorage.HPA.GetMinReplicas() } - for i := int32(0); i < *cr.Spec.VMStorage.ReplicaCount; i++ { + for i := int32(0); i < replicaCount; i++ { if maintenanceNodes.Has(i) { continue } diff --git a/api/operator/v1beta1/vmcluster_types_test.go b/api/operator/v1beta1/vmcluster_types_test.go index 032f8f591..8cbace183 100644 --- a/api/operator/v1beta1/vmcluster_types_test.go +++ b/api/operator/v1beta1/vmcluster_types_test.go @@ -89,9 +89,9 @@ func TestVMBackup_SnapshotCreatePathWithFlags(t *testing.T) { } func TestVMCluster_AvailableStorageNodeIDs(t *testing.T) { - f := func(cr *VMCluster, requestsType string, want []int32) { + f := func(cr *VMCluster, kind ClusterComponent, want []int32) { t.Helper() - assert.Equal(t, want, cr.AvailableStorageNodeIDs(requestsType)) + assert.Equal(t, want, cr.AvailableStorageNodeIDs(kind)) } cr := &VMCluster{ @@ -107,10 +107,10 @@ func TestVMCluster_AvailableStorageNodeIDs(t *testing.T) { } // select excludes maintenance nodes - f(cr, "select", []int32{0, 2, 4}) + f(cr, ClusterComponentSelect, []int32{0, 2, 4}) // insert excludes maintenance nodes - f(cr, "insert", []int32{1, 2, 3}) + f(cr, ClusterComponentInsert, []int32{1, 2, 3}) // no maintenance nodes f(&VMCluster{ @@ -119,5 +119,5 @@ func TestVMCluster_AvailableStorageNodeIDs(t *testing.T) { CommonAppsParams: CommonAppsParams{ReplicaCount: ptr.To(int32(3))}, }, }, - }, "select", []int32{0, 1, 2}) + }, ClusterComponentSelect, []int32{0, 1, 2}) } diff --git a/api/operator/v1beta1/vmextra_types.go b/api/operator/v1beta1/vmextra_types.go index 61642e1d2..92c0599ff 
100644 --- a/api/operator/v1beta1/vmextra_types.go +++ b/api/operator/v1beta1/vmextra_types.go @@ -435,6 +435,14 @@ type EmbeddedHPA struct { Behaviour *autoscalingv2.HorizontalPodAutoscalerBehavior `json:"behaviour,omitempty"` } +// GetMinReplicas returns default minReplicas value +func (cr *EmbeddedHPA) GetMinReplicas() int32 { + if cr.MinReplicas != nil { + return *cr.MinReplicas + } + return 1 +} + // Validate validates resource configuration func (cr *EmbeddedHPA) Validate() error { if cr.MinReplicas != nil && *cr.MinReplicas > cr.MaxReplicas { diff --git a/api/operator/v1beta1/vmrule_types.go b/api/operator/v1beta1/vmrule_types.go index b292a4905..7a2d98750 100644 --- a/api/operator/v1beta1/vmrule_types.go +++ b/api/operator/v1beta1/vmrule_types.go @@ -26,7 +26,11 @@ var initVMAlertTemplatesOnce sync.Once // VMRuleSpec defines the desired state of VMRule type VMRuleSpec struct { // Groups list of group rules - Groups []RuleGroup `json:"groups"` + // +patchMergeKey=name + // +patchStrategy=merge + // +listType=map + // +listMapKey=name + Groups []RuleGroup `json:"groups" patchStrategy:"merge" patchMergeKey:"name"` } // RuleGroup is a list of sequentially evaluated recording and alerting rules. 
diff --git a/cmd/config-reloader/file_watch.go b/cmd/config-reloader/file_watch.go index 915ae1436..0074c2606 100644 --- a/cmd/config-reloader/file_watch.go +++ b/cmd/config-reloader/file_watch.go @@ -87,7 +87,7 @@ func (fw *fileWatcher) start(ctx context.Context, updates chan struct{}) { case <-t.C: if err := update(*configFileName); err != nil { logger.Errorf("cannot update file at force resync :%s", err) - contentUpdateErrosTotal.Inc() + contentUpdateErrorsTotal.Inc() continue } case event := <-fw.w.Events: @@ -98,7 +98,7 @@ func (fw *fileWatcher) start(ctx context.Context, updates chan struct{}) { logger.Infof("changed: %s, %s", event.Name, event.Op.String()) if err := update(*configFileName); err != nil { logger.Errorf("cannot update file :%s", err) - contentUpdateErrosTotal.Inc() + contentUpdateErrorsTotal.Inc() continue } } @@ -153,11 +153,10 @@ func (dw *dirWatcher) start(ctx context.Context, updates chan struct{}) { } err = filepath.WalkDir(walkDir, func(path string, d fs.DirEntry, err error) error { - // hack for kubernetes configmaps and secrets. - // it uses ..YEAR_MONTH_DAY_HOUR.MIN.S directory for content updates - // and links it as a symlink - // just skip it, stat for the file will be evaluated with os.Stat below - if strings.Contains(path, "..") { + // Kubernetes projected volumes expose hidden ..* entries such as + // ..data and timestamped symlink targets. Skip only those synthetic + // path elements, not regular files like "rules..yaml". 
+ if strings.HasPrefix(filepath.Base(path), "..") { return nil } @@ -222,9 +221,6 @@ func (dw *dirWatcher) start(ctx context.Context, updates chan struct{}) { case <-ctx.Done(): return case event := <-dw.w.Events: - if event.Op == fsnotify.Remove { - continue - } baseDir := filepath.Dir(event.Name) logger.Infof("dir update: base dir: %s", baseDir) reloadNeeded, err := updateCache(baseDir) diff --git a/cmd/config-reloader/file_watch_test.go b/cmd/config-reloader/file_watch_test.go new file mode 100644 index 000000000..d38957558 --- /dev/null +++ b/cmd/config-reloader/file_watch_test.go @@ -0,0 +1,110 @@ +package main + +import ( + "context" + "os" + "path/filepath" + "testing" + "time" +) + +func TestDirWatcherProcessesRegularFilesWithDoubleDotsInName(t *testing.T) { + dir := t.TempDir() + file := filepath.Join(dir, "rules..yaml") + if err := os.WriteFile(file, []byte("groups: []\n"), 0o644); err != nil { + t.Fatalf("failed to write initial file: %v", err) + } + + dw, err := newDirWatchers([]string{dir}) + if err != nil { + t.Fatalf("failed to create dir watcher: %v", err) + } + + updates := make(chan struct{}, 10) + ctx, cancel := context.WithCancel(context.Background()) + dw.start(ctx, updates) + defer dw.close() + defer cancel() + + if err := os.WriteFile(file, []byte("groups:\n- name: test\n"), 0o644); err != nil { + t.Fatalf("failed to update file: %v", err) + } + + select { + case <-updates: + case <-time.After(2 * time.Second): + t.Fatal("expected update after modifying a regular file containing '..' 
in its name") + } +} + +func TestDirWatcherSkipsKubernetesHiddenEntries(t *testing.T) { + dir := t.TempDir() + visibleFile := filepath.Join(dir, "rules.yaml") + if err := os.WriteFile(visibleFile, []byte("groups: []\n"), 0o644); err != nil { + t.Fatalf("failed to write visible file: %v", err) + } + hiddenFile := filepath.Join(dir, "..data") + if err := os.WriteFile(hiddenFile, []byte("v1\n"), 0o644); err != nil { + t.Fatalf("failed to write hidden file: %v", err) + } + + dw, err := newDirWatchers([]string{dir}) + if err != nil { + t.Fatalf("failed to create dir watcher: %v", err) + } + + updates := make(chan struct{}, 10) + ctx, cancel := context.WithCancel(context.Background()) + dw.start(ctx, updates) + defer dw.close() + defer cancel() + + if err := os.WriteFile(hiddenFile, []byte("v2\n"), 0o644); err != nil { + t.Fatalf("failed to update hidden file: %v", err) + } + + select { + case <-updates: + t.Fatal("did not expect update after modifying kubernetes hidden entry") + case <-time.After(300 * time.Millisecond): + } + + if err := os.WriteFile(visibleFile, []byte("groups:\n- name: test\n"), 0o644); err != nil { + t.Fatalf("failed to update visible file: %v", err) + } + + select { + case <-updates: + case <-time.After(2 * time.Second): + t.Fatal("expected update after modifying visible file") + } +} + +func TestDirWatcherTriggersUpdateWhenFileRemoved(t *testing.T) { + dir := t.TempDir() + file := filepath.Join(dir, "rules.yaml") + if err := os.WriteFile(file, []byte("groups: []\n"), 0o644); err != nil { + t.Fatalf("failed to write initial file: %v", err) + } + + dw, err := newDirWatchers([]string{dir}) + if err != nil { + t.Fatalf("failed to create dir watcher: %v", err) + } + + updates := make(chan struct{}, 10) + ctx, cancel := context.WithCancel(context.Background()) + dw.start(ctx, updates) + defer dw.close() + defer cancel() + + if err := os.Remove(file); err != nil { + t.Fatalf("failed to remove watched file: %v", err) + } + + select { + case <-updates: + 
case <-time.After(2 * time.Second): + t.Fatal("expected update after removing watched file") + } +} diff --git a/cmd/config-reloader/k8s_watch.go b/cmd/config-reloader/k8s_watch.go index ea4b5f3e4..fc5401ef7 100644 --- a/cmd/config-reloader/k8s_watch.go +++ b/cmd/config-reloader/k8s_watch.go @@ -72,16 +72,40 @@ func newKubernetesWatcher(ctx context.Context, secretName, namespace string) (*k syncChan := make(chan syncEvent, 10) if _, err := inf.AddEventHandler(cache.ResourceEventHandlerFuncs{ AddFunc: func(obj interface{}) { - s := obj.(*corev1.Secret) - syncChan <- syncEvent{op: "create", obj: s} + s, ok := secretFromEvent(obj) + if !ok { + logger.Errorf("cannot process create event for unexpected object type %T", obj) + return + } + select { + case syncChan <- syncEvent{op: "create", obj: s}: + default: + logger.Infof("syncChan full, dropping create event for secret: %s", s.Name) + } }, UpdateFunc: func(oldObj, newObj interface{}) { - s := newObj.(*corev1.Secret) - syncChan <- syncEvent{op: "update", obj: s} + s, ok := secretFromEvent(newObj) + if !ok { + logger.Errorf("cannot process update event for unexpected object type %T", newObj) + return + } + select { + case syncChan <- syncEvent{op: "update", obj: s}: + default: + logger.Infof("syncChan full, dropping update event for secret: %s", s.Name) + } }, DeleteFunc: func(obj interface{}) { - s := obj.(*corev1.Secret) - syncChan <- syncEvent{op: "delete", obj: s} + s, ok := secretFromEvent(obj) + if !ok { + logger.Errorf("cannot process delete event for unexpected object type %T", obj) + return + } + select { + case syncChan <- syncEvent{op: "delete", obj: s}: + default: + logger.Infof("syncChan full, dropping delete event for secret: %s", s.Name) + } }, }); err != nil { return nil, fmt.Errorf("cannot build eventHandler: %w", err) @@ -90,6 +114,21 @@ func newKubernetesWatcher(ctx context.Context, secretName, namespace string) (*k return &k8sWatcher{inf: inf, c: c, events: syncChan, namespace: namespace, 
secretName: secretName}, nil } +func secretFromEvent(obj interface{}) (*corev1.Secret, bool) { + switch s := obj.(type) { + case *corev1.Secret: + return s, true + case cache.DeletedFinalStateUnknown: + secret, ok := s.Obj.(*corev1.Secret) + return secret, ok + case *cache.DeletedFinalStateUnknown: + secret, ok := s.Obj.(*corev1.Secret) + return secret, ok + default: + return nil, false + } +} + var errNotModified = fmt.Errorf("file content not modified") func (k *k8sWatcher) load(ctx context.Context) error { @@ -124,11 +163,14 @@ func (k *k8sWatcher) start(ctx context.Context, updates chan struct{}) { return fmt.Errorf("cannot write file content to disk: %w", err) } prevContent = newData - time.Sleep(time.Second) + select { + case <-time.After(time.Second): + case <-ctx.Done(): + return ctx.Err() + } select { case updates <- struct{}{}: default: - } return nil } @@ -141,7 +183,11 @@ func (k *k8sWatcher) start(ctx context.Context, updates chan struct{}) { logger.Errorf("cannot update secret: %s", err) } - go k.inf.Run(ctx.Done()) + k.wg.Add(1) + go func() { + defer k.wg.Done() + k.inf.Run(ctx.Done()) + }() k.wg.Add(1) go func() { defer k.wg.Done() @@ -154,10 +200,10 @@ func (k *k8sWatcher) start(ctx context.Context, updates chan struct{}) { select { case <-t.C: if err := updateSecret(&lastSecret); err != nil { - if errors.Is(err, errNotModified) { + if errors.Is(err, errNotModified) || errors.Is(err, context.Canceled) { continue } - contentUpdateErrosTotal.Inc() + contentUpdateErrorsTotal.Inc() logger.Errorf("cannot force sync secret content: %s", err) } case item := <-k.events: @@ -166,10 +212,10 @@ func (k *k8sWatcher) start(ctx context.Context, updates chan struct{}) { logger.Infof("get k8s sync event type: %s, for secret: %s", item.op, item.obj.Name) if err := updateSecret(s); err != nil { - if errors.Is(err, errNotModified) { + if errors.Is(err, errNotModified) || errors.Is(err, context.Canceled) { continue } - contentUpdateErrosTotal.Inc() + 
contentUpdateErrorsTotal.Inc() logger.Errorf("cannot sync secret content: %s", err) } case <-ctx.Done(): diff --git a/cmd/config-reloader/k8s_watch_test.go b/cmd/config-reloader/k8s_watch_test.go new file mode 100644 index 000000000..f06a741e1 --- /dev/null +++ b/cmd/config-reloader/k8s_watch_test.go @@ -0,0 +1,48 @@ +package main + +import ( + "testing" + + corev1 "k8s.io/api/core/v1" + "k8s.io/client-go/tools/cache" +) + +func TestSecretFromEvent(t *testing.T) { + secret := &corev1.Secret{} + + t.Run("secret object", func(t *testing.T) { + got, ok := secretFromEvent(secret) + if !ok { + t.Fatal("expected secret object to be accepted") + } + if got != secret { + t.Fatal("expected original secret pointer to be returned") + } + }) + + t.Run("deleted final state unknown value", func(t *testing.T) { + got, ok := secretFromEvent(cache.DeletedFinalStateUnknown{Obj: secret}) + if !ok { + t.Fatal("expected tombstone value to be accepted") + } + if got != secret { + t.Fatal("expected tombstone secret pointer to be returned") + } + }) + + t.Run("deleted final state unknown pointer", func(t *testing.T) { + got, ok := secretFromEvent(&cache.DeletedFinalStateUnknown{Obj: secret}) + if !ok { + t.Fatal("expected tombstone pointer to be accepted") + } + if got != secret { + t.Fatal("expected tombstone secret pointer to be returned") + } + }) + + t.Run("unexpected type", func(t *testing.T) { + if _, ok := secretFromEvent("not-a-secret"); ok { + t.Fatal("expected unexpected object type to be rejected") + } + }) +} diff --git a/cmd/config-reloader/main.go b/cmd/config-reloader/main.go index b715c0d48..83f08e099 100644 --- a/cmd/config-reloader/main.go +++ b/cmd/config-reloader/main.go @@ -73,12 +73,12 @@ var ( ) var ( - configLastOkReloadTime = metrics.NewCounter(`configreloader_last_reload_success_timestamp_seconds`) - configLastReloadSuccess = metrics.NewCounter(`configreloader_last_reload_successful`) - configReloadErrorsTotal = 
metrics.NewCounter(`configreloader_last_reload_errors_total`) - configReloadsTotal = metrics.NewCounter(`configreloader_config_last_reload_total`) - k8sAPIWatchErrorsTotal = metrics.NewCounter(`configreloader_k8s_watch_errors_total`) - contentUpdateErrosTotal = metrics.NewCounter(`configreloader_secret_content_update_errors_total`) + configLastOkReloadTime = metrics.NewCounter(`configreloader_last_reload_success_timestamp_seconds`) + configLastReloadSuccess = metrics.NewCounter(`configreloader_last_reload_successful`) + configReloadErrorsTotal = metrics.NewCounter(`configreloader_last_reload_errors_total`) + configReloadsTotal = metrics.NewCounter(`configreloader_config_last_reload_total`) + k8sAPIWatchErrorsTotal = metrics.NewCounter(`configreloader_k8s_watch_errors_total`) + contentUpdateErrorsTotal = metrics.NewCounter(`configreloader_secret_content_update_errors_total`) ) func main() { @@ -251,30 +251,21 @@ func (c *cfgWatcher) start(ctx context.Context) { for { select { case <-c.updates: - go func() { - if *delayInterval > 0 { - t := time.NewTimer(*delayInterval) - defer t.Stop() - select { - case <-t.C: - case <-ctx.Done(): - return - } - } - if err := c.reloader(ctx); err != nil { - logger.Errorf("cannot trigger api reload: %s", err.Error()) - configLastReloadSuccess.Set(0) - configReloadErrorsTotal.Inc() - return - } - configLastReloadSuccess.Set(1) - configLastOkReloadTime.Set(uint64(time.Now().UnixMilli())) - logger.Infof("reload config ok.") - }() - case <-ctx.Done(): return } + if !c.waitDelay(ctx) { + return + } + if err := c.reloader(ctx); err != nil { + logger.Errorf("cannot trigger api reload: %s", err.Error()) + configLastReloadSuccess.Set(0) + configReloadErrorsTotal.Inc() + continue + } + configLastReloadSuccess.Set(1) + configLastOkReloadTime.Set(uint64(time.Now().Unix())) + logger.Infof("reload config ok.") } }() } @@ -283,6 +274,25 @@ func (c *cfgWatcher) close() { c.wg.Wait() } +func (c *cfgWatcher) waitDelay(ctx context.Context) bool { + if 
*delayInterval > 0 { + t := time.NewTimer(*delayInterval) + defer t.Stop() + select { + case <-t.C: + case <-ctx.Done(): + return false + } + } + for { + select { + case <-c.updates: + default: + return true + } + } +} + type watcher interface { load(ctx context.Context) error start(ctx context.Context, updates chan struct{}) diff --git a/cmd/config-reloader/main_test.go b/cmd/config-reloader/main_test.go index 8c0d8e0f0..b873dd0bf 100644 --- a/cmd/config-reloader/main_test.go +++ b/cmd/config-reloader/main_test.go @@ -1,8 +1,11 @@ package main import ( + "context" "flag" + "sync/atomic" "testing" + "time" ) func TestLogFormatAlias(t *testing.T) { @@ -31,3 +34,188 @@ func TestLogFormatAlias(t *testing.T) { // log-format is empty f("", "json", "json") } + +// TestCfgWatcherSignalSentOnce verifies that a burst of updates results in +// exactly one reloader call (channel drained before reload). +func TestCfgWatcherSignalSentOnce(t *testing.T) { + origDelay := *delayInterval + *delayInterval = 0 + defer func() { *delayInterval = origDelay }() + + var reloadCount atomic.Int64 + updates := make(chan struct{}, 10) + w := cfgWatcher{ + updates: updates, + reloader: func(_ context.Context) error { + reloadCount.Add(1) + return nil + }, + } + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + w.start(ctx) + + for range 5 { + select { + case updates <- struct{}{}: + default: + } + } + + time.Sleep(100 * time.Millisecond) + cancel() + w.close() + + if got := reloadCount.Load(); got != 1 { + t.Fatalf("expected 1 reload call, got %d", got) + } +} + +// TestCfgWatcherDelayIntervalHonoured verifies that the reloader is not called +// before delayInterval elapses after an update signal. 
+func TestCfgWatcherDelayIntervalHonoured(t *testing.T) { + delay := 150 * time.Millisecond + origDelay := *delayInterval + *delayInterval = delay + defer func() { *delayInterval = origDelay }() + + var reloadCount atomic.Int64 + updates := make(chan struct{}, 10) + w := cfgWatcher{ + updates: updates, + reloader: func(_ context.Context) error { + reloadCount.Add(1) + return nil + }, + } + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + w.start(ctx) + + updates <- struct{}{} + + // before delay elapses - no reload yet + time.Sleep(50 * time.Millisecond) + if got := reloadCount.Load(); got != 0 { + t.Fatalf("expected 0 reload calls before delay, got %d", got) + } + + // after delay elapses - exactly one reload + time.Sleep(200 * time.Millisecond) + if got := reloadCount.Load(); got != 1 { + t.Fatalf("expected 1 reload call after delay, got %d", got) + } +} + +// TestCfgWatcherDelayIntervalDebouncesUpdates verifies that multiple updates +// arriving within the delay window are coalesced into a single reload call. 
+func TestCfgWatcherDelayIntervalDebouncesUpdates(t *testing.T) { + delay := 150 * time.Millisecond + origDelay := *delayInterval + *delayInterval = delay + defer func() { *delayInterval = origDelay }() + + var reloadCount atomic.Int64 + updates := make(chan struct{}, 10) + w := cfgWatcher{ + updates: updates, + reloader: func(_ context.Context) error { + reloadCount.Add(1) + return nil + }, + } + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + w.start(ctx) + + // send first signal then more signals before delay elapses + updates <- struct{}{} + time.Sleep(20 * time.Millisecond) + for range 4 { + select { + case updates <- struct{}{}: + default: + } + } + + // wait for delay + processing + time.Sleep(300 * time.Millisecond) + cancel() + w.close() + + if got := reloadCount.Load(); got != 1 { + t.Fatalf("expected 1 reload call for burst within delay window, got %d", got) + } +} + +// TestCfgWatcherDelayIntervalCancelledContext verifies that cancelling context +// during delay window prevents reloader from being called. 
+func TestCfgWatcherDelayIntervalCancelledContext(t *testing.T) { + delay := 500 * time.Millisecond + origDelay := *delayInterval + *delayInterval = delay + defer func() { *delayInterval = origDelay }() + + var reloadCount atomic.Int64 + updates := make(chan struct{}, 10) + w := cfgWatcher{ + updates: updates, + reloader: func(_ context.Context) error { + reloadCount.Add(1) + return nil + }, + } + + ctx, cancel := context.WithCancel(context.Background()) + w.start(ctx) + + updates <- struct{}{} + + // cancel before delay elapses + time.Sleep(50 * time.Millisecond) + cancel() + w.close() + + if got := reloadCount.Load(); got != 0 { + t.Fatalf("expected 0 reload calls after context cancel, got %d", got) + } +} + +func TestCfgWatcherSuccessTimestampUsesSeconds(t *testing.T) { + origDelay := *delayInterval + *delayInterval = 0 + defer func() { *delayInterval = origDelay }() + + configLastOkReloadTime.Set(0) + configLastReloadSuccess.Set(0) + + updates := make(chan struct{}, 1) + w := cfgWatcher{ + updates: updates, + reloader: func(_ context.Context) error { + return nil + }, + } + + ctx, cancel := context.WithCancel(context.Background()) + w.start(ctx) + start := time.Now().Unix() + + updates <- struct{}{} + time.Sleep(100 * time.Millisecond) + cancel() + w.close() + + got := int64(configLastOkReloadTime.Get()) + end := time.Now().Unix() + + if got < start || got > end { + t.Fatalf("expected success timestamp in unix seconds between %d and %d, got %d", start, end, got) + } + if configLastReloadSuccess.Get() != 1 { + t.Fatalf("expected reload success metric to be 1, got %d", configLastReloadSuccess.Get()) + } +} diff --git a/config/crd/overlay/crd.descriptionless.yaml b/config/crd/overlay/crd.descriptionless.yaml index bdf7e8d0d..6a3d2fb90 100644 --- a/config/crd/overlay/crd.descriptionless.yaml +++ b/config/crd/overlay/crd.descriptionless.yaml @@ -12148,6 +12148,8 @@ spec: type: string maxPointsPerQuery: type: integer + offset: + type: string 
queryFromLastSeenTimestamp: type: boolean queryRangePath: @@ -12317,7 +12319,6 @@ spec: addr: type: string maxConcurrentTasks: - maximum: 20 minimum: 1 type: integer pathPrefix: @@ -12326,6 +12327,8 @@ spec: type: string uiDefaultState: type: string + useReaderConnectionSettings: + type: boolean type: object serviceAccountName: type: string @@ -12641,6 +12644,9 @@ spec: type: object x-kubernetes-map-type: atomic type: object + connectionRetryAttempts: + minimum: 1 + type: integer datasourceURL: type: string healthPath: @@ -31762,6 +31768,9 @@ spec: - rules type: object type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map required: - groups type: object diff --git a/config/crd/overlay/crd.yaml b/config/crd/overlay/crd.yaml index 6af1e81e7..f4f00fd4e 100644 --- a/config/crd/overlay/crd.yaml +++ b/config/crd/overlay/crd.yaml @@ -23287,7 +23287,7 @@ spec: pull: description: |- VMAnomalyMonitoringPullSpec defines pull monitoring configuration - which is enabled by default and served at POD_IP:8490/metrics + which is enabled by default and served at POD_IP:8080/metrics properties: port: description: Port defines a port for metrics scrape @@ -23816,6 +23816,10 @@ spec: flagimpacts vmanomaly on splitting long fitWindow queries into smaller sub-intervals type: integer + offset: + description: Offset adds a time shift to the query window for + all queries, e.g. to account for delayed data ingestion + type: string queryFromLastSeenTimestamp: description: If True, then query will be performed from the last seen timestamp for a given series. 
@@ -23988,8 +23992,8 @@ spec: type: string type: object tz: - description: Optional argumentspecifies the IANA timezone to account - for local shifts, like DST, in models sensitive to seasonal + description: Optional argument specifies the IANA timezone to + account for local shifts, like DST, in models sensitive to seasonal patterns type: string required: @@ -24122,7 +24126,6 @@ spec: maxConcurrentTasks: description: MaxConcurrentTasks defines maximum number of concurrent anomaly detection tasks - maximum: 20 minimum: 1 type: integer pathPrefix: @@ -24137,6 +24140,11 @@ spec: description: UIDefaultState defines default query state for anomaly UI type: string + useReaderConnectionSettings: + description: |- + UseReaderConnectionSettings when set to true, anomaly UI reuses connection settings + (credentials, TLS, etc.) from the reader configuration to connect to datasources + type: boolean type: object serviceAccountName: description: ServiceAccountName is the name of the ServiceAccount @@ -24872,6 +24880,11 @@ spec: type: object x-kubernetes-map-type: atomic type: object + connectionRetryAttempts: + description: ConnectionRetryAttempts defines the number of attempts + to retry the connection in case of failure + minimum: 1 + type: integer datasourceURL: description: |- DatasourceURL defines remote write url for write requests @@ -65239,6 +65252,9 @@ spec: - rules type: object type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map required: - groups type: object diff --git a/config/manifests/bases/victoriametrics-operator.clusterserviceversion.yaml b/config/manifests/bases/victoriametrics-operator.clusterserviceversion.yaml index 3dbf2bee9..c996077cb 100644 --- a/config/manifests/bases/victoriametrics-operator.clusterserviceversion.yaml +++ b/config/manifests/bases/victoriametrics-operator.clusterserviceversion.yaml @@ -22,7 +22,7 @@ metadata: operatorhub.io/ui-metadata-max-k8s-version: "1.30" operators.openshift.io/infrastructure-features: 
'[fips]' operators.operatorframework.io.bundle.channel.default.v1: beta - operators.operatorframework.io.bundle.channels.v1: beta + operators.operatorframework.io.bundle.channels.v1: beta,stable operators.operatorframework.io/builder: operator-sdk-v1.35.0 operators.operatorframework.io/project_layout: go.kubebuilder.io/v4 repository: https://github.com/VictoriaMetrics/operator diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index fbcbfe834..9ff316bf2 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -13,6 +13,22 @@ aliases: ## tip +* Dependency: [vmoperator](https://docs.victoriametrics.com/operator/): Updated default versions for VM apps to [v1.146.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.146.0) version +* Dependency: [vmoperator](https://docs.victoriametrics.com/operator/): Updated default versions for VL apps to [v1.51.0](https://github.com/VictoriaMetrics/VictoriaLogs/releases/tag/v1.51.0). +* Dependency: [vmoperator](https://docs.victoriametrics.com/operator/): Updated default versions for VT apps to [v0.9.3](https://github.com/VictoriaMetrics/VictoriaTraces/releases/tag/v0.9.3) version. + +* FEATURE: [vmoperator](https://docs.victoriametrics.com/operator/): add `victoriametrics_app=true` label to all metrics scraped by the operator. See [#2261](https://github.com/VictoriaMetrics/operator/issues/2261). + +* BUGFIX: [config-reloader](https://docs.victoriametrics.com/operator/): fix possible panic on Secret watch events when the informer's local cache fell out of sync and Kubernetes delivered a stale tombstone entry instead of the Secret object. The config-reloader now unwraps tombstones correctly and logs an error for any other unexpected types. +* BUGFIX: [vmoperator](https://docs.victoriametrics.com/operator/): update status currentRevision and currentReplicas for StatefulSet with OnDelete update strategy. See [#1242](https://github.com/VictoriaMetrics/operator/issues/1242). 
+* BUGFIX: [vmoperator](https://docs.victoriametrics.com/operator/): retry reconcile errors that may leave a resource in the expanding state; previously the resource could hang in the expanding state. +* BUGFIX: [vmcluster](https://docs.victoriametrics.com/operator/resources/vmcluster/), [vlcluster](https://docs.victoriametrics.com/operator/resources/vlcluster/) and [vtcluster](https://docs.victoriametrics.com/operator/resources/vtcluster/): when storage HPA was enabled, generated `-storageNode` flags could become incorrect after scaling, which could break expected routing to storage nodes; now the operator derives storage node count from the current StatefulSet state so generated flags stay correct during HPA-driven scaling. See [#2117](https://github.com/VictoriaMetrics/operator/issues/2117). +* BUGFIX: [vmoperator](https://docs.victoriametrics.com/operator/): update status currentRevision and currentReplicas for StatefulSet with OnDelete update strategy. See [#1242](https://github.com/VictoriaMetrics/operator/issues/1242). +* BUGFIX: [config-reloader](https://docs.victoriametrics.com/operator/): fix `configreloader_last_reload_success_timestamp_seconds` metric to report time in seconds instead of milliseconds. +* BUGFIX: [vmoperator](https://docs.victoriametrics.com/operator/): ignore `NotFound` errors that may occur when attempting to update the status of a missing resource. +* BUGFIX: [vmanomaly](https://docs.victoriametrics.com/operator/resources/vmanomaly/): pass the configured TLS CA bundle to the reader, writer and monitoring clients. Previously the CA was mounted as a volume but dropped during config generation, so a `tlsConfig` with only a CA produced no `verify_tls` reference to it; `insecureSkipVerify` is now also propagated correctly. +* BUGFIX: [config-reloader](https://docs.victoriametrics.com/operator/): fix missed reload for watched files whose names contain `..` (e.g. `rules..yaml`).
Previously any path containing `..` was silently skipped; now only Kubernetes synthetic entries whose basename starts with `..` (e.g. `..data`) are ignored. See [#2253](https://github.com/VictoriaMetrics/operator/pull/2253). + ## [v0.68.5](https://github.com/VictoriaMetrics/operator/releases/tag/v0.68.5) **Release date:** 27 May 2026 diff --git a/docs/api.md b/docs/api.md index 967d79047..2d63f04c6 100644 --- a/docs/api.md +++ b/docs/api.md @@ -599,7 +599,7 @@ Appears in: [VMAnomalyMonitoringPushSpec](#vmanomalymonitoringpushspec), [VMAnom #### VMAnomalyMonitoringPullSpec VMAnomalyMonitoringPullSpec defines pull monitoring configuration -which is enabled by default and served at POD_IP:8490/metrics +which is enabled by default and served at POD_IP:8080/metrics Appears in: [VMAnomalyMonitoringSpec](#vmanomalymonitoringspec) @@ -655,13 +655,14 @@ Appears in: [VMAnomalySpec](#vmanomalyspec) | healthPath#
_string_ | _(Required)_
HealthPath defines absolute or relative URL address where to check availability of the remote webserver | | latencyOffset#
_string_ | _(Required)_
It allows overriding the default -search.latencyOffsetflag of VictoriaMetrics | | maxPointsPerQuery#
_integer_ | _(Required)_
Optional argoverrides how search.maxPointsPerTimeseries flagimpacts vmanomaly on splitting long fitWindow queries into smaller sub-intervals | +| offset#
_string_ | _(Optional)_
Offset adds a time shift to the query window for all queries, e.g. to account for delayed data ingestion | | queryFromLastSeenTimestamp#
_boolean_ | _(Required)_
If True, then query will be performed from the last seen timestamp for a given series. | | queryRangePath#
_string_ | _(Required)_
Performs PromQL/MetricsQL range query | | samplingPeriod#
_string_ | _(Required)_
Frequency of the points returned | | tenantID#
_string_ | _(Required)_
TenantID defines for VictoriaMetrics Cluster version only, tenants are identified by accountID, accountID:projectID or multitenant. | | timeout#
_string_ | _(Required)_
Timeout for the requests, passed as a string | | tlsConfig#
_[TLSConfig](#tlsconfig)_ | _(Required)_
TLSConfig defines tls connection configuration | -| tz#
_string_ | _(Required)_
Optional argumentspecifies the IANA timezone to account for local shifts, like DST, in models sensitive to seasonal patterns | +| tz#
_string_ | _(Required)_
Optional argument specifies the IANA timezone to account for local shifts, like DST, in models sensitive to seasonal patterns | #### VMAnomalyServerSpec @@ -677,6 +678,7 @@ Appears in: [VMAnomalySpec](#vmanomalyspec) | pathPrefix#
_string_ | _(Optional)_
PathPrefix defines optional URL path prefix for all HTTP routes
If set to 'my-app' or '/my-app', routes will be served under '/my-app/...' | | port#
_string_ | _(Optional)_
Port defines port to listen on | | uiDefaultState#
_string_ | _(Optional)_
UIDefaultState defines default query state for anomaly UI | +| useReaderConnectionSettings#
_boolean_ | _(Optional)_
UseReaderConnectionSettings when set to true, anomaly UI reuses connection settings
(credentials, TLS, etc.) from the reader configuration to connect to datasources | #### VMAnomalySpec @@ -766,6 +768,7 @@ Appears in: [VMAnomalySpec](#vmanomalyspec) | --- | --- | | basicAuth#
_[BasicAuth](#basicauth)_ | _(Required)_
Basic auth defines basic authorization configuration | | bearer#
_[BearerAuth](#bearerauth)_ | _(Required)_
BearerAuth defines authorization with Authorization: Bearer header | +| connectionRetryAttempts#
_integer_ | _(Optional)_
ConnectionRetryAttempts defines the number of attempts to retry the connection in case of failure | | datasourceURL#
_string_ | _(Required)_
DatasourceURL defines remote write url for write requests
provided endpoint must serve /api/v1/import path
vmanomaly joins datasourceURL + "/api/v1/import" | | healthPath#
_string_ | _(Required)_
HealthPath defines absolute or relative URL address where to check availability of the remote webserver | | metricFormat#
_[VMAnomalyVMWriterMetricFormatSpec](#vmanomalyvmwritermetricformatspec)_ | _(Optional)_
Metrics to save the output (in metric names or labels) | diff --git a/docs/env.md b/docs/env.md index bc6c9d297..1db604d37 100644 --- a/docs/env.md +++ b/docs/env.md @@ -1,9 +1,9 @@ | Environment variables | | --- | -| VM_METRICS_VERSION: `v1.144.0` # | -| VM_LOGS_VERSION: `v1.50.0` # | -| VM_ANOMALY_VERSION: `v1.29.3` # | -| VM_TRACES_VERSION: `v0.7.0` # | +| VM_METRICS_VERSION: `v1.146.0` # | +| VM_LOGS_VERSION: `v1.51.0` # | +| VM_ANOMALY_VERSION: `v1.29.6` # | +| VM_TRACES_VERSION: `v0.9.3` # | | VM_OPERATOR_VERSION: `v0.68.3` # | | VM_GATEWAY_API_ENABLED: `false` # | | VM_VPA_API_ENABLED: `false` # | diff --git a/docs/resources/vlagent.md b/docs/resources/vlagent.md index 2fb681a98..675de34f4 100644 --- a/docs/resources/vlagent.md +++ b/docs/resources/vlagent.md @@ -77,7 +77,7 @@ metadata: spec: image: repository: victoriametrics/vlagent - tag: v1.47.0 + tag: v1.51.0 pullPolicy: Always ``` @@ -91,7 +91,7 @@ metadata: spec: image: repository: victoriametrics/vlagent - tag: v1.47.0 + tag: v1.51.0 pullPolicy: Always imagePullSecrets: - name: my-repo-secret diff --git a/go.mod b/go.mod index 210400d00..c5ee4f80e 100644 --- a/go.mod +++ b/go.mod @@ -125,5 +125,3 @@ require ( ) replace github.com/VictoriaMetrics/operator/api => ./api - -replace github.com/caarlos0/env/v11 => github.com/AndrewChubatiuk/env/v11 v11.0.0-20260302065400-14d0354881b6 diff --git a/go.sum b/go.sum index e87561340..1ed8676aa 100644 --- a/go.sum +++ b/go.sum @@ -1,5 +1,3 @@ -github.com/AndrewChubatiuk/env/v11 v11.0.0-20260302065400-14d0354881b6 h1:5CPOPjp7co7TgffUQ/jOVlw6IX8uHXDHt0W85Mwd7Zw= -github.com/AndrewChubatiuk/env/v11 v11.0.0-20260302065400-14d0354881b6/go.mod h1:qupehSf/Y0TUTsxKywqRt/vJjN5nz6vauiYEUUr8P4U= github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1Xbatp0= github.com/Masterminds/semver/v3 v3.4.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM= github.com/VictoriaMetrics/VictoriaLogs v1.36.2-0.20251008164716-21c0fb3de84d 
h1:fV15mhBCGpCCBbuOAbOflO8Air+tLklMt8bG35FimzQ= @@ -44,6 +42,8 @@ github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/bmatcuk/doublestar/v4 v4.9.1 h1:X8jg9rRZmJd4yRy7ZeNDRnM+T3ZfHv15JiBJ/avrEXE= github.com/bmatcuk/doublestar/v4 v4.9.1/go.mod h1:xBQ8jztBU6kakFMg+8WGxn0c6z1fTSPVIjEY1Wr7jzc= +github.com/caarlos0/env/v11 v11.3.1 h1:cArPWC15hWmEt+gWk7YBi7lEXTXCvpaSdCiZE2X5mCA= +github.com/caarlos0/env/v11 v11.3.1/go.mod h1:qupehSf/Y0TUTsxKywqRt/vJjN5nz6vauiYEUUr8P4U= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= diff --git a/internal/config/config.go b/internal/config/config.go index 2405fff36..c9e0bc484 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -35,10 +35,10 @@ var ( initConf sync.Once defaultEnvs = map[string]string{ - "VM_METRICS_VERSION": "v1.144.0", - "VM_LOGS_VERSION": "v1.50.0", - "VM_ANOMALY_VERSION": "v1.29.3", - "VM_TRACES_VERSION": "v0.7.0", + "VM_METRICS_VERSION": "v1.146.0", + "VM_LOGS_VERSION": "v1.51.0", + "VM_ANOMALY_VERSION": "v1.29.6", + "VM_TRACES_VERSION": "v0.9.3", "VM_OPERATOR_VERSION": getVersion("v0.68.3"), } ) diff --git a/internal/controller/operator/factory/build/defaults.go b/internal/controller/operator/factory/build/defaults.go index 8b21e2a97..667f8f82a 100644 --- a/internal/controller/operator/factory/build/defaults.go +++ b/internal/controller/operator/factory/build/defaults.go @@ -322,11 +322,19 @@ func addVMAnomalyDefaults(objI any) { } addDefaultsToCommonParams(&cr.Spec.CommonAppsParams, &cp, &cv) if cr.Spec.Monitoring == nil { - cr.Spec.Monitoring = &vmv1.VMAnomalyMonitoringSpec{ - Pull: &vmv1.VMAnomalyMonitoringPullSpec{ - Port: "8080", - }, - } + cr.Spec.Monitoring 
= &vmv1.VMAnomalyMonitoringSpec{} + } + if cr.Spec.Monitoring.Pull == nil { + cr.Spec.Monitoring.Pull = &vmv1.VMAnomalyMonitoringPullSpec{} + } + if len(cr.Spec.Monitoring.Pull.Port) == 0 { + cr.Spec.Monitoring.Pull.Port = "8080" + } + if cr.Spec.Server == nil { + cr.Spec.Server = &vmv1.VMAnomalyServerSpec{} + } + if len(cr.Spec.Server.Port) == 0 { + cr.Spec.Server.Port = cv.Port } } diff --git a/internal/controller/operator/factory/build/vmscrape.go b/internal/controller/operator/factory/build/vmscrape.go index b9ef3aac7..d6be44d93 100644 --- a/internal/controller/operator/factory/build/vmscrape.go +++ b/internal/controller/operator/factory/build/vmscrape.go @@ -127,6 +127,9 @@ func VMServiceScrape(service *corev1.Service, b scrapeBuilder, additionalPortNam }, } } + for i := range scrape.Spec.Endpoints { + addVictoriaMetricsAppRelabelConfig(&scrape.Spec.Endpoints[i].EndpointRelabelings) + } return scrape } @@ -155,7 +158,6 @@ func VMPodScrape(b podScrapeBuilder, portName string) *vmv1beta1.VMPodScrape { "authKey": {authKey}, } } - selectorLabels := b.SelectorLabels() scrape := &vmv1beta1.VMPodScrape{ ObjectMeta: metav1.ObjectMeta{ @@ -174,10 +176,9 @@ func VMPodScrape(b podScrapeBuilder, portName string) *vmv1beta1.VMPodScrape { serviceScrapeSpec := b.GetServiceScrape() if serviceScrapeSpec != nil { for _, e := range serviceScrapeSpec.Endpoints { - if e.Port == *endpoint.Port { - endpoint.EndpointAuth = e.EndpointAuth - endpoint.EndpointScrapeParams = e.EndpointScrapeParams - endpoint.EndpointRelabelings = e.EndpointRelabelings + if e.Port == *scrape.Spec.PodMetricsEndpoints[0].Port { + scrape.Spec.PodMetricsEndpoints[0].EndpointScrapeParams = e.EndpointScrapeParams + scrape.Spec.PodMetricsEndpoints[0].EndpointRelabelings = e.EndpointRelabelings continue } scrape.Spec.PodMetricsEndpoints = append(scrape.Spec.PodMetricsEndpoints, vmv1beta1.PodMetricsEndpoint{ @@ -192,5 +193,25 @@ func VMPodScrape(b podScrapeBuilder, portName string) *vmv1beta1.VMPodScrape { 
scrape.Spec.SeriesLimit = serviceScrapeSpec.SeriesLimit scrape.Spec.AttachMetadata = serviceScrapeSpec.AttachMetadata } + for i := range scrape.Spec.PodMetricsEndpoints { + addVictoriaMetricsAppRelabelConfig(&scrape.Spec.PodMetricsEndpoints[i].EndpointRelabelings) + } return scrape } + +func addVictoriaMetricsAppRelabelConfig(relabelings *vmv1beta1.EndpointRelabelings) { + for _, rc := range relabelings.RelabelConfigs { + if rc != nil && (rc.TargetLabel == "victoriametrics_app" || rc.UnderScoreTargetLabel == "victoriametrics_app") { + return + } + } + relabelings.RelabelConfigs = append(relabelings.RelabelConfigs, victoriaMetricsAppRelabelConfig()) +} + +func victoriaMetricsAppRelabelConfig() *vmv1beta1.RelabelConfig { + return &vmv1beta1.RelabelConfig{ + TargetLabel: "victoriametrics_app", + UnderScoreTargetLabel: "victoriametrics_app", + Replacement: ptr.To("true"), + } +} diff --git a/internal/controller/operator/factory/build/vmscrape_test.go b/internal/controller/operator/factory/build/vmscrape_test.go index 092f8777c..3e77f1603 100644 --- a/internal/controller/operator/factory/build/vmscrape_test.go +++ b/internal/controller/operator/factory/build/vmscrape_test.go @@ -8,6 +8,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/utils/ptr" + vmv1 "github.com/VictoriaMetrics/operator/api/operator/v1" vmv1beta1 "github.com/VictoriaMetrics/operator/api/operator/v1beta1" ) @@ -32,7 +33,24 @@ func (tb *testScrapeObject) GetExtraArgs() map[string]string { return tb.extraArgs } +func (tb *testScrapeObject) GetNamespace() string { + return "default" +} + +func (tb *testScrapeObject) PrefixedName() string { + return "test" +} + +func (tb *testScrapeObject) SelectorLabels() map[string]string { + return map[string]string{"app": "test"} +} + +func (tb *testScrapeObject) AsOwner() metav1.OwnerReference { + return metav1.OwnerReference{Name: "test"} +} + func TestVMServiceScrapeForServiceWithSpec(t *testing.T) { + vmAppRelabel := 
[]*vmv1beta1.RelabelConfig{victoriaMetricsAppRelabelConfig()} type opts struct { spec testScrapeObject service *corev1.Service @@ -67,6 +85,9 @@ func TestVMServiceScrapeForServiceWithSpec(t *testing.T) { }, wantServiceScrapeSpec: vmv1beta1.VMServiceScrapeSpec{ Endpoints: []vmv1beta1.Endpoint{{ + EndpointRelabelings: vmv1beta1.EndpointRelabelings{ + RelabelConfigs: vmAppRelabel, + }, EndpointScrapeParams: vmv1beta1.EndpointScrapeParams{ Path: "/metrics", }, @@ -101,6 +122,9 @@ func TestVMServiceScrapeForServiceWithSpec(t *testing.T) { spec: testScrapeObject{}, wantServiceScrapeSpec: vmv1beta1.VMServiceScrapeSpec{ Endpoints: []vmv1beta1.Endpoint{{ + EndpointRelabelings: vmv1beta1.EndpointRelabelings{ + RelabelConfigs: vmAppRelabel, + }, EndpointScrapeParams: vmv1beta1.EndpointScrapeParams{ Path: "/metrics", }, @@ -137,6 +161,9 @@ func TestVMServiceScrapeForServiceWithSpec(t *testing.T) { wantServiceScrapeSpec: vmv1beta1.VMServiceScrapeSpec{ Endpoints: []vmv1beta1.Endpoint{ { + EndpointRelabelings: vmv1beta1.EndpointRelabelings{ + RelabelConfigs: vmAppRelabel, + }, EndpointScrapeParams: vmv1beta1.EndpointScrapeParams{ Path: "/metrics", }, @@ -153,7 +180,7 @@ func TestVMServiceScrapeForServiceWithSpec(t *testing.T) { TargetLabel: "job", Regex: vmv1beta1.StringOrArray{"(.+)"}, Replacement: ptr.To("${1}-vmbackup"), - }}, + }, victoriaMetricsAppRelabelConfig()}, }, }, }, @@ -188,6 +215,9 @@ func TestVMServiceScrapeForServiceWithSpec(t *testing.T) { }, wantServiceScrapeSpec: vmv1beta1.VMServiceScrapeSpec{ Endpoints: []vmv1beta1.Endpoint{{ + EndpointRelabelings: vmv1beta1.EndpointRelabelings{ + RelabelConfigs: vmAppRelabel, + }, EndpointScrapeParams: vmv1beta1.EndpointScrapeParams{ Path: "/metrics", }, @@ -245,12 +275,18 @@ func TestVMServiceScrapeForServiceWithSpec(t *testing.T) { wantServiceScrapeSpec: vmv1beta1.VMServiceScrapeSpec{ Endpoints: []vmv1beta1.Endpoint{ { + EndpointRelabelings: vmv1beta1.EndpointRelabelings{ + RelabelConfigs: vmAppRelabel, + }, 
EndpointScrapeParams: vmv1beta1.EndpointScrapeParams{ Path: "/metrics", }, Port: "sidecar", }, { + EndpointRelabelings: vmv1beta1.EndpointRelabelings{ + RelabelConfigs: vmAppRelabel, + }, EndpointScrapeParams: vmv1beta1.EndpointScrapeParams{ Path: "/metrics", ScrapeInterval: "30s", @@ -295,6 +331,9 @@ func TestVMServiceScrapeForServiceWithSpec(t *testing.T) { }, wantServiceScrapeSpec: vmv1beta1.VMServiceScrapeSpec{ Endpoints: []vmv1beta1.Endpoint{{ + EndpointRelabelings: vmv1beta1.EndpointRelabelings{ + RelabelConfigs: vmAppRelabel, + }, EndpointScrapeParams: vmv1beta1.EndpointScrapeParams{ Path: "/metrics", Params: map[string][]string{"authKey": {"some-access-key"}}, @@ -317,3 +356,95 @@ func TestVMServiceScrapeForServiceWithSpec(t *testing.T) { }, }) } + +func TestVMServiceScrapeAddsVictoriaMetricsAppLabel(t *testing.T) { + service := &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{Name: "test"}, + Spec: corev1.ServiceSpec{Ports: []corev1.ServicePort{ + {Name: "http"}, + {Name: "extra"}, + }}, + } + spec := testScrapeObject{serviceScrapeSpecTemplate: &vmv1beta1.VMServiceScrapeSpec{ + Endpoints: []vmv1beta1.Endpoint{ + {Port: "http"}, + {Port: "custom"}, + }, + }} + + scrape := VMServiceScrape(service, &spec, "extra") + + assert.Len(t, scrape.Spec.Endpoints, 3) + for i := range scrape.Spec.Endpoints { + assert.Contains(t, scrape.Spec.Endpoints[i].RelabelConfigs, victoriaMetricsAppRelabelConfig()) + } + +} + +func TestVMPodScrapeAddsVictoriaMetricsAppLabel(t *testing.T) { + spec := testScrapeObject{serviceScrapeSpecTemplate: &vmv1beta1.VMServiceScrapeSpec{ + Endpoints: []vmv1beta1.Endpoint{ + { + Port: "http", + EndpointScrapeParams: vmv1beta1.EndpointScrapeParams{ + Path: "/custom", + }, + }, + {Port: "extra"}, + }, + }} + + podScrape := VMPodScrape(&spec, "http") + + assert.Len(t, podScrape.Spec.PodMetricsEndpoints, 2) + assert.Equal(t, "/custom", podScrape.Spec.PodMetricsEndpoints[0].Path) + for i := range podScrape.Spec.PodMetricsEndpoints { + 
assert.Contains(t, podScrape.Spec.PodMetricsEndpoints[i].RelabelConfigs, victoriaMetricsAppRelabelConfig()) + } +} + +func TestVMServiceScrapeObjectsAddVictoriaMetricsAppLabel(t *testing.T) { + objectMeta := metav1.ObjectMeta{Name: "test", Namespace: "default"} + + f := func(name string, builder scrapeBuilder) { + service := &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{Name: name}, + Spec: corev1.ServiceSpec{Ports: []corev1.ServicePort{ + {Name: "http"}, + }}, + } + + scrape := VMServiceScrape(service, builder) + + assert.Len(t, scrape.Spec.Endpoints, 1) + assert.Contains(t, scrape.Spec.Endpoints[0].RelabelConfigs, victoriaMetricsAppRelabelConfig()) + } + f("VMSingle", &vmv1beta1.VMSingle{ObjectMeta: objectMeta}) + f("VMAlert", &vmv1beta1.VMAlert{ObjectMeta: objectMeta}) + f("VMAuth", &vmv1beta1.VMAuth{ObjectMeta: objectMeta}) + f("VMSelect", &vmv1beta1.VMSelect{}) + f("VMInsert", &vmv1beta1.VMInsert{}) + f("VMStorage", &vmv1beta1.VMStorage{}) + f("VLSingle", &vmv1.VLSingle{ObjectMeta: objectMeta}) + f("VLSelect", &vmv1.VLSelect{}) + f("VLInsert", &vmv1.VLInsert{}) + f("VLStorage", &vmv1.VLStorage{}) + f("VTSingle", &vmv1.VTSingle{ObjectMeta: objectMeta}) + f("VTSelect", &vmv1.VTSelect{}) + f("VTInsert", &vmv1.VTInsert{}) + f("VTStorage", &vmv1.VTStorage{}) +} + +func TestVMPodScrapeObjectsAddVictoriaMetricsAppLabel(t *testing.T) { + objectMeta := metav1.ObjectMeta{Name: "test", Namespace: "default"} + + f := func(builder podScrapeBuilder, port string) { + scrape := VMPodScrape(builder, port) + + assert.Len(t, scrape.Spec.PodMetricsEndpoints, 1) + assert.Contains(t, scrape.Spec.PodMetricsEndpoints[0].RelabelConfigs, victoriaMetricsAppRelabelConfig()) + } + f(&vmv1beta1.VMAgent{ObjectMeta: objectMeta}, "http") + f(&vmv1.VLAgent{ObjectMeta: objectMeta}, "http") + f(&vmv1.VMAnomaly{ObjectMeta: objectMeta}, "monitoring-http") +} diff --git a/internal/controller/operator/factory/reconcile/statefulset.go b/internal/controller/operator/factory/reconcile/statefulset.go 
index c46c26e8b..09c2e4f0d 100644 --- a/internal/controller/operator/factory/reconcile/statefulset.go +++ b/internal/controller/operator/factory/reconcile/statefulset.go @@ -212,13 +212,32 @@ type rollingUpdateOpts struct { delete bool } +// patchSTSCurrentRevision patches statefulset status.currentRevision to match status.updateRevision +// after all pods are updated. This is needed because Kubernetes does not update currentRevision +// for OnDelete strategy, leaving stale data visible to monitoring tools. +// See https://github.com/VictoriaMetrics/operator/issues/1242 +func patchSTSCurrentRevision(ctx context.Context, rclient client.Client, nsn types.NamespacedName, updateRevision string, replicas int32) error { + var sts appsv1.StatefulSet + if err := rclient.Get(ctx, nsn, &sts); err != nil { + return fmt.Errorf("cannot get statefulset for status update: %w", err) + } + if sts.Status.CurrentRevision == updateRevision { + return nil + } + logger.WithContext(ctx).Info(fmt.Sprintf("updating statefulset=%s/%s status currentRevision from %q to %q", nsn.Namespace, nsn.Name, sts.Status.CurrentRevision, updateRevision)) + sts.Status.CurrentRevision = updateRevision + // currentReplicas is not updated with OnDelete strategy too + sts.Status.CurrentReplicas = replicas + if err := rclient.Status().Update(ctx, &sts); err != nil { + return fmt.Errorf("cannot update statefulset=%s/%s currentRevision status: %w", nsn.Namespace, nsn.Name, err) + } + return nil +} + // we perform rolling update on sts by manually evicting pods one by one or in batches // we check sts revision (kubernetes controller-manager is responsible for that) // and compare pods revision label with sts revision // if it doesn't match - updated is needed -// -// we always check if sts.Status.CurrentRevision needs update, to keep it equal to UpdateRevision -// see https://github.com/kubernetes/kube-state-metrics/issues/1324#issuecomment-1779751992 func performRollingUpdateOnSts(ctx context.Context, rclient 
client.Client, obj *appsv1.StatefulSet, o rollingUpdateOpts) error { time.Sleep(podWaitReadyInterval) nsn := types.NamespacedName{ @@ -278,10 +297,10 @@ func performRollingUpdateOnSts(ctx context.Context, rclient client.Client, obj * return fmt.Errorf("actual pod count: %d less than needed: %d, possible statefulset misconfiguration", totalPodsCount, neededPodCount) } - updatedNeeded := len(podsForUpdate) != 0 || len(updatedPods) != 0 - if !updatedNeeded { + updateNeeded := len(podsForUpdate) != 0 || len(updatedPods) != 0 + if !updateNeeded { l.V(1).Info("no pod needs to be updated") - return nil + return patchSTSCurrentRevision(ctx, rclient, nsn, stsVersion, int32(neededPodCount)) } l.Info(fmt.Sprintf("discovered already updated pods=%d, pods needed to be update=%d", len(updatedPods), len(podsForUpdate))) @@ -355,7 +374,7 @@ func performRollingUpdateOnSts(ctx context.Context, rclient client.Client, obj * l.Info(fmt.Sprintf("finished statefulset update from revision=%q to revision=%q", sts.Status.CurrentRevision, stsVersion)) - return nil + return patchSTSCurrentRevision(ctx, rclient, nsn, stsVersion, int32(neededPodCount)) } // PodIsReady check is pod is ready diff --git a/internal/controller/operator/factory/reconcile/status.go b/internal/controller/operator/factory/reconcile/status.go index d269b96f8..32aa6e300 100644 --- a/internal/controller/operator/factory/reconcile/status.go +++ b/internal/controller/operator/factory/reconcile/status.go @@ -11,6 +11,7 @@ import ( "time" "k8s.io/apimachinery/pkg/api/equality" + k8serrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" utilerrors "k8s.io/apimachinery/pkg/util/errors" @@ -87,6 +88,9 @@ func updateChildStatusConditions[T any, PT interface { return retryOnConflict(func() error { dst := PT(new(T)) if err := rclient.Get(ctx, nsn, dst); err != nil { + if k8serrors.IsNotFound(err) { + return nil + } return err } st := dst.GetStatusMetadata() @@ -98,6 
+102,9 @@ func updateChildStatusConditions[T any, PT interface { writeAggregatedStatus(st, vmv1beta1.ConditionDomainTypeAppliedSuffix) if !reflect.DeepEqual(prevSt, st) { if err := rclient.Status().Update(ctx, dst); err != nil { + if k8serrors.IsNotFound(err) { + return nil + } return fmt.Errorf("failed to patch status of broken VMAlertmanagerConfig=%q: %w", childObject.GetName(), err) } } diff --git a/internal/controller/operator/factory/vlagent/vlagent_test.go b/internal/controller/operator/factory/vlagent/vlagent_test.go index 2bacd3917..b9bffbb23 100644 --- a/internal/controller/operator/factory/vlagent/vlagent_test.go +++ b/internal/controller/operator/factory/vlagent/vlagent_test.go @@ -949,7 +949,7 @@ serviceaccountname: vlagent-agent Spec: vmv1.VLAgentSpec{ CommonAppsParams: vmv1beta1.CommonAppsParams{ Image: vmv1beta1.Image{ - Tag: "v1.48.0", + Tag: "v1.51.0", }, UseDefaultResources: ptr.To(false), Port: "9425", @@ -975,7 +975,7 @@ serviceaccountname: vlagent-agent }, []runtime.Object{}, ` containers: - name: vlagent - image: victoriametrics/vlagent:v1.48.0 + image: victoriametrics/vlagent:v1.51.0 args: - -httpListenAddr=:9425 - -kubernetesCollector diff --git a/internal/controller/operator/factory/vlcluster/vlinsert.go b/internal/controller/operator/factory/vlcluster/vlinsert.go index bae883096..900d05893 100644 --- a/internal/controller/operator/factory/vlcluster/vlinsert.go +++ b/internal/controller/operator/factory/vlcluster/vlinsert.go @@ -79,6 +79,7 @@ func createOrUpdateVLInsertDeployment(ctx context.Context, rclient client.Client PatchSpec: func(existingSpec, newSpec *appsv1.DeploymentSpec) { if cr.Spec.VLInsert.HPA != nil { newSpec.Replicas = existingSpec.Replicas + cr.Spec.VLInsert.ReplicaCount = existingSpec.Replicas } }, } @@ -138,16 +139,14 @@ func buildVLInsertPodSpec(cr *vmv1.VLCluster) (*corev1.PodTemplateSpec, error) { args = append(args, fmt.Sprintf("-loggerFormat=%s", cr.Spec.VLInsert.LogFormat)) } - if cr.Spec.VLStorage != nil && 
cr.Spec.VLStorage.ReplicaCount != nil { - storageNodeFlag := build.NewFlag("-storageNode", "") - storageNodeIds := cr.AvailableStorageNodeIDs("insert") - for idx, i := range storageNodeIds { - // TODO: introduce TLS webserver config for storage nodes - storageNodeFlag.Add(build.PodDNSAddress(cr.PrefixedName(vmv1beta1.ClusterComponentStorage), i, cr.Namespace, cr.Spec.VLStorage.Port, cr.Spec.ClusterDomainName), idx) - } - totalNodes := len(storageNodeIds) - args = build.AppendFlagsToArgs(args, totalNodes, storageNodeFlag) + storageNodeFlag := build.NewFlag("-storageNode", "") + storageNodeIds := cr.AvailableStorageNodeIDs(vmv1beta1.ClusterComponentInsert) + for idx, i := range storageNodeIds { + // TODO: introduce TLS webserver config for storage nodes + storageNodeFlag.Add(build.PodDNSAddress(cr.PrefixedName(vmv1beta1.ClusterComponentStorage), i, cr.Namespace, cr.Spec.VLStorage.Port, cr.Spec.ClusterDomainName), idx) } + totalNodes := len(storageNodeIds) + args = build.AppendFlagsToArgs(args, totalNodes, storageNodeFlag) if len(cr.Spec.VLInsert.ExtraEnvs) > 0 || len(cr.Spec.VLInsert.ExtraEnvsFrom) > 0 { args = append(args, "-envflag.enable=true") } diff --git a/internal/controller/operator/factory/vlcluster/vlselect.go b/internal/controller/operator/factory/vlcluster/vlselect.go index 1a5cab8fa..1b9f344e8 100644 --- a/internal/controller/operator/factory/vlcluster/vlselect.go +++ b/internal/controller/operator/factory/vlcluster/vlselect.go @@ -190,6 +190,7 @@ func createOrUpdateVLSelectDeployment(ctx context.Context, rclient client.Client PatchSpec: func(existingSpec, newSpec *appsv1.DeploymentSpec) { if cr.Spec.VLSelect.HPA != nil { newSpec.Replicas = existingSpec.Replicas + cr.Spec.VLSelect.ReplicaCount = existingSpec.Replicas } }, } @@ -245,12 +246,9 @@ func buildVLSelectPodSpec(cr *vmv1.VLCluster) (*corev1.PodTemplateSpec, error) { } storageNodeFlag := build.NewFlag("-storageNode", "") - storageNodeIds := cr.AvailableStorageNodeIDs("select") - if 
cr.Spec.VLStorage != nil && cr.Spec.VLStorage.ReplicaCount != nil { - // TODO: check TLS - for idx, i := range storageNodeIds { - storageNodeFlag.Add(build.PodDNSAddress(cr.PrefixedName(vmv1beta1.ClusterComponentStorage), i, cr.Namespace, cr.Spec.VLStorage.Port, cr.Spec.ClusterDomainName), idx) - } + storageNodeIds := cr.AvailableStorageNodeIDs(vmv1beta1.ClusterComponentSelect) + for idx, i := range storageNodeIds { + storageNodeFlag.Add(build.PodDNSAddress(cr.PrefixedName(vmv1beta1.ClusterComponentStorage), i, cr.Namespace, cr.Spec.VLStorage.Port, cr.Spec.ClusterDomainName), idx) } if len(cr.Spec.VLSelect.ExtraStorageNodes) > 0 { for i, node := range cr.Spec.VLSelect.ExtraStorageNodes { diff --git a/internal/controller/operator/factory/vlcluster/vlstorage.go b/internal/controller/operator/factory/vlcluster/vlstorage.go index 2a2b491b4..6d789f52e 100644 --- a/internal/controller/operator/factory/vlcluster/vlstorage.go +++ b/internal/controller/operator/factory/vlcluster/vlstorage.go @@ -172,6 +172,7 @@ func createOrUpdateVLStorageSTS(ctx context.Context, rclient client.Client, cr, PatchSpec: func(existingSpec, newSpec *appsv1.StatefulSetSpec) { if cr.Spec.VLStorage.HPA != nil { newSpec.Replicas = existingSpec.Replicas + cr.Spec.VLStorage.ReplicaCount = existingSpec.Replicas } }, } diff --git a/internal/controller/operator/factory/vmagent/nodescrape.go b/internal/controller/operator/factory/vmagent/nodescrape.go index f73444e5b..6cdf5c690 100644 --- a/internal/controller/operator/factory/vmagent/nodescrape.go +++ b/internal/controller/operator/factory/vmagent/nodescrape.go @@ -105,7 +105,6 @@ func generateNodeScrapeConfig( for _, trc := range sp.NodeScrapeRelabelTemplate { relabelings = append(relabelings, generateRelabelConfig(trc)) } - // Because of security risks, whenever enforcedNamespaceLabel is set, we want to append it to the // relabel_configs as the last relabeling, to ensure it overrides any other relabelings. 
relabelings = enforceNamespaceLabel(relabelings, sc.Namespace, se.EnforcedNamespaceLabel) diff --git a/internal/controller/operator/factory/vmalertmanager/vmalertmanager_reconcile_test.go b/internal/controller/operator/factory/vmalertmanager/vmalertmanager_reconcile_test.go index d5c499260..9c310499e 100644 --- a/internal/controller/operator/factory/vmalertmanager/vmalertmanager_reconcile_test.go +++ b/internal/controller/operator/factory/vmalertmanager/vmalertmanager_reconcile_test.go @@ -160,7 +160,8 @@ func Test_CreateOrUpdate_Actions(t *testing.T) { {Verb: "Get", Kind: "Service", Resource: vmalertmanagerName}, {Verb: "Get", Kind: "VMServiceScrape", Resource: vmalertmanagerName}, {Verb: "Get", Kind: "StatefulSet", Resource: vmalertmanagerName}, - {Verb: "Get", Kind: "StatefulSet", Resource: vmalertmanagerName}, + {Verb: "Get", Kind: "StatefulSet", Resource: vmalertmanagerName}, // getLatestStsState + {Verb: "Get", Kind: "StatefulSet", Resource: vmalertmanagerName}, // patchSTSCurrentRevision }, }) @@ -190,7 +191,8 @@ func Test_CreateOrUpdate_Actions(t *testing.T) { {Verb: "Get", Kind: "Service", Resource: vmalertmanagerName}, {Verb: "Get", Kind: "VMServiceScrape", Resource: vmalertmanagerName}, {Verb: "Get", Kind: "StatefulSet", Resource: vmalertmanagerName}, - {Verb: "Get", Kind: "StatefulSet", Resource: vmalertmanagerName}, + {Verb: "Get", Kind: "StatefulSet", Resource: vmalertmanagerName}, // getLatestStsState + {Verb: "Get", Kind: "StatefulSet", Resource: vmalertmanagerName}, // patchSTSCurrentRevision }, }) } diff --git a/internal/controller/operator/factory/vmanomaly/config.go b/internal/controller/operator/factory/vmanomaly/config.go index 5e66e4369..e7a065cda 100644 --- a/internal/controller/operator/factory/vmanomaly/config.go +++ b/internal/controller/operator/factory/vmanomaly/config.go @@ -25,7 +25,7 @@ func createOrUpdateConfig(ctx context.Context, rclient client.Client, cr, prevCR newSecretConfig := &corev1.Secret{ ObjectMeta: 
build.ResourceMeta(build.SecretConfigResourceKind, cr), Data: map[string][]byte{ - secretConfigKey: data, + configEnvsubstFilename: data, }, } owner := cr.AsOwner() diff --git a/internal/controller/operator/factory/vmanomaly/config/config.go b/internal/controller/operator/factory/vmanomaly/config/config.go index 7f9e88d60..38c1149c5 100644 --- a/internal/controller/operator/factory/vmanomaly/config/config.go +++ b/internal/controller/operator/factory/vmanomaly/config/config.go @@ -21,6 +21,32 @@ type validatable interface { validate() error } +type PartialConfig struct { + Schedulers map[string]*scheduler `yaml:"schedulers,omitempty"` + Models map[string]*model `yaml:"models,omitempty"` + Queries map[string]*query `yaml:"queries,omitempty"` +} + +func (pc *PartialConfig) Validate() error { + for name, s := range pc.Schedulers { + if s == nil { + return fmt.Errorf("scheduler=%q is nil", name) + } + if err := s.validate(); err != nil { + return fmt.Errorf("failed to validate scheduler=%q: %w", name, err) + } + } + for name, m := range pc.Models { + if m == nil { + return fmt.Errorf("model=%q is nil", name) + } + if err := m.validate(); err != nil { + return fmt.Errorf("failed to validate model=%q: %w", name, err) + } + } + return nil +} + type config struct { Schedulers map[string]*scheduler `yaml:"schedulers,omitempty"` Models map[string]*model `yaml:"models,omitempty"` @@ -33,30 +59,39 @@ type config struct { } type server struct { - Addr string `yaml:"addr,omitempty"` - Port string `yaml:"port,omitempty"` - PathPrefix string `yaml:"path_prefix,omitempty"` - MaxConcurrentTasks int `yaml:"max_concurrent_tasks,omitempty"` - UIDefaultState string `yaml:"ui_default_state,omitempty"` + Addr string `yaml:"addr,omitempty"` + Port string `yaml:"port,omitempty"` + PathPrefix string `yaml:"path_prefix,omitempty"` + MaxConcurrentTasks int `yaml:"max_concurrent_tasks,omitempty"` + UIDefaultState string `yaml:"ui_default_state,omitempty"` + UseReaderConnectionSettings bool 
`yaml:"use_reader_connection_settings,omitempty"` } func (s *server) validate() error { if s == nil { return nil } - if s.MaxConcurrentTasks != 0 && (s.MaxConcurrentTasks < 1 || s.MaxConcurrentTasks > 20) { - return fmt.Errorf("max_concurrent_tasks must be between 1 and 20, got %d", s.MaxConcurrentTasks) + if s.MaxConcurrentTasks < 0 { + return fmt.Errorf("max_concurrent_tasks must be a positive integer, got %d", s.MaxConcurrentTasks) } return nil } +type retention struct { + TTL duration `yaml:"ttl,omitempty"` + CheckInterval duration `yaml:"check_interval,omitempty"` +} + type settings struct { - Workers int `yaml:"n_workers,omitempty"` - ScoreOutsideRange float64 `yaml:"anomaly_score_outside_data_range,omitempty"` - RestoreState bool `yaml:"restore_state,omitempty"` + Workers int `yaml:"n_workers,omitempty"` + // ScoreOutsideRange is a pointer so an explicit 0.0 survives marshalling. + ScoreOutsideRange *float64 `yaml:"anomaly_score_outside_data_range,omitempty"` + RestoreState bool `yaml:"restore_state,omitempty"` + Retention *retention `yaml:"retention,omitempty"` + LoggerLevels map[string]string `yaml:"logger_levels,omitempty"` } -func (c *config) override(cr *vmv1.VMAnomaly, ac *build.AssetsCache) error { +func (c *config) build(cr *vmv1.VMAnomaly, ac *build.AssetsCache) error { crCanonicalName := strings.Join([]string{cr.Namespace, cr.Name}, "/") if cr.Spec.Server != nil { srv := cr.Spec.Server @@ -72,6 +107,8 @@ func (c *config) override(cr *vmv1.VMAnomaly, ac *build.AssetsCache) error { } c.Preset = strings.ToLower(c.Preset) if strings.HasPrefix(c.Preset, "ui") { + s := new(noopScheduler) + s.setClass("noop") c.Reader = &reader{ Class: "noop", } @@ -80,9 +117,7 @@ func (c *config) override(cr *vmv1.VMAnomaly, ac *build.AssetsCache) error { } c.Schedulers = map[string]*scheduler{ "noop": { - validatable: &noopScheduler{ - Class: "noop", - }, + anomalyScheduler: s, }, } c.Models = map[string]*model{ @@ -98,7 +133,7 @@ func (c *config) override(cr 
*vmv1.VMAnomaly, ac *build.AssetsCache) error { c.Monitoring = &monitoring{ Pull: &endpoint{ Addr: "0.0.0.0", - Port: cr.Spec.Monitoring.Pull.Port, + Port: cr.ProbePort(), }, } return nil @@ -166,6 +201,7 @@ func (c *config) override(cr *vmv1.VMAnomaly, ac *build.AssetsCache) error { } c.Monitoring = &m } + return nil } @@ -285,9 +321,15 @@ type clientConfig struct { Password string `yaml:"password,omitempty"` BearerToken string `yaml:"bearer_token,omitempty"` BearerTokenFile string `yaml:"bearer_token_file,omitempty"` - VerifyTLS bool `yaml:"verify_tls,omitempty"` - TLSCertFile string `yaml:"tls_cert_file,omitempty"` - TLSKeyFile string `yaml:"tls_key_file,omitempty"` + // VerifyTLS mirrors vmanomaly's `verify_tls` option, which is overloaded: + // false disables verification, true uses the system CA store and a string + // is treated as a path to the CA bundle to verify against. + // See: + // https://docs.victoriametrics.com/anomaly-detection/components/writer/#config-parameters + // https://docs.victoriametrics.com/anomaly-detection/components/reader/#config-parameters + VerifyTLS any `yaml:"verify_tls,omitempty"` + TLSCertFile string `yaml:"tls_cert_file,omitempty"` + TLSKeyFile string `yaml:"tls_key_file,omitempty"` } func (c *clientConfig) override(cr *vmv1.VMAnomaly, cfg *vmv1.VMAnomalyHTTPClientSpec, ac *build.AssetsCache) error { @@ -298,7 +340,15 @@ func (c *clientConfig) override(cr *vmv1.VMAnomaly, cfg *vmv1.VMAnomalyHTTPClien } c.TLSCertFile = creds.CertFile c.TLSKeyFile = creds.KeyFile - c.VerifyTLS = !cfg.TLSConfig.InsecureSkipVerify + switch { + case cfg.TLSConfig.InsecureSkipVerify: + c.VerifyTLS = false + case creds.CAFile != "": + // vmanomaly expects the CA bundle path to be passed via `verify_tls`. 
+ c.VerifyTLS = creds.CAFile + default: + c.VerifyTLS = true + } } if cfg.BasicAuth != nil { creds, err := ac.BuildBasicAuthCreds(cr.Namespace, cfg.BasicAuth) @@ -339,7 +389,7 @@ func Load(cr *vmv1.VMAnomaly, ac *build.AssetsCache) ([]byte, error) { if err != nil { return nil, fmt.Errorf("failed to unmarshal anomaly configuration, name=%q: %w", cr.Name, err) } - if err = c.override(cr, ac); err != nil { + if err = c.build(cr, ac); err != nil { return nil, fmt.Errorf("failed to update secret values with values from anomaly instance, name=%q: %w", cr.Name, err) } if err = c.validate(); err != nil { diff --git a/internal/controller/operator/factory/vmanomaly/config/config_test.go b/internal/controller/operator/factory/vmanomaly/config/config_test.go index 366009e58..2fff9f8d5 100644 --- a/internal/controller/operator/factory/vmanomaly/config/config_test.go +++ b/internal/controller/operator/factory/vmanomaly/config/config_test.go @@ -179,6 +179,7 @@ settings: "label2": "value2", }, }, + ConnectionRetryAttempts: 3, VMAnomalyHTTPClientSpec: vmv1.VMAnomalyHTTPClientSpec{ TenantID: "0:2", TLSConfig: &vmv1beta1.TLSConfig{ @@ -211,6 +212,7 @@ settings: DatasourceURL: "http://custom.ds", QueryRangePath: "/api/v1/query_range", SamplingPeriod: "10s", + Offset: "5m", VMAnomalyHTTPClientSpec: vmv1.VMAnomalyHTTPClientSpec{ TenantID: "0:1", TLSConfig: &vmv1beta1.TLSConfig{ @@ -272,6 +274,7 @@ reader: datasource_url: http://custom.ds sampling_period: 10s query_range_path: /api/v1/query_range + offset: 5m queries: test: expr: vm_metric @@ -279,7 +282,7 @@ reader: - "0" - inf tenant_id: "0:1" - verify_tls: true + verify_tls: /test/monitoring_tls_remote-ca tls_cert_file: /test/monitoring_tls_remote-cert tls_key_file: /test/monitoring_tls_remote-key writer: @@ -290,8 +293,9 @@ writer: for: custom_$QUERY_KEY label1: value1 label2: value2 + connection_retry_attempts: 3 tenant_id: "0:2" - verify_tls: true + verify_tls: /test/monitoring_tls_remote-ca tls_cert_file: 
/test/monitoring_tls_remote-cert tls_key_file: /test/monitoring_tls_remote-key monitoring: @@ -300,7 +304,7 @@ monitoring: push: url: http://monitoring tenant_id: "0:3" - verify_tls: true + verify_tls: /test/monitoring_tls_remote-ca tls_cert_file: /test/monitoring_tls_remote-cert tls_key_file: /test/monitoring_tls_remote-key push_frequency: 20s @@ -308,6 +312,306 @@ monitoring: label1: value1 settings: restore_state: true +server: + port: "8490" +`, + }) + + // TLS without a CA bundle and InsecureSkipVerify=false => verify_tls: true + f(opts{ + cr: &vmv1.VMAnomaly{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-anomaly", + Namespace: "monitoring", + }, + Spec: vmv1.VMAnomalySpec{ + License: &vmv1beta1.License{ + Key: ptr.To("test"), + }, + ConfigRawYaml: ` +models: + model_zscore: + class: 'zscore' + z_threshold: 2.5 + queries: ['test_query'] +schedulers: + scheduler_1m: + class: "scheduler.periodic.PeriodicScheduler" + infer_every: 1m + fit_every: 2m + fit_window: 3h +reader: + queries: + test_query: + expr: vm_metric +writer: + datasource_url: "http://test.com" +`, + Reader: &vmv1.VMAnomalyReadersSpec{ + DatasourceURL: "http://reader.test", + SamplingPeriod: "30s", + VMAnomalyHTTPClientSpec: vmv1.VMAnomalyHTTPClientSpec{ + TLSConfig: &vmv1beta1.TLSConfig{ + Cert: vmv1beta1.SecretOrConfigMap{ + Secret: &corev1.SecretKeySelector{ + LocalObjectReference: corev1.LocalObjectReference{Name: "tls"}, + Key: "cert", + }, + }, + KeySecret: &corev1.SecretKeySelector{ + LocalObjectReference: corev1.LocalObjectReference{Name: "tls"}, + Key: "key", + }, + }, + }, + }, + Writer: &vmv1.VMAnomalyWritersSpec{ + DatasourceURL: "http://writer.test", + }, + }, + }, + predefinedObjects: []runtime.Object{ + &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: "tls", + Namespace: "monitoring", + }, + Data: map[string][]byte{ + "cert": []byte("cert"), + "key": []byte("key"), + }, + }, + }, + expected: ` +models: + model_zscore: + class: zscore + queries: + - test_query + z_threshold: 
2.5 +schedulers: + scheduler_1m: + class: scheduler.periodic.PeriodicScheduler + fit_every: 2m + fit_window: 3h + infer_every: 1m +reader: + class: vm + datasource_url: http://reader.test + sampling_period: 30s + queries: + test_query: + expr: vm_metric + verify_tls: true + tls_cert_file: /test/monitoring_tls_cert + tls_key_file: /test/monitoring_tls_key +writer: + class: vm + datasource_url: http://writer.test +monitoring: + pull: + port: "8080" +server: + port: "8490" +`, + }) + + // InsecureSkipVerify=true takes precedence over a provided CA => verify_tls: false + f(opts{ + cr: &vmv1.VMAnomaly{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-anomaly", + Namespace: "monitoring", + }, + Spec: vmv1.VMAnomalySpec{ + License: &vmv1beta1.License{ + Key: ptr.To("test"), + }, + ConfigRawYaml: ` +models: + model_zscore: + class: 'zscore' + z_threshold: 2.5 + queries: ['test_query'] +schedulers: + scheduler_1m: + class: "scheduler.periodic.PeriodicScheduler" + infer_every: 1m + fit_every: 2m + fit_window: 3h +reader: + queries: + test_query: + expr: vm_metric +writer: + datasource_url: "http://test.com" +`, + Reader: &vmv1.VMAnomalyReadersSpec{ + DatasourceURL: "http://reader.test", + SamplingPeriod: "30s", + VMAnomalyHTTPClientSpec: vmv1.VMAnomalyHTTPClientSpec{ + TLSConfig: &vmv1beta1.TLSConfig{ + InsecureSkipVerify: true, + CA: vmv1beta1.SecretOrConfigMap{ + Secret: &corev1.SecretKeySelector{ + LocalObjectReference: corev1.LocalObjectReference{Name: "tls"}, + Key: "ca", + }, + }, + Cert: vmv1beta1.SecretOrConfigMap{ + Secret: &corev1.SecretKeySelector{ + LocalObjectReference: corev1.LocalObjectReference{Name: "tls"}, + Key: "cert", + }, + }, + KeySecret: &corev1.SecretKeySelector{ + LocalObjectReference: corev1.LocalObjectReference{Name: "tls"}, + Key: "key", + }, + }, + }, + }, + Writer: &vmv1.VMAnomalyWritersSpec{ + DatasourceURL: "http://writer.test", + }, + }, + }, + predefinedObjects: []runtime.Object{ + &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: 
"tls", + Namespace: "monitoring", + }, + Data: map[string][]byte{ + "ca": []byte("ca"), + "cert": []byte("cert"), + "key": []byte("key"), + }, + }, + }, + expected: ` +models: + model_zscore: + class: zscore + queries: + - test_query + z_threshold: 2.5 +schedulers: + scheduler_1m: + class: scheduler.periodic.PeriodicScheduler + fit_every: 2m + fit_window: 3h + infer_every: 1m +reader: + class: vm + datasource_url: http://reader.test + sampling_period: 30s + queries: + test_query: + expr: vm_metric + verify_tls: false + tls_cert_file: /test/monitoring_tls_cert + tls_key_file: /test/monitoring_tls_key +writer: + class: vm + datasource_url: http://writer.test +monitoring: + pull: + port: "8080" +server: + port: "8490" +`, + }) + + // with settings including retention + f(opts{ + cr: &vmv1.VMAnomaly{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-anomaly", + Namespace: "monitoring", + }, + Spec: vmv1.VMAnomalySpec{ + License: &vmv1beta1.License{ + Key: ptr.To("test"), + }, + ConfigRawYaml: ` +models: + model_zscore: + class: 'zscore' + z_threshold: 2.5 + queries: ['test_query'] +schedulers: + scheduler_backtesting: + class: "backtesting" + fit_window: 3h + fit_every: 1h + from_s: 1000 + to_s: 2000 + exact: true + infer_every: 5m +reader: + queries: + test_query: + expr: vm_metric +writer: + datasource_url: "http://test.com" +settings: + restore_state: true + retention: + ttl: 24h + check_interval: 30m + logger_levels: + root: DEBUG +`, + Reader: &vmv1.VMAnomalyReadersSpec{ + DatasourceURL: "http://reader.test", + SamplingPeriod: "30s", + }, + Writer: &vmv1.VMAnomalyWritersSpec{ + DatasourceURL: "http://writer.test", + }, + }, + }, + expected: ` +models: + model_zscore: + class: zscore + queries: + - test_query + z_threshold: 2.5 +schedulers: + scheduler_backtesting: + class: backtesting + fit_window: 3h + from_iso: 0001-01-01T00:00:00Z + from_s: 1000 + to_iso: 0001-01-01T00:00:00Z + to_s: 2000 + fit_every: 1h + exact: true + infer_every: 5m +reader: + class: vm + 
datasource_url: http://reader.test + sampling_period: 30s + queries: + test_query: + expr: vm_metric +writer: + class: vm + datasource_url: http://writer.test +monitoring: + pull: + port: "8080" +settings: + restore_state: true + retention: + ttl: 24h + check_interval: 30m + logger_levels: + root: DEBUG +server: + port: "8490" `, }) @@ -349,10 +653,11 @@ writer: DatasourceURL: "http://writer.test", }, Server: &vmv1.VMAnomalyServerSpec{ - Addr: "127.0.0.1", - Port: "9090", - PathPrefix: "my-anomaly", - MaxConcurrentTasks: 10, + Addr: "127.0.0.1", + Port: "9090", + PathPrefix: "my-anomaly", + MaxConcurrentTasks: 10, + UseReaderConnectionSettings: true, }, }, }, @@ -387,6 +692,7 @@ server: port: "9090" path_prefix: my-anomaly max_concurrent_tasks: 10 + use_reader_connection_settings: true `, }) @@ -463,6 +769,8 @@ writer: monitoring: pull: port: "8080" +server: + port: "8490" `, }) @@ -485,6 +793,7 @@ models: scale: [0.5, 1.5] min_subseason: hourly decay: 0.5 + global_smoothing: 0.5 schedulers: scheduler_1m: class: "scheduler.periodic.PeriodicScheduler" @@ -495,6 +804,7 @@ reader: queries: test_query: expr: vm_metric + offset: 1m writer: datasource_url: "http://test.com" `, @@ -518,6 +828,7 @@ models: - 1.5 decay: 0.5 min_subseason: hourly + global_smoothing: 0.5 schedulers: scheduler_1m: class: scheduler.periodic.PeriodicScheduler @@ -531,16 +842,109 @@ reader: queries: test_query: expr: vm_metric + offset: 1m writer: class: vm datasource_url: http://writer.test monitoring: pull: port: "8080" +server: + port: "8490" `, }) - // server section validation error - maxConcurrentTasks out of range + // ui preset with nil monitoring - must not panic + f(opts{ + cr: &vmv1.VMAnomaly{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-anomaly-ui", + Namespace: "monitoring", + }, + Spec: vmv1.VMAnomalySpec{ + License: &vmv1beta1.License{ + Key: ptr.To("test"), + }, + ConfigRawYaml: `preset: ui`, + Server: &vmv1.VMAnomalyServerSpec{ + PathPrefix: "/", + }, + // Monitoring 
intentionally nil to reproduce the panic + }, + }, + expected: ` +models: + placeholder: + class: zscore + schedulers: + - noop +schedulers: + noop: + class: noop +reader: + class: noop + datasource_url: "" + sampling_period: null +writer: + class: noop + datasource_url: "" +monitoring: + pull: + addr: 0.0.0.0 + port: "8080" +server: + port: "8490" + path_prefix: / +preset: ui +`, + }) + + // ui preset with explicit monitoring pull port + f(opts{ + cr: &vmv1.VMAnomaly{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-anomaly-ui-monitoring", + Namespace: "monitoring", + }, + Spec: vmv1.VMAnomalySpec{ + License: &vmv1beta1.License{ + Key: ptr.To("test"), + }, + ConfigRawYaml: `preset: ui`, + Monitoring: &vmv1.VMAnomalyMonitoringSpec{ + Pull: &vmv1.VMAnomalyMonitoringPullSpec{ + Port: "9999", + }, + }, + }, + }, + expected: ` +models: + placeholder: + class: zscore + schedulers: + - noop +schedulers: + noop: + class: noop +reader: + class: noop + datasource_url: "" + sampling_period: null +writer: + class: noop + datasource_url: "" +monitoring: + pull: + addr: 0.0.0.0 + port: "9999" +server: + port: "8490" +preset: ui +`, + }) + + // server section validation error - maxConcurrentTasks must be a positive integer f(opts{ cr: &vmv1.VMAnomaly{ ObjectMeta: metav1.ObjectMeta{ @@ -578,10 +982,219 @@ writer: DatasourceURL: "http://writer.test", }, Server: &vmv1.VMAnomalyServerSpec{ - MaxConcurrentTasks: 25, // out of range (1-20) + MaxConcurrentTasks: -1, // negative is invalid; vmanomaly imposes no upper bound }, }, }, wantErr: true, }) + + // tz is serialized as a string (reader/query/scheduler), an explicit zero + // anomaly_score_outside_data_range survives marshalling, and an unset decay is omitted + f(opts{ + cr: &vmv1.VMAnomaly{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-anomaly", + Namespace: "monitoring", + }, + Spec: vmv1.VMAnomalySpec{ + License: &vmv1beta1.License{Key: ptr.To("test")}, + ConfigRawYaml: ` +settings: + anomaly_score_outside_data_range: 0 +models: 
+ m_online: + class: zscore_online + queries: ['q1'] +schedulers: + s1: + class: periodic + infer_every: 1m + fit_window: 1h + tz: "Europe/Kyiv" +reader: + queries: + q1: + expr: up + tz: "America/New_York" +writer: {} +`, + Reader: &vmv1.VMAnomalyReadersSpec{ + DatasourceURL: "http://reader.test", + SamplingPeriod: "30s", + Timezone: "UTC", + }, + Writer: &vmv1.VMAnomalyWritersSpec{ + DatasourceURL: "http://writer.test", + }, + }, + }, + expected: ` +models: + m_online: + class: zscore_online + queries: + - q1 +schedulers: + s1: + class: periodic + fit_window: 1h + infer_every: 1m + tz: Europe/Kyiv +reader: + class: vm + datasource_url: http://reader.test + sampling_period: 30s + tz: UTC + queries: + q1: + expr: up + tz: America/New_York +writer: + class: vm + datasource_url: http://writer.test +monitoring: + pull: + port: "8080" +settings: + anomaly_score_outside_data_range: 0 +server: + port: "8490" +`, + }) + + // contamination accepts a float + f(opts{ + cr: &vmv1.VMAnomaly{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-anomaly", + Namespace: "monitoring", + }, + Spec: vmv1.VMAnomalySpec{ + License: &vmv1beta1.License{Key: ptr.To("test")}, + ConfigRawYaml: ` +models: + m_iforest: + class: isolation_forest + queries: ['q1'] + contamination: 0.05 +schedulers: + s1: + class: periodic + infer_every: 1m + fit_window: 1h +reader: + queries: + q1: + expr: up +writer: {} +`, + Reader: &vmv1.VMAnomalyReadersSpec{ + DatasourceURL: "http://reader.test", + SamplingPeriod: "30s", + }, + Writer: &vmv1.VMAnomalyWritersSpec{ + DatasourceURL: "http://writer.test", + }, + Server: &vmv1.VMAnomalyServerSpec{ + MaxConcurrentTasks: 50, // no upper bound + }, + }, + }, + expected: ` +models: + m_iforest: + class: isolation_forest + queries: + - q1 + contamination: 0.05 +schedulers: + s1: + class: periodic + fit_window: 1h + infer_every: 1m +reader: + class: vm + datasource_url: http://reader.test + sampling_period: 30s + queries: + q1: + expr: up +writer: + class: vm + 
datasource_url: http://writer.test +monitoring: + pull: + port: "8080" +server: + port: "8490" + max_concurrent_tasks: 50 +`, + }) + + // contamination accepts the string "auto" + f(opts{ + cr: &vmv1.VMAnomaly{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-anomaly", + Namespace: "monitoring", + }, + Spec: vmv1.VMAnomalySpec{ + License: &vmv1beta1.License{Key: ptr.To("test")}, + ConfigRawYaml: ` +models: + m_iforest: + class: isolation_forest + queries: ['q1'] + contamination: auto +schedulers: + s1: + class: periodic + infer_every: 1m + fit_window: 1h +reader: + queries: + q1: + expr: up +writer: {} +`, + Reader: &vmv1.VMAnomalyReadersSpec{ + DatasourceURL: "http://reader.test", + SamplingPeriod: "30s", + }, + Writer: &vmv1.VMAnomalyWritersSpec{ + DatasourceURL: "http://writer.test", + }, + }, + }, + expected: ` +models: + m_iforest: + class: isolation_forest + queries: + - q1 + contamination: auto +schedulers: + s1: + class: periodic + fit_window: 1h + infer_every: 1m +reader: + class: vm + datasource_url: http://reader.test + sampling_period: 30s + queries: + q1: + expr: up +writer: + class: vm + datasource_url: http://writer.test +monitoring: + pull: + port: "8080" +server: + port: "8490" +`, + }) + } diff --git a/internal/controller/operator/factory/vmanomaly/config/models.go b/internal/controller/operator/factory/vmanomaly/config/models.go index 177e34718..8d49c9362 100644 --- a/internal/controller/operator/factory/vmanomaly/config/models.go +++ b/internal/controller/operator/factory/vmanomaly/config/models.go @@ -16,67 +16,55 @@ const ( ) type commonModelParams struct { - Class string `yaml:"class"` - Queries []string `yaml:"queries,omitempty"` - Schedulers []string `yaml:"schedulers,omitempty"` - ProvideSeries []string `yaml:"provide_series,omitempty"` - DetectionDirection modelDetectionDirection `yaml:"detection_direction,omitempty"` - MinDevFromExpected float64 `yaml:"min_dev_from_expected,omitempty"` - GroupBy []string `yaml:"groupby,omitempty"` - Scale 
[]float64 `yaml:"scale,omitempty"` - ClipPredictions bool `yaml:"clip_predictions,omitempty"` - ScoreOutsideDataRange float64 `yaml:"anomaly_score_outside_data_range,omitempty"` + Class string `yaml:"class"` + Queries []string `yaml:"queries,omitempty"` + Schedulers []string `yaml:"schedulers,omitempty"` + ProvideSeries []string `yaml:"provide_series,omitempty"` + DetectionDirection modelDetectionDirection `yaml:"detection_direction,omitempty"` + MinDevFromExpected float64 `yaml:"min_dev_from_expected,omitempty"` + GroupBy []string `yaml:"groupby,omitempty"` + Scale []float64 `yaml:"scale,omitempty"` + ClipPredictions bool `yaml:"clip_predictions,omitempty"` + // ScoreOutsideDataRange is a pointer so an explicit 0.0 survives marshalling. + ScoreOutsideDataRange *float64 `yaml:"anomaly_score_outside_data_range,omitempty"` } func (p commonModelParams) queries() []string { return p.Queries } +func (p commonModelParams) addPrefix(prefix string) { + for i := range p.Schedulers { + p.Schedulers[i] = fmt.Sprintf("%s-%s", prefix, p.Schedulers[i]) + } + for i := range p.Queries { + p.Queries[i] = fmt.Sprintf("%s-%s", prefix, p.Queries[i]) + } +} + func (p commonModelParams) schedulers() []string { return p.Schedulers } +func (p *commonModelParams) setClass(class string) { + p.Class = class +} + type anomalyModel interface { validatable + setClass(string) schedulers() []string queries() []string + addPrefix(string) } type model struct { anomalyModel } -var ( - _ yaml.Marshaler = (*model)(nil) - _ yaml.Unmarshaler = (*model)(nil) -) - -// MarshalYAML implements yaml.Marshaller interface -func (m *model) MarshalYAML() (any, error) { - return m.anomalyModel, nil -} - -type onlineModel struct { - Decay float64 `yaml:"decay,omitempty"` -} - -func (m *onlineModel) validate() error { - // See https://docs.victoriametrics.com/anomaly-detection/components/models/#decay - // Valid values are in the range [0, 1]. 
- if m.Decay < 0 || m.Decay > 1 { - return fmt.Errorf("decay must be in range [0, 1], got %f", m.Decay) - } - return nil -} - -// UnmarshalYAML implements yaml.Unmarshaler interface -func (m *model) UnmarshalYAML(unmarshal func(any) error) error { - var h header - if err := unmarshal(&h); err != nil { - return err - } +func (m *model) init(class string) error { var mdl anomalyModel - switch h.Class { + switch class { case "model.auto.AutoTunedModel", "auto": mdl = new(autoTunedModel) case "model.prophet.ProphetModel", "prophet": @@ -102,12 +90,57 @@ func (m *model) UnmarshalYAML(unmarshal func(any) error) error { case "model.isolation_forest.IsolationForestMultivariateModel", "isolation_forest_multivariate": mdl = new(isolationForestMultivariateModel) default: - return fmt.Errorf("model class=%q is not supported", h.Class) + return fmt.Errorf("model class=%q is not supported", class) } - if err := unmarshal(mdl); err != nil { + m.anomalyModel = mdl + return nil +} + +var ( + _ yaml.Marshaler = (*model)(nil) + _ yaml.Unmarshaler = (*model)(nil) +) + +// Validate validates raw config +func (m *model) Validate(data []byte) error { + if err := yaml.Unmarshal(data, m); err != nil { + return err + } + return m.validate() +} + +// MarshalYAML implements yaml.Marshaller interface +func (m *model) MarshalYAML() (any, error) { + return m.anomalyModel, nil +} + +// UnmarshalYAML implements yaml.Unmarshaler interface +func (m *model) UnmarshalYAML(unmarshal func(any) error) error { + var h header + if err := unmarshal(&h); err != nil { + return err + } + if err := m.init(h.Class); err != nil { + return err + } + if err := unmarshal(m.anomalyModel); err != nil { return err } - m.anomalyModel = mdl + return nil +} + +type onlineModel struct { + // Decay is a pointer to distinguish "unset" (omitted, vmanomaly applies its default) + // from an explicit value, which must be in the range (0, 1]. 
+ Decay *float64 `yaml:"decay,omitempty"` +} + +func (m *onlineModel) validate() error { + // See https://docs.victoriametrics.com/anomaly-detection/components/models/#decay + // Valid values are in the range (0, 1]; unset is allowed and defaulted by vmanomaly. + if m.Decay != nil && (*m.Decay <= 0 || *m.Decay > 1) { + return fmt.Errorf("decay must be in range (0, 1], got %f", *m.Decay) + } return nil } @@ -126,7 +159,7 @@ type autoTunedOptimizationParams struct { OptimizedBusinessParams []string `yaml:"optimized_business_params,omitempty"` Seed int `yaml:"seed,omitempty"` Splits int `yaml:"n_splits,omitempty"` - Trails int `yaml:"n_trails,omitempty"` + Trials int `yaml:"n_trials,omitempty"` Timeout *duration `yaml:"timeout,omitempty"` } @@ -144,9 +177,10 @@ func (m *holtWintersModel) validate() error { type isolationForestModel struct { commonModelParams `yaml:",inline"` - Contamination string `yaml:"contamination,omitempty"` - SeasonalFeatures []string `yaml:"seasonal_features,omitempty"` - Args map[string]any `yaml:"args,omitempty"` + // Contamination is a float (e.g. 0.01) or the string "auto"; vmanomaly accepts both. + Contamination any `yaml:"contamination,omitempty"` + SeasonalFeatures []string `yaml:"seasonal_features,omitempty"` + Args map[string]any `yaml:"args,omitempty"` } func (m *isolationForestModel) validate() error { @@ -155,9 +189,10 @@ func (m *isolationForestModel) validate() error { type isolationForestMultivariateModel struct { commonModelParams `yaml:",inline"` - Contamination string `yaml:"contamination,omitempty"` - SeasonalFeatures []string `yaml:"seasonal_features,omitempty"` - Args map[string]any `yaml:"args,omitempty"` + // Contamination is a float (e.g. 0.01) or the string "auto"; vmanomaly accepts both. 
+ Contamination any `yaml:"contamination,omitempty"` + SeasonalFeatures []string `yaml:"seasonal_features,omitempty"` + Args map[string]any `yaml:"args,omitempty"` } func (m *isolationForestMultivariateModel) validate() error { @@ -193,12 +228,13 @@ type onlineQuantileModel struct { onlineModel `yaml:",inline"` Quantiles []float64 `yaml:"quantiles,omitempty"` SeasonalInterval *duration `yaml:"seasonal_interval,omitempty"` - MinSubseason string `yaml:"min_subseason"` + MinSubseason string `yaml:"min_subseason,omitempty"` UseTransform bool `yaml:"use_transform,omitempty"` - GlobalSmoothing float64 `yaml:"global_smooth,omitempty"` + GlobalSmoothing float64 `yaml:"global_smoothing,omitempty"` SeasonStartsFrom time.Time `yaml:"season_starts_from,omitempty"` MinSamplesSeen int `yaml:"min_n_samples_seen,omitempty"` Compression int `yaml:"compression,omitempty"` + IqrThreshold float64 `yaml:"iqr_threshold,omitempty"` } func (m *onlineQuantileModel) validate() error { diff --git a/internal/controller/operator/factory/vmanomaly/config/readers.go b/internal/controller/operator/factory/vmanomaly/config/readers.go index 068c047d1..9d103c225 100644 --- a/internal/controller/operator/factory/vmanomaly/config/readers.go +++ b/internal/controller/operator/factory/vmanomaly/config/readers.go @@ -5,22 +5,22 @@ import ( "slices" "strconv" "strings" - "time" ) type reader struct { - Class string `yaml:"class"` - DatasourceURL string `yaml:"datasource_url"` - SamplingPeriod *duration `yaml:"sampling_period"` - QueryRangePath string `yaml:"query_range_path,omitempty"` - ExtraFilters []string `yaml:"extra_filters,omitempty"` - QueryFromLastSeenTimestamp bool `yaml:"query_from_last_seen_timestamp,omitempty"` - LatencyOffset *duration `yaml:"latency_offset,omitempty"` - MaxPointsPerQuery int `yaml:"max_points_per_query,omitempty"` - Timezone time.Location `yaml:"tz,omitempty"` - DataRange []string `yaml:"data_range,omitempty"` - Queries map[string]readerQuery `yaml:"queries,omitempty"` - 
ClientConfig clientConfig `yaml:",inline"` + Class string `yaml:"class"` + DatasourceURL string `yaml:"datasource_url"` + SamplingPeriod *duration `yaml:"sampling_period"` + QueryRangePath string `yaml:"query_range_path,omitempty"` + ExtraFilters []string `yaml:"extra_filters,omitempty"` + QueryFromLastSeenTimestamp bool `yaml:"query_from_last_seen_timestamp,omitempty"` + LatencyOffset *duration `yaml:"latency_offset,omitempty"` + Offset *duration `yaml:"offset,omitempty"` + MaxPointsPerQuery int `yaml:"max_points_per_query,omitempty"` + Timezone string `yaml:"tz,omitempty"` + DataRange []string `yaml:"data_range,omitempty"` + Queries map[string]*query `yaml:"queries,omitempty"` + ClientConfig clientConfig `yaml:",inline"` } func (r *reader) validate() error { @@ -56,11 +56,12 @@ func (r *reader) validate() error { return nil } -type readerQuery struct { - Expr string `yaml:"expr"` - Step *duration `yaml:"step,omitempty"` - DataRange []string `yaml:"data_range,omitempty"` - MaxPointsPerQuery int `yaml:"max_points_per_query,omitempty"` - TZ time.Location `yaml:"tz,omitempty"` - TenantID string `yaml:"tenant_id,omitempty"` +type query struct { + Expr string `yaml:"expr"` + Step *duration `yaml:"step,omitempty"` + DataRange []string `yaml:"data_range,omitempty"` + MaxPointsPerQuery int `yaml:"max_points_per_query,omitempty"` + TZ string `yaml:"tz,omitempty"` + TenantID string `yaml:"tenant_id,omitempty"` + Offset *duration `yaml:"offset,omitempty"` } diff --git a/internal/controller/operator/factory/vmanomaly/config/schedulers.go b/internal/controller/operator/factory/vmanomaly/config/schedulers.go index e425bd2e9..d2b7a5a03 100644 --- a/internal/controller/operator/factory/vmanomaly/config/schedulers.go +++ b/internal/controller/operator/factory/vmanomaly/config/schedulers.go @@ -7,8 +7,13 @@ import ( "gopkg.in/yaml.v2" ) -type scheduler struct { +type anomalyScheduler interface { validatable + setClass(string) +} + +type scheduler struct { + anomalyScheduler } var ( 
@@ -16,14 +21,32 @@ var ( _ yaml.Unmarshaler = (*scheduler)(nil) ) -// UnmarshalYAML implements yaml.Unmarshaller interface +// Validate validates raw config +func (s *scheduler) Validate(data []byte) error { + if err := yaml.Unmarshal(data, s); err != nil { + return err + } + return s.validate() +} + +// UnmarshalYAML implements yaml.Unmarshaler interface func (s *scheduler) UnmarshalYAML(unmarshal func(any) error) error { var h header if err := unmarshal(&h); err != nil { return err } - var sch validatable - switch h.Class { + if err := s.init(h.Class); err != nil { + return err + } + if err := unmarshal(s.anomalyScheduler); err != nil { + return err + } + return nil +} + +func (s *scheduler) init(class string) error { + var sch anomalyScheduler + switch class { case "scheduler.periodic.PeriodicScheduler", "periodic": sch = new(periodicScheduler) case "scheduler.oneoff.OneoffScheduler", "oneoff": @@ -31,51 +54,58 @@ func (s *scheduler) UnmarshalYAML(unmarshal func(any) error) error { case "scheduler.backtesting.BacktestingScheduler", "backtesting": sch = new(backtestingScheduler) default: - return fmt.Errorf("anomaly scheduler class=%q is not supported", h.Class) + return fmt.Errorf("anomaly scheduler class=%q is not supported", class) } - if err := unmarshal(sch); err != nil { - return err - } - s.validatable = sch + s.anomalyScheduler = sch return nil } // MarshalYAML implements yaml.Marshaler interface func (s *scheduler) MarshalYAML() (any, error) { - return s.validatable, nil + return s.anomalyScheduler, nil } -type noopScheduler struct { +type commonSchedulerParams struct { Class string `yaml:"class"` } +func (p *commonSchedulerParams) setClass(class string) { + p.Class = class +} + +type noopScheduler struct { + commonSchedulerParams `yaml:",inline"` +} + func (s *noopScheduler) validate() error { return nil } +// Docs: https://docs.victoriametrics.com/anomaly-detection/components/scheduler/#parameters-1 type periodicScheduler struct { - Class string 
`yaml:"class"` - FitEvery *duration `yaml:"fit_every,omitempty"` - FitWindow *duration `yaml:"fit_window"` - InferEvery *duration `yaml:"infer_every"` - StartFrom time.Time `yaml:"start_from,omitempty"` - Timezone time.Location `yaml:"tz,omitempty"` + commonSchedulerParams `yaml:",inline"` + FitEvery *duration `yaml:"fit_every,omitempty"` + FitWindow *duration `yaml:"fit_window"` + InferEvery *duration `yaml:"infer_every"` + StartFrom time.Time `yaml:"start_from,omitempty"` + Timezone string `yaml:"tz,omitempty"` } func (s *periodicScheduler) validate() error { return nil } +// Docs: https://docs.victoriametrics.com/anomaly-detection/components/scheduler/#parameters-2 type oneoffScheduler struct { - Class string `yaml:"class"` - InferStartISO time.Time `yaml:"infer_start_iso,omitempty"` - InferStartS int64 `yaml:"infer_start_s,omitempty"` - InferEndISO time.Time `yaml:"infer_end_iso,omitempty"` - InferEndS int64 `yaml:"infer_end_s,omitempty"` - FitStartISO time.Time `yaml:"fit_start_iso"` - FitStartS int64 `yaml:"fit_start_s"` - FitEndISO time.Time `yaml:"fit_end_iso"` - FitEndS int64 `yaml:"fit_end_s"` + commonSchedulerParams `yaml:",inline"` + InferStartISO time.Time `yaml:"infer_start_iso,omitempty"` + InferStartS int64 `yaml:"infer_start_s,omitempty"` + InferEndISO time.Time `yaml:"infer_end_iso,omitempty"` + InferEndS int64 `yaml:"infer_end_s,omitempty"` + FitStartISO time.Time `yaml:"fit_start_iso"` + FitStartS int64 `yaml:"fit_start_s"` + FitEndISO time.Time `yaml:"fit_end_iso"` + FitEndS int64 `yaml:"fit_end_s"` } func (s *oneoffScheduler) validate() error { @@ -124,16 +154,19 @@ func (s *oneoffScheduler) validate() error { return nil } +// Docs: https://docs.victoriametrics.com/anomaly-detection/components/scheduler/#parameters-3 type backtestingScheduler struct { - Class string `yaml:"class"` - FitWindow *duration `yaml:"fit_window"` - FromISO time.Time `yaml:"from_iso"` - FromS int64 `yaml:"from_s"` - ToISO time.Time `yaml:"to_iso"` - ToS int64 
`yaml:"to_s"` - FitEvery *duration `yaml:"fit_every"` - Jobs int `yaml:"n_jobs,omitempty"` - InferenceOnly bool `yaml:"inference_only,omitempty"` + commonSchedulerParams `yaml:",inline"` + FitWindow *duration `yaml:"fit_window"` + FromISO time.Time `yaml:"from_iso"` + FromS int64 `yaml:"from_s"` + ToISO time.Time `yaml:"to_iso"` + ToS int64 `yaml:"to_s"` + FitEvery *duration `yaml:"fit_every"` + Jobs int `yaml:"n_jobs,omitempty"` + InferenceOnly bool `yaml:"inference_only,omitempty"` + Exact bool `yaml:"exact,omitempty"` + InferEvery *duration `yaml:"infer_every,omitempty"` } func (s *backtestingScheduler) validate() error { diff --git a/internal/controller/operator/factory/vmanomaly/config/writers.go b/internal/controller/operator/factory/vmanomaly/config/writers.go index e77a72850..214e71f27 100644 --- a/internal/controller/operator/factory/vmanomaly/config/writers.go +++ b/internal/controller/operator/factory/vmanomaly/config/writers.go @@ -8,10 +8,11 @@ import ( // Ref: https://docs.victoriametrics.com/anomaly-detection/components/writer/#vm-writer type writer struct { - Class string `yaml:"class"` - DatasourceURL string `yaml:"datasource_url"` - MetricFormat *writerMetricFormat `yaml:"metric_format,omitempty"` - ClientConfig clientConfig `yaml:",inline"` + Class string `yaml:"class"` + DatasourceURL string `yaml:"datasource_url"` + MetricFormat *writerMetricFormat `yaml:"metric_format,omitempty"` + ConnectionRetryAttempts int `yaml:"connection_retry_attempts,omitempty"` + ClientConfig clientConfig `yaml:",inline"` } func (w *writer) validate() error { diff --git a/internal/controller/operator/factory/vmanomaly/pod.go b/internal/controller/operator/factory/vmanomaly/pod.go index 7e2d0e547..689aea242 100644 --- a/internal/controller/operator/factory/vmanomaly/pod.go +++ b/internal/controller/operator/factory/vmanomaly/pod.go @@ -18,13 +18,12 @@ import ( ) const ( - secretConfigKey = "vmanomaly.yaml" - anomalyDir = "/etc/vmanomaly" - confDir = anomalyDir + 
"/config" - confFile = confDir + "/vmanomaly.yaml" - tlsAssetsDir = anomalyDir + "/tls" - storageDir = "/storage" - configVolumeName = "config-volume" + anomalyDir = "/etc/vmanomaly" + confDir = anomalyDir + "/config" + tlsAssetsDir = anomalyDir + "/tls" + storageDir = "/storage" + configVolumeName = "config" + configEnvsubstFilename = "vmanomaly.env.yaml" ) func newPodSpec(cr *vmv1.VMAnomaly, ac *build.AssetsCache) (*corev1.PodSpec, error) { @@ -163,9 +162,8 @@ func newPodSpec(cr *vmv1.VMAnomaly, ac *build.AssetsCache) (*corev1.PodSpec, err } } } - // vmanomaly accepts configuration file as a last element of args - args = append(args, confFile) + args = append(args, path.Join(confDir, configEnvsubstFilename)) container := corev1.Container{ Args: args, diff --git a/internal/controller/operator/factory/vmanomaly/statefulset.go b/internal/controller/operator/factory/vmanomaly/statefulset.go index f279158b3..f07bdc9d4 100644 --- a/internal/controller/operator/factory/vmanomaly/statefulset.go +++ b/internal/controller/operator/factory/vmanomaly/statefulset.go @@ -31,8 +31,11 @@ func buildScrape(cr *vmv1.VMAnomaly) *vmv1beta1.VMPodScrape { return build.VMPodScrape(cr, "monitoring-http") } -// CreateOrUpdate creates vmanomalyand and builds config for it +// CreateOrUpdate creates vmanomaly and builds config for it func CreateOrUpdate(ctx context.Context, cr *vmv1.VMAnomaly, rclient client.Client) error { + if cr.Paused() { + return nil + } var prevCR *vmv1.VMAnomaly if cr.ParsedLastAppliedSpec != nil { prevCR = cr.DeepCopy() @@ -60,13 +63,7 @@ func CreateOrUpdate(ctx context.Context, cr *vmv1.VMAnomaly, rclient client.Clie } } - rcfg := map[build.ResourceKind]*build.ResourceCfg{ - build.TLSAssetsResourceKind: { - MountDir: tlsAssetsDir, - SecretName: build.ResourceName(build.TLSAssetsResourceKind, cr), - }, - } - ac := build.NewAssetsCache(ctx, rclient, rcfg) + ac := getAssetsCache(ctx, rclient, cr) configHash, err := createOrUpdateConfig(ctx, rclient, cr, prevCR, ac) if 
err != nil { return err @@ -127,13 +124,12 @@ func newK8sApp(cr *vmv1.VMAnomaly, configHash string, ac *build.AssetsCache) (*a "checksum/config": configHash, }) } - app := &appsv1.StatefulSet{ ObjectMeta: metav1.ObjectMeta{ Name: build.ShardName(cr), Namespace: cr.GetNamespace(), Labels: cr.FinalLabels(), - Annotations: cr.FinalAnnotations(), + Annotations: podAnnotations, OwnerReferences: []metav1.OwnerReference{cr.AsOwner()}, }, Spec: appsv1.StatefulSetSpec{ @@ -147,7 +143,7 @@ func newK8sApp(cr *vmv1.VMAnomaly, configHash string, ac *build.AssetsCache) (*a Template: corev1.PodTemplateSpec{ ObjectMeta: metav1.ObjectMeta{ Labels: build.ShardPodLabels(cr), - Annotations: podAnnotations, + Annotations: cr.PodAnnotations(), }, Spec: *podSpec, }, @@ -289,3 +285,13 @@ func getShard(cr *vmv1.VMAnomaly, appTpl *appsv1.StatefulSet, num int32) (*appsv patchShardContainers(app.Spec.Template.Spec.Containers, num, cr.GetShardCount()) return app, nil } + +func getAssetsCache(ctx context.Context, rclient client.Client, cr *vmv1.VMAnomaly) *build.AssetsCache { + cfg := map[build.ResourceKind]*build.ResourceCfg{ + build.TLSAssetsResourceKind: { + MountDir: tlsAssetsDir, + SecretName: build.ResourceName(build.TLSAssetsResourceKind, cr), + }, + } + return build.NewAssetsCache(ctx, rclient, cfg) +} diff --git a/internal/controller/operator/factory/vmanomaly/statefulset_test.go b/internal/controller/operator/factory/vmanomaly/statefulset_test.go index 981b69623..451ceb1f3 100644 --- a/internal/controller/operator/factory/vmanomaly/statefulset_test.go +++ b/internal/controller/operator/factory/vmanomaly/statefulset_test.go @@ -18,6 +18,7 @@ import ( vmv1 "github.com/VictoriaMetrics/operator/api/operator/v1" vmv1beta1 "github.com/VictoriaMetrics/operator/api/operator/v1beta1" + "github.com/VictoriaMetrics/operator/internal/config" "github.com/VictoriaMetrics/operator/internal/controller/operator/factory/build" 
"github.com/VictoriaMetrics/operator/internal/controller/operator/factory/k8stools" ) @@ -25,6 +26,7 @@ import ( func TestCreateOrUpdate(t *testing.T) { type opts struct { cr *vmv1.VMAnomaly + cfgMutator func(*config.BaseOperatorConf) validate func(sts *appsv1.StatefulSet, idx int) wantErr bool predefinedObjects []runtime.Object @@ -35,6 +37,14 @@ func TestCreateOrUpdate(t *testing.T) { fclient := k8stools.GetTestClientWithObjects(o.predefinedObjects) build.AddDefaults(fclient.Scheme()) fclient.Scheme().Default(o.cr) + cfg := config.MustGetBaseConfig() + if o.cfgMutator != nil { + defaultCfg := *cfg + o.cfgMutator(cfg) + defer func() { + *config.MustGetBaseConfig() = defaultCfg + }() + } err := CreateOrUpdate(ctx, o.cr, fclient) if o.wantErr { assert.Error(t, err) diff --git a/internal/controller/operator/factory/vmanomaly/vmanomaly_reconcile_test.go b/internal/controller/operator/factory/vmanomaly/vmanomaly_reconcile_test.go index e89a7a45d..50fc4c3df 100644 --- a/internal/controller/operator/factory/vmanomaly/vmanomaly_reconcile_test.go +++ b/internal/controller/operator/factory/vmanomaly/vmanomaly_reconcile_test.go @@ -5,12 +5,16 @@ import ( "testing" "github.com/stretchr/testify/assert" + appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" + k8serrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" "k8s.io/utils/ptr" vmv1 "github.com/VictoriaMetrics/operator/api/operator/v1" + vmv1beta1 "github.com/VictoriaMetrics/operator/api/operator/v1beta1" "github.com/VictoriaMetrics/operator/internal/controller/operator/factory/build" "github.com/VictoriaMetrics/operator/internal/controller/operator/factory/k8stools" ) @@ -198,7 +202,8 @@ schedulers: // StatefulSet {Verb: "Get", Kind: "StatefulSet", Resource: vmanomalyName}, {Verb: "Update", Kind: "StatefulSet", Resource: vmanomalyName}, - {Verb: "Get", Kind: "StatefulSet", Resource: vmanomalyName}, + {Verb: "Get", 
Kind: "StatefulSet", Resource: vmanomalyName}, // getLatestStsState + {Verb: "Get", Kind: "StatefulSet", Resource: vmanomalyName}, // patchSTSCurrentRevision }, }) @@ -247,7 +252,74 @@ schedulers: {Verb: "Get", Kind: "Secret", Resource: vmanomalyName}, // StatefulSet {Verb: "Get", Kind: "StatefulSet", Resource: vmanomalyName}, - {Verb: "Get", Kind: "StatefulSet", Resource: vmanomalyName}, + {Verb: "Get", Kind: "StatefulSet", Resource: vmanomalyName}, // getLatestStsState + {Verb: "Get", Kind: "StatefulSet", Resource: vmanomalyName}, // patchSTSCurrentRevision }, }) } + +func TestCreateOrUpdate_Paused(t *testing.T) { + cr := &vmv1.VMAnomaly{ + ObjectMeta: metav1.ObjectMeta{ + Name: "example-anomaly", + Namespace: "default", + }, + Spec: vmv1.VMAnomalySpec{ + ConfigRawYaml: ` +models: + M1: + class: "zscore" + z_threshold: 2.5 + queries: ["q1"] + schedulers: ["S1"] +reader: + queries: + q1: + expr: "sum(up)" +schedulers: + S1: + class: "periodic" + infer_every: "1m" +`, + Reader: &vmv1.VMAnomalyReadersSpec{ + DatasourceURL: "http://reader-url", + SamplingPeriod: "1m", + }, + Writer: &vmv1.VMAnomalyWritersSpec{ + DatasourceURL: "http://writer-url", + }, + CommonAppsParams: vmv1beta1.CommonAppsParams{ + ReplicaCount: ptr.To(int32(1)), + Paused: true, + }, + }, + } + nsn := types.NamespacedName{Namespace: cr.Namespace, Name: cr.PrefixedName()} + fclient := k8stools.GetTestClientWithObjects([]runtime.Object{cr}) + ctx := context.TODO() + build.AddDefaults(fclient.Scheme()) + fclient.Scheme().Default(cr) + + assert.NoError(t, CreateOrUpdate(ctx, cr, fclient)) + + var sts appsv1.StatefulSet + err := fclient.Get(ctx, nsn, &sts) + assert.Error(t, err) + assert.True(t, k8serrors.IsNotFound(err)) + + // unpause and verify reconciliation + cr.Spec.Paused = false + assert.NoError(t, CreateOrUpdate(ctx, cr, fclient)) + err = fclient.Get(ctx, nsn, &sts) + assert.NoError(t, err) + + // pause and update replica count + cr.Spec.Paused = true + cr.Spec.ReplicaCount = ptr.To(int32(2)) 
+ assert.NoError(t, CreateOrUpdate(ctx, cr, fclient)) + + // check that replicas count is not updated + err = fclient.Get(ctx, nsn, &sts) + assert.NoError(t, err) + assert.Equal(t, int32(1), *sts.Spec.Replicas) +} diff --git a/internal/controller/operator/factory/vmcluster/vmcluster.go b/internal/controller/operator/factory/vmcluster/vmcluster.go index c9cd88a14..a56c299f7 100644 --- a/internal/controller/operator/factory/vmcluster/vmcluster.go +++ b/internal/controller/operator/factory/vmcluster/vmcluster.go @@ -171,6 +171,7 @@ func createOrUpdateVMSelect(ctx context.Context, rclient client.Client, cr, prev PatchSpec: func(existingSpec, newSpec *appsv1.StatefulSetSpec) { if cr.Spec.VMSelect.HPA != nil { newSpec.Replicas = existingSpec.Replicas + cr.Spec.VMSelect.ReplicaCount = existingSpec.Replicas } }, } @@ -320,6 +321,7 @@ func createOrUpdateVMInsert(ctx context.Context, rclient client.Client, cr, prev PatchSpec: func(existingSpec, newSpec *appsv1.DeploymentSpec) { if cr.Spec.VMInsert.HPA != nil { newSpec.Replicas = existingSpec.Replicas + cr.Spec.VMInsert.ReplicaCount = existingSpec.Replicas } }, } @@ -431,6 +433,7 @@ func createOrUpdateVMStorage(ctx context.Context, rclient client.Client, cr, pre PatchSpec: func(existingSpec, newSpec *appsv1.StatefulSetSpec) { if cr.Spec.VMStorage.HPA != nil { newSpec.Replicas = existingSpec.Replicas + cr.Spec.VMStorage.ReplicaCount = existingSpec.Replicas } }, } @@ -586,16 +589,15 @@ func makePodSpecForVMSelect(cr *vmv1beta1.VMCluster) (*corev1.PodTemplateSpec, e } } - if cr.Spec.VMStorage != nil && cr.Spec.VMStorage.ReplicaCount != nil { - storageNodeFlag := build.NewFlag("-storageNode", "") - storageNodeIds := cr.AvailableStorageNodeIDs("select") - for idx, i := range storageNodeIds { - storageName := cr.PrefixedName(vmv1beta1.ClusterComponentStorage) - storageNodeFlag.Add(build.PodDNSAddress(storageName, i, cr.Namespace, cr.Spec.VMStorage.VMSelectPort, cr.Spec.ClusterDomainName), idx) - } - totalNodes := 
len(storageNodeIds) - args = build.AppendFlagsToArgs(args, totalNodes, storageNodeFlag) + storageNodeFlag := build.NewFlag("-storageNode", "") + storageNodeIds := cr.AvailableStorageNodeIDs(vmv1beta1.ClusterComponentSelect) + for idx, i := range storageNodeIds { + storageName := cr.PrefixedName(vmv1beta1.ClusterComponentStorage) + storageNodeFlag.Add(build.PodDNSAddress(storageName, i, cr.Namespace, cr.Spec.VMStorage.VMSelectPort, cr.Spec.ClusterDomainName), idx) } + totalNodes := len(storageNodeIds) + args = build.AppendFlagsToArgs(args, totalNodes, storageNodeFlag) + // selectNode arg add for deployments without HPA // HPA leads to rolling restart for vmselect statefulset in case of replicas count changes if cr.Spec.VMSelect.HPA == nil && cr.Spec.VMSelect.ReplicaCount != nil { @@ -792,16 +794,14 @@ func makePodSpecForVMInsert(cr *vmv1beta1.VMCluster) (*corev1.PodTemplateSpec, e args = append(args, fmt.Sprintf("--clusternativeListenAddr=:%s", cr.Spec.VMInsert.ClusterNativePort)) } - if cr.Spec.VMStorage != nil && cr.Spec.VMStorage.ReplicaCount != nil { - storageNodeFlag := build.NewFlag("-storageNode", "") - storageNodeIds := cr.AvailableStorageNodeIDs("insert") - for idx, i := range storageNodeIds { - storageName := cr.PrefixedName(vmv1beta1.ClusterComponentStorage) - storageNodeFlag.Add(build.PodDNSAddress(storageName, i, cr.Namespace, cr.Spec.VMStorage.VMInsertPort, cr.Spec.ClusterDomainName), idx) - } - totalNodes := len(storageNodeIds) - args = build.AppendFlagsToArgs(args, totalNodes, storageNodeFlag) + storageNodeFlag := build.NewFlag("-storageNode", "") + storageNodeIds := cr.AvailableStorageNodeIDs(vmv1beta1.ClusterComponentInsert) + for idx, i := range storageNodeIds { + storageName := cr.PrefixedName(vmv1beta1.ClusterComponentStorage) + storageNodeFlag.Add(build.PodDNSAddress(storageName, i, cr.Namespace, cr.Spec.VMStorage.VMInsertPort, cr.Spec.ClusterDomainName), idx) } + totalNodes := len(storageNodeIds) + args = build.AppendFlagsToArgs(args, 
totalNodes, storageNodeFlag) if cr.Spec.ReplicationFactor != nil { args = append(args, fmt.Sprintf("-replicationFactor=%d", *cr.Spec.ReplicationFactor)) diff --git a/internal/controller/operator/factory/vmcluster/vmcluster_reconcile_test.go b/internal/controller/operator/factory/vmcluster/vmcluster_reconcile_test.go index 59fbc4c47..14ad3050c 100644 --- a/internal/controller/operator/factory/vmcluster/vmcluster_reconcile_test.go +++ b/internal/controller/operator/factory/vmcluster/vmcluster_reconcile_test.go @@ -196,13 +196,15 @@ func Test_CreateOrUpdate_Actions(t *testing.T) { // VMStorage {Verb: "Get", Kind: "StatefulSet", Resource: vmstorageName}, - {Verb: "Get", Kind: "StatefulSet", Resource: vmstorageName}, // wait for ready + {Verb: "Get", Kind: "StatefulSet", Resource: vmstorageName}, // getLatestStsState + {Verb: "Get", Kind: "StatefulSet", Resource: vmstorageName}, // patchSTSCurrentRevision {Verb: "Get", Kind: "Service", Resource: vmstorageName}, {Verb: "Get", Kind: "VMServiceScrape", Resource: vmstorageName}, // VMSelect {Verb: "Get", Kind: "StatefulSet", Resource: vmselectName}, - {Verb: "Get", Kind: "StatefulSet", Resource: vmselectName}, // wait for ready + {Verb: "Get", Kind: "StatefulSet", Resource: vmselectName}, // getLatestStsState + {Verb: "Get", Kind: "StatefulSet", Resource: vmselectName}, // patchSTSCurrentRevision {Verb: "Get", Kind: "Service", Resource: vmselectName}, {Verb: "Get", Kind: "VMServiceScrape", Resource: vmselectName}, @@ -230,13 +232,15 @@ func Test_CreateOrUpdate_Actions(t *testing.T) { // VMStorage {Verb: "Get", Kind: "StatefulSet", Resource: vmstorageName}, - {Verb: "Get", Kind: "StatefulSet", Resource: vmstorageName}, // wait for ready + {Verb: "Get", Kind: "StatefulSet", Resource: vmstorageName}, // getLatestStsState + {Verb: "Get", Kind: "StatefulSet", Resource: vmstorageName}, // patchSTSCurrentRevision {Verb: "Get", Kind: "Service", Resource: vmstorageName}, {Verb: "Get", Kind: "VMServiceScrape", Resource: 
vmstorageName}, // VMSelect {Verb: "Get", Kind: "StatefulSet", Resource: vmselectName}, - {Verb: "Get", Kind: "StatefulSet", Resource: vmselectName}, // wait for ready + {Verb: "Get", Kind: "StatefulSet", Resource: vmselectName}, // getLatestStsState + {Verb: "Get", Kind: "StatefulSet", Resource: vmselectName}, // patchSTSCurrentRevision {Verb: "Get", Kind: "Service", Resource: vmselectName}, {Verb: "Get", Kind: "VMServiceScrape", Resource: vmselectName}, diff --git a/internal/controller/operator/factory/vtcluster/insert.go b/internal/controller/operator/factory/vtcluster/insert.go index 0bbc70994..c7cca090a 100644 --- a/internal/controller/operator/factory/vtcluster/insert.go +++ b/internal/controller/operator/factory/vtcluster/insert.go @@ -78,6 +78,7 @@ func createOrUpdateVTInsertDeployment(ctx context.Context, rclient client.Client PatchSpec: func(existingSpec, newSpec *appsv1.DeploymentSpec) { if cr.Spec.Insert.HPA != nil { newSpec.Replicas = existingSpec.Replicas + cr.Spec.Insert.ReplicaCount = existingSpec.Replicas } }, } @@ -136,16 +137,13 @@ func buildVTInsertPodSpec(cr *vmv1.VTCluster) (*corev1.PodTemplateSpec, error) { args = append(args, fmt.Sprintf("-loggerFormat=%s", cr.Spec.Insert.LogFormat)) } - if cr.Spec.Storage != nil && cr.Spec.Storage.ReplicaCount != nil { - // TODO: check TLS - storageNodeFlag := build.NewFlag("-storageNode", "") - storageNodeIds := cr.AvailableStorageNodeIDs("insert") - for idx, i := range storageNodeIds { - storageNodeFlag.Add(build.PodDNSAddress(cr.PrefixedName(vmv1beta1.ClusterComponentStorage), i, cr.Namespace, cr.Spec.Storage.Port, cr.Spec.ClusterDomainName), idx) - } - totalNodes := len(storageNodeIds) - args = build.AppendFlagsToArgs(args, totalNodes, storageNodeFlag) + storageNodeFlag := build.NewFlag("-storageNode", "") + storageNodeIds := cr.AvailableStorageNodeIDs(vmv1beta1.ClusterComponentInsert) + for idx, i := range storageNodeIds { + 
storageNodeFlag.Add(build.PodDNSAddress(cr.PrefixedName(vmv1beta1.ClusterComponentStorage), i, cr.Namespace, cr.Spec.Storage.Port, cr.Spec.ClusterDomainName), idx) } + totalNodes := len(storageNodeIds) + args = build.AppendFlagsToArgs(args, totalNodes, storageNodeFlag) if len(cr.Spec.Insert.ExtraEnvs) > 0 || len(cr.Spec.Insert.ExtraEnvsFrom) > 0 { args = append(args, "-envflag.enable=true") diff --git a/internal/controller/operator/factory/vtcluster/select.go b/internal/controller/operator/factory/vtcluster/select.go index 66374b132..1d926c1b3 100644 --- a/internal/controller/operator/factory/vtcluster/select.go +++ b/internal/controller/operator/factory/vtcluster/select.go @@ -189,6 +189,7 @@ func createOrUpdateVTSelectDeployment(ctx context.Context, rclient client.Client PatchSpec: func(existingSpec, newSpec *appsv1.DeploymentSpec) { if cr.Spec.Select.HPA != nil { newSpec.Replicas = existingSpec.Replicas + cr.Spec.Select.ReplicaCount = existingSpec.Replicas } }, } @@ -244,12 +245,9 @@ func buildVTSelectPodSpec(cr *vmv1.VTCluster) (*corev1.PodTemplateSpec, error) { } storageNodeFlag := build.NewFlag("-storageNode", "") - storageNodeIds := cr.AvailableStorageNodeIDs("select") - if cr.Spec.Storage != nil && cr.Spec.Storage.ReplicaCount != nil { - // TODO: check TLS - for idx, i := range storageNodeIds { - storageNodeFlag.Add(build.PodDNSAddress(cr.PrefixedName(vmv1beta1.ClusterComponentStorage), i, cr.Namespace, cr.Spec.Storage.Port, cr.Spec.ClusterDomainName), idx) - } + storageNodeIds := cr.AvailableStorageNodeIDs(vmv1beta1.ClusterComponentSelect) + for idx, i := range storageNodeIds { + storageNodeFlag.Add(build.PodDNSAddress(cr.PrefixedName(vmv1beta1.ClusterComponentStorage), i, cr.Namespace, cr.Spec.Storage.Port, cr.Spec.ClusterDomainName), idx) } if len(cr.Spec.Select.ExtraStorageNodes) > 0 { for i, node := range cr.Spec.Select.ExtraStorageNodes { diff --git a/internal/controller/operator/factory/vtcluster/storage.go 
b/internal/controller/operator/factory/vtcluster/storage.go index 9222d7a3e..594a551af 100644 --- a/internal/controller/operator/factory/vtcluster/storage.go +++ b/internal/controller/operator/factory/vtcluster/storage.go @@ -169,6 +169,7 @@ func createOrUpdateVTStorageSTS(ctx context.Context, rclient client.Client, cr, PatchSpec: func(existingSpec, newSpec *appsv1.StatefulSetSpec) { if cr.Spec.Storage.HPA != nil { newSpec.Replicas = existingSpec.Replicas + cr.Spec.Storage.ReplicaCount = existingSpec.Replicas } }, } diff --git a/internal/controller/operator/vlagent_controller.go b/internal/controller/operator/vlagent_controller.go index d495c1b3c..a507da3b4 100644 --- a/internal/controller/operator/vlagent_controller.go +++ b/internal/controller/operator/vlagent_controller.go @@ -60,6 +60,7 @@ func (r *VLAgentReconciler) Init(rclient client.Client, l logr.Logger, sc *runti // +kubebuilder:rbac:groups="",resources=serviceaccounts,verbs=get;create,update;list // +kubebuilder:rbac:groups=apps,resources=daemonsets,verbs=* // +kubebuilder:rbac:groups=apps,resources=statefulsets,verbs=* +// +kubebuilder:rbac:groups=apps,resources=statefulsets/status,verbs=get;update;patch func (r *VLAgentReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, err error) { l := r.Log.WithValues("vlagent", req.Name, "namespace", req.Namespace) ctx = logger.AddToContext(ctx, l) diff --git a/internal/controller/operator/vmagent_controller.go b/internal/controller/operator/vmagent_controller.go index eb6da2f5e..4671f4946 100644 --- a/internal/controller/operator/vmagent_controller.go +++ b/internal/controller/operator/vmagent_controller.go @@ -79,6 +79,7 @@ func (r *VMAgentReconciler) Init(rclient client.Client, l logr.Logger, sc *runti // +kubebuilder:rbac:groups="",resources=serviceaccounts,verbs=get;create,update;list // +kubebuilder:rbac:groups=apps,resources=deployments,verbs=* // +kubebuilder:rbac:groups=apps,resources=statefulsets,verbs=* +// 
+kubebuilder:rbac:groups=apps,resources=statefulsets/status,verbs=get;update;patch // +kubebuilder:rbac:groups=apps,resources=daemonsets,verbs=* func (r *VMAgentReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, err error) { l := r.Log.WithValues("vmagent", req.Name, "namespace", req.Namespace) diff --git a/internal/controller/operator/vmalertmanager_controller.go b/internal/controller/operator/vmalertmanager_controller.go index d1bd82f45..824a2e72a 100644 --- a/internal/controller/operator/vmalertmanager_controller.go +++ b/internal/controller/operator/vmalertmanager_controller.go @@ -67,6 +67,7 @@ func (r *VMAlertmanagerReconciler) Scheme() *runtime.Scheme { // +kubebuilder:rbac:groups=operator.victoriametrics.com,resources=vmalertmanagers/status,verbs=get;update;patch // +kubebuilder:rbac:groups=operator.victoriametrics.com,resources=vmalertmanagers/finalizers,verbs=* // +kubebuilder:rbac:groups=apps,resources=statefulsets,verbs=* +// +kubebuilder:rbac:groups=apps,resources=statefulsets/status,verbs=get;update;patch // +kubebuilder:rbac:groups="",resources=configmaps,verbs=* // +kubebuilder:rbac:groups="",resources=secrets,verbs=* func (r *VMAlertmanagerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, err error) { diff --git a/internal/controller/operator/vmanomaly_controller.go b/internal/controller/operator/vmanomaly_controller.go index 1c252dc47..7222755c5 100644 --- a/internal/controller/operator/vmanomaly_controller.go +++ b/internal/controller/operator/vmanomaly_controller.go @@ -56,6 +56,7 @@ func (r *VMAnomalyReconciler) Init(rclient client.Client, l logr.Logger, sc *run // +kubebuilder:rbac:groups=operator.victoriametrics.com,resources=vmanomalies/finalizers,verbs=* // +kubebuilder:rbac:groups=apps,resources=deployments,verbs=* // +kubebuilder:rbac:groups=apps,resources=statefulsets,verbs=* +// +kubebuilder:rbac:groups=apps,resources=statefulsets/status,verbs=get;update;patch // 
+kubebuilder:rbac:groups="",resources=serviceaccounts,verbs=get;create,update;list // +kubebuilder:rbac:groups="",resources=events,verbs=* // +kubebuilder:rbac:groups="",resources=pods,verbs=* diff --git a/internal/controller/operator/vmcluster_controller.go b/internal/controller/operator/vmcluster_controller.go index d471bb22c..4208cea1b 100644 --- a/internal/controller/operator/vmcluster_controller.go +++ b/internal/controller/operator/vmcluster_controller.go @@ -44,6 +44,7 @@ func (r *VMClusterReconciler) Scheme() *runtime.Scheme { // +kubebuilder:rbac:groups=operator.victoriametrics.com,resources=vmclusters/status,verbs=get;update;patch // +kubebuilder:rbac:groups=operator.victoriametrics.com,resources=vmclusters/finalizers,verbs=* // +kubebuilder:rbac:groups=apps,resources=statefulsets,verbs=* +// +kubebuilder:rbac:groups=apps,resources=statefulsets/status,verbs=get;update;patch func (r *VMClusterReconciler) Reconcile(ctx context.Context, request ctrl.Request) (result ctrl.Result, err error) { l := r.Log.WithValues("vmcluster", request.Name, "namespace", request.Namespace) ctx = logger.AddToContext(ctx, l) diff --git a/test/e2e/upgrade/upgrade_test.go b/test/e2e/upgrade/upgrade_test.go index 46b560438..5be5a63ec 100644 --- a/test/e2e/upgrade/upgrade_test.go +++ b/test/e2e/upgrade/upgrade_test.go @@ -21,11 +21,12 @@ import ( vmv1 "github.com/VictoriaMetrics/operator/api/operator/v1" vmv1alpha1 "github.com/VictoriaMetrics/operator/api/operator/v1alpha1" vmv1beta1 "github.com/VictoriaMetrics/operator/api/operator/v1beta1" + "github.com/VictoriaMetrics/operator/test/e2e" "github.com/VictoriaMetrics/operator/test/e2e/suite" ) var ( - vmanomaly = &vmv1.VMAnomaly{ + _ = &vmv1.VMAnomaly{ Spec: vmv1.VMAnomalySpec{ Reader: &vmv1.VMAnomalyReadersSpec{ DatasourceURL: "http://vmsingle-anomaly.svc:8428", @@ -59,7 +60,7 @@ var ( {URL: "http://localhost:8428/api/v1/write"}, }, CommonConfigReloaderParams: vmv1beta1.CommonConfigReloaderParams{ - ConfigReloaderImage: 
"quay.io/victoriametrics/operator:config-reloader-v0.65.0", + ConfigReloaderImage: configReloaderImage(), }, CommonAppsParams: vmv1beta1.CommonAppsParams{ ReplicaCount: ptr.To[int32](1), @@ -94,14 +95,30 @@ var ( corev1.ResourceMemory: resource.MustParse("128Mi"), }, }, - TerminationGracePeriodSeconds: ptr.To(int64(1)), - }, + TerminationGracePeriodSeconds: ptr.To(int64(1)), }, - } - vmauth = &vmv1beta1.VMAuth{ + }, +} +vlagentK8sCollector = withVersion(vlagent, func(cr *vmv1.VLAgent, version string) { + cr.Spec.K8sCollector.Enabled = true + cr.Spec.ServiceAccountName = "vlagent-collector" + tmpPath := fmt.Sprintf("/var/lib/vlagent-data-%s", version) + cr.Spec.TmpDataPath = ptr.To(tmpPath) + cr.Spec.Volumes = append(cr.Spec.Volumes, corev1.Volume{ + Name: "tmp-data", + VolumeSource: corev1.VolumeSource{ + HostPath: &corev1.HostPathVolumeSource{Path: tmpPath}, + }, + }) + cr.Spec.VolumeMounts = append(cr.Spec.VolumeMounts, corev1.VolumeMount{ + Name: "tmp-data", + MountPath: tmpPath, + }) +}) +vmauth = &vmv1beta1.VMAuth{ Spec: vmv1beta1.VMAuthSpec{ CommonConfigReloaderParams: vmv1beta1.CommonConfigReloaderParams{ - ConfigReloaderImage: "quay.io/victoriametrics/operator:config-reloader-v0.65.0", + ConfigReloaderImage: configReloaderImage(), }, CommonAppsParams: vmv1beta1.CommonAppsParams{ ReplicaCount: ptr.To[int32](1), @@ -193,7 +210,7 @@ var ( vmalert = &vmv1beta1.VMAlert{ Spec: vmv1beta1.VMAlertSpec{ CommonConfigReloaderParams: vmv1beta1.CommonConfigReloaderParams{ - ConfigReloaderImage: "quay.io/victoriametrics/operator:config-reloader-v0.65.0", + ConfigReloaderImage: configReloaderImage(), }, CommonAppsParams: vmv1beta1.CommonAppsParams{ ReplicaCount: ptr.To[int32](1), @@ -309,7 +326,7 @@ var ( vmalertmanager = &vmv1beta1.VMAlertmanager{ Spec: vmv1beta1.VMAlertmanagerSpec{ CommonConfigReloaderParams: vmv1beta1.CommonConfigReloaderParams{ - ConfigReloaderImage: "quay.io/victoriametrics/operator:config-reloader-v0.65.0", + ConfigReloaderImage: 
configReloaderImage(), }, CommonAppsParams: vmv1beta1.CommonAppsParams{ ReplicaCount: ptr.To[int32](1), @@ -409,12 +426,24 @@ type object[T any] interface { DeepCopy() T } -func with[T object[T]](cr T, opts ...func(T)) T { - obj := cr.DeepCopy() - for _, o := range opts { - o(obj) +func with[T object[T]](cr T, opts ...func(T)) func(string) client.Object { + return func(_ string) client.Object { + obj := cr.DeepCopy() + for _, o := range opts { + o(obj) + } + return any(obj).(client.Object) + } +} + +func withVersion[T object[T]](cr T, opts ...func(T, string)) func(string) client.Object { + return func(version string) client.Object { + obj := cr.DeepCopy() + for _, o := range opts { + o(obj, version) + } + return any(obj).(client.Object) } - return obj } var ( @@ -424,8 +453,9 @@ var ( ) type crVersionPair struct { - version string - cr client.Object + version string + cr func(string) client.Object + isEnterprise bool } type entry struct { @@ -439,14 +469,18 @@ func entries(es []entry) []TableEntry { var result []TableEntry for _, e := range es { for _, p := range e.pairs { - obj := p.cr.DeepCopyObject().(client.Object) - result = append(result, Entry(fmt.Sprintf("from %s: %s (%T)", p.version, e.name, obj), p.version, e.genDeps, []client.Object{obj}, e.envs)) + obj := p.cr(p.version) + result = append(result, Entry(fmt.Sprintf("from %s: %s (%T)", p.version, e.name, obj), p.version, e.genDeps, []client.Object{obj}, e.envs, p.isEnterprise)) } } return result } -func ensureNoPodRollout(version string, genDeps func(string) []client.Object, objs []client.Object, envs map[string]string) { +func ensureNoPodRollout(version string, genDeps func(string) []client.Object, objs []client.Object, envs map[string]string, isEnterprise bool) { + if isEnterprise && e2e.LICENSE_KEY == "" { + Skip("skipping enterprise test: LICENSE_KEY is not set") + } + namespace := createRandomNamespace(ctx, k8sClient) previousOperatorImage := fmt.Sprintf("quay.io/%s:%s", operatorImageBase, version) 
@@ -652,10 +686,16 @@ var _ = Describe("operator upgrade", Label("upgrade"), func() { cr.Spec.StatefulMode = true })}, {version: "v0.68.4", cr: with(vlagent)}, - {version: "v0.68.4", cr: with(vlagent, func(cr *vmv1.VLAgent) { - cr.Spec.K8sCollector.Enabled = true - cr.Spec.ServiceAccountName = "vlagent-collector" + {version: "v0.68.4", cr: vlagentK8sCollector}, + {version: "v0.68.5", cr: with(vmagent)}, + {version: "v0.68.5", cr: with(vmagent, func(cr *vmv1beta1.VMAgent) { + cr.Spec.DaemonSetMode = true })}, + {version: "v0.68.5", cr: with(vmagent, func(cr *vmv1beta1.VMAgent) { + cr.Spec.StatefulMode = true + })}, + {version: "v0.68.5", cr: with(vlagent)}, + {version: "v0.68.5", cr: vlagentK8sCollector}, }, }, // nolint:dupl @@ -663,33 +703,31 @@ var _ = Describe("operator upgrade", Label("upgrade"), func() { name: "VMAlert/VMAuth/VMAlertmanager/VMAnomaly", genDeps: func(ns string) []client.Object { return []client.Object{ - with(vmsingle, func(cr *vmv1beta1.VMSingle) { - cr.Name = "anomaly" - cr.Namespace = ns - }), + with(vmsingle, func(cr *vmv1beta1.VMSingle) { + cr.Name = "anomaly" + cr.Namespace = ns + })("latest"), } }, pairs: []crVersionPair{ {version: "v0.68.0", cr: with(vmalert)}, {version: "v0.68.0", cr: with(vmauth)}, {version: "v0.68.0", cr: with(vmalertmanager)}, - {version: "v0.68.0", cr: with(vmanomaly)}, {version: "v0.68.1", cr: with(vmalert)}, {version: "v0.68.1", cr: with(vmauth)}, {version: "v0.68.1", cr: with(vmalertmanager)}, - {version: "v0.68.1", cr: with(vmanomaly)}, {version: "v0.68.2", cr: with(vmalert)}, {version: "v0.68.2", cr: with(vmauth)}, {version: "v0.68.2", cr: with(vmalertmanager)}, - {version: "v0.68.2", cr: with(vmanomaly)}, {version: "v0.68.3", cr: with(vmalert)}, {version: "v0.68.3", cr: with(vmauth)}, {version: "v0.68.3", cr: with(vmalertmanager)}, - {version: "v0.68.3", cr: with(vmanomaly)}, {version: "v0.68.4", cr: with(vmalert)}, {version: "v0.68.4", cr: with(vmauth)}, {version: "v0.68.4", cr: with(vmalertmanager)}, - 
{version: "v0.68.4", cr: with(vmanomaly)}, + {version: "v0.68.5", cr: with(vmalert)}, + {version: "v0.68.5", cr: with(vmauth)}, + {version: "v0.68.5", cr: with(vmalertmanager)}, }, }, // nolint:dupl @@ -717,6 +755,9 @@ var _ = Describe("operator upgrade", Label("upgrade"), func() { {version: "v0.68.4", cr: with(vmsingle)}, {version: "v0.68.4", cr: with(vtsingle)}, {version: "v0.68.4", cr: with(vlsingle)}, + {version: "v0.68.5", cr: with(vmsingle)}, + {version: "v0.68.5", cr: with(vtsingle)}, + {version: "v0.68.5", cr: with(vlsingle)}, }, }, // nolint:dupl @@ -751,6 +792,10 @@ var _ = Describe("operator upgrade", Label("upgrade"), func() { {version: "v0.68.4", cr: with(vlcluster, func(cr *vmv1.VLCluster) { cr.Spec.RequestsLoadBalancer.Enabled = true })}, + {version: "v0.68.5", cr: with(vlcluster)}, + {version: "v0.68.5", cr: with(vlcluster, func(cr *vmv1.VLCluster) { + cr.Spec.RequestsLoadBalancer.Enabled = true + })}, }, }, // nolint:dupl @@ -785,6 +830,10 @@ var _ = Describe("operator upgrade", Label("upgrade"), func() { {version: "v0.68.4", cr: with(vtcluster, func(cr *vmv1.VTCluster) { cr.Spec.RequestsLoadBalancer.Enabled = true })}, + {version: "v0.68.5", cr: with(vtcluster)}, + {version: "v0.68.5", cr: with(vtcluster, func(cr *vmv1.VTCluster) { + cr.Spec.RequestsLoadBalancer.Enabled = true + })}, }, }, // nolint:dupl @@ -798,13 +847,14 @@ var _ = Describe("operator upgrade", Label("upgrade"), func() { {version: "v0.68.2", cr: with(vmcluster)}, {version: "v0.68.3", cr: with(vmcluster)}, {version: "v0.68.4", cr: with(vmcluster)}, + {version: "v0.68.5", cr: with(vmcluster)}, }, }, // nolint:dupl { name: "VMCluster with VMBackup", pairs: []crVersionPair{ - {version: "v0.65.0", cr: with(vmcluster, func(cr *vmv1beta1.VMCluster) { + {version: "v0.65.0", isEnterprise: true, cr: with(vmcluster, func(cr *vmv1beta1.VMCluster) { cr.Spec.RequestsLoadBalancer.Enabled = true cr.Spec.VMStorage.Image.Tag = "v1.136.0-enterprise-cluster" cr.Spec.VMSelect.Image.Tag = 
"v1.136.0-enterprise-cluster" @@ -827,7 +877,7 @@ var _ = Describe("operator upgrade", Label("upgrade"), func() { }, } })}, - {version: "v0.68.3", cr: with(vmcluster, func(cr *vmv1beta1.VMCluster) { + {version: "v0.68.3", isEnterprise: true, cr: with(vmcluster, func(cr *vmv1beta1.VMCluster) { cr.Spec.RequestsLoadBalancer.Enabled = true cr.Spec.VMStorage.Image.Tag = "v1.136.0-enterprise-cluster" cr.Spec.VMSelect.Image.Tag = "v1.136.0-enterprise-cluster" @@ -850,7 +900,7 @@ var _ = Describe("operator upgrade", Label("upgrade"), func() { }, } })}, - {version: "v0.68.4", cr: with(vmcluster, func(cr *vmv1beta1.VMCluster) { + {version: "v0.68.4", isEnterprise: true, cr: with(vmcluster, func(cr *vmv1beta1.VMCluster) { cr.Spec.RequestsLoadBalancer.Enabled = true cr.Spec.VMStorage.Image.Tag = "v1.136.0-enterprise-cluster" cr.Spec.VMSelect.Image.Tag = "v1.136.0-enterprise-cluster" @@ -896,6 +946,29 @@ var _ = Describe("operator upgrade", Label("upgrade"), func() { }, } })}, + {version: "v0.68.5", isEnterprise: true, cr: with(vmcluster, func(cr *vmv1beta1.VMCluster) { + cr.Spec.RequestsLoadBalancer.Enabled = true + cr.Spec.VMStorage.Image.Tag = "v1.136.0-enterprise-cluster" + cr.Spec.VMSelect.Image.Tag = "v1.136.0-enterprise-cluster" + cr.Spec.VMInsert.Image.Tag = "v1.136.0-enterprise-cluster" + cr.Spec.RequestsLoadBalancer.Spec.Image.Tag = "v1.136.0-enterprise" + cr.Spec.VMStorage.VMBackup = &vmv1beta1.VMBackup{ + Destination: "fs:///tmp", + DestinationDisableSuffixAdd: true, + Image: vmv1beta1.Image{ + Tag: "v1.136.0-enterprise", + }, + AcceptEULA: true, + } + cr.Spec.License = &vmv1beta1.License{ + KeyRef: &corev1.SecretKeySelector{ + LocalObjectReference: corev1.LocalObjectReference{ + Name: "license", + }, + Key: "key", + }, + } + })}, }, envs: map[string]string{ "VM_LOOPBACK": "localhost", @@ -905,11 +978,7 @@ var _ = Describe("operator upgrade", Label("upgrade"), func() { { name: "VMDistributed", pairs: []crVersionPair{ - {version: "v0.68.0", cr: with(vmdistributed)}, 
- {version: "v0.68.1", cr: with(vmdistributed)}, - {version: "v0.68.2", cr: with(vmdistributed)}, - {version: "v0.68.3", cr: with(vmdistributed)}, - {version: "v0.68.4", cr: with(vmdistributed)}, + {version: "v0.68.5", cr: with(vmdistributed)}, }, }, })) diff --git a/test/e2e/upgrade/utils.go b/test/e2e/upgrade/utils.go index 22654ea6b..a7c6304b4 100644 --- a/test/e2e/upgrade/utils.go +++ b/test/e2e/upgrade/utils.go @@ -4,7 +4,9 @@ import ( "context" "fmt" "maps" + "os" "os/exec" + "strings" "time" "github.com/google/go-cmp/cmp" //nolint:staticcheck @@ -31,6 +33,18 @@ const ( operatorImageBase = "victoriametrics/operator" ) +func configReloaderImage() string { + if image := os.Getenv("CONFIG_RELOADER_IMAGE"); image != "" { + return image + } + image := os.Getenv("OPERATOR_IMAGE") + tagIdx := strings.LastIndex(image, ":") + if tagIdx > strings.LastIndex(image, "/") { + return image[:tagIdx+1] + "config-reloader-" + image[tagIdx+1:] + } + return "" +} + // operatorEnvVars builds the env var list for the operator pod, // TODO[vrutkovs]: do we need to copy it? 
func operatorEnvVars(watchNamespace string, extraEnvs map[string]string) []corev1.EnvVar { @@ -41,7 +55,7 @@ func operatorEnvVars(watchNamespace string, extraEnvs map[string]string) []corev "VM_PODWAITREADYTIMEOUT": "20s", "VM_PODWAITREADYINTERVALCHECK": "1s", "VM_APPREADYTIMEOUT": "50s", - "VM_CONFIG_RELOADER_IMAGE": "quay.io/victoriametrics/operator:config-reloader-v0.68.3", + "VM_CONFIG_RELOADER_IMAGE": configReloaderImage(), "WATCH_NAMESPACE": watchNamespace, "VM_CONTAINERREGISTRY": "quay.io", } @@ -69,6 +83,7 @@ func operatorEnvVars(watchNamespace string, extraEnvs map[string]string) []corev "MEM": "20Mi", } for _, prefix := range resourceEnvsPrefixes { + envs[fmt.Sprintf("VM_%s_USEDEFAULTRESOURCES", prefix)] = "true" for _, t := range []string{"LIMIT", "REQUEST"} { for rn, rv := range resources { envName := fmt.Sprintf("VM_%s_RESOURCE_%s_%s", prefix, t, rn) @@ -101,7 +116,9 @@ func updateOperator(ctx context.Context, k8sClient client.Client, operatorImage, var dep appsv1.Deployment if err := k8sClient.Get(ctx, nsn, &dep); err == nil { - dep.Spec.Template.Spec.Containers[0].Image = operatorImage + container := &dep.Spec.Template.Spec.Containers[0] + container.Image = operatorImage + container.Env = operatorEnvVars(watchNamespace, envs) Expect(k8sClient.Update(ctx, &dep)).ToNot(HaveOccurred()) } else { By("creating ServiceAccount for operator") diff --git a/test/e2e/vmdistributed_test.go b/test/e2e/vmdistributed_test.go index 6ad75f1c2..5acaa7811 100644 --- a/test/e2e/vmdistributed_test.go +++ b/test/e2e/vmdistributed_test.go @@ -404,81 +404,6 @@ var _ = Describe("e2e VMDistributed", Label("vm", "vmdistributed"), func() { Expect(ownerRef.Name).To(Equal(cr.Name)) }) - It("should successfully create a VMDistributed with VMCluster references and override spec", func() { - By("creating an initial VMClusters") - nsn.Name = "vmd-override-clusters" - zonesCount := 2 - zs := make([]vmv1alpha1.VMDistributedZone, zonesCount) - vmClusters := make([]*vmv1beta1.VMCluster, 
zonesCount) - vmAgents := make([]*vmv1beta1.VMAgent, zonesCount) - vmClusterFn := []func(*vmv1beta1.VMClusterSpec){ - func(s *vmv1beta1.VMClusterSpec) { - s.RetentionPeriod = "1w" - }, - func(s *vmv1beta1.VMClusterSpec) { - s.RetentionPeriod = "2w" - }, - } - for i := range zs { - objMeta := metav1.ObjectMeta{ - Namespace: namespace, - Name: fmt.Sprintf("%s-%d", nsn.Name, i+1), - } - zs[i].Name = objMeta.Name - zs[i].VMCluster.Spec = genVMClusterSpec(vmClusterFn[i]) - zs[i].VMCluster.Name = objMeta.Name - zs[i].VMAgent.Name = objMeta.Name - vmClusters[i] = &vmv1beta1.VMCluster{ - ObjectMeta: objMeta, - Spec: genVMClusterSpec(), - } - vmAgents[i] = &vmv1beta1.VMAgent{ - ObjectMeta: objMeta, - Spec: genVMAgentSpec(), - } - } - - var wg sync.WaitGroup - createVMClusters(ctx, &wg, k8sClient, vmClusters...) - createVMAgents(ctx, &wg, k8sClient, vmAgents...) - createVMAuth(ctx, &wg, k8sClient, nsn.Name, namespace) - wg.Wait() - - By("creating a VMDistributed referencing the existing VMCluster with an override spec") - cr := &vmv1alpha1.VMDistributed{ - ObjectMeta: metav1.ObjectMeta{ - Namespace: namespace, - Name: nsn.Name, - }, - Spec: vmv1alpha1.VMDistributedSpec{ - ZoneCommon: vmv1alpha1.VMDistributedZoneCommon{ - ReadyTimeout: &metav1.Duration{Duration: 2 * time.Minute}, - UpdatePause: &metav1.Duration{Duration: 1 * time.Second}, - }, - VMAuth: vmv1alpha1.VMDistributedAuth{Name: nsn.Name}, - Zones: zs, - }, - } - DeferCleanup(func() { - Expect(finalize.SafeDelete(ctx, k8sClient, cr)).ToNot(HaveOccurred()) - }) - Expect(k8sClient.Create(ctx, cr)).ToNot(HaveOccurred()) - - By("waiting for VMDistributed to become operational") - Eventually(func() error { - return expectObjectStatusOperational(ctx, k8sClient, &vmv1alpha1.VMDistributed{}, nsn) - }, eventualDistributedExpandingTimeout).WithContext(ctx).ShouldNot(HaveOccurred()) - verifyOwnerReferences(ctx, cr, vmClusters, namespace) - - By("verifying that the referenced VMClusters have the override applied") - var 
updatedCluster1, updatedCluster2 vmv1beta1.VMCluster - Expect(k8sClient.Get(ctx, types.NamespacedName{Name: vmClusters[0].Name, Namespace: namespace}, &updatedCluster1)).ToNot(HaveOccurred()) - Expect(updatedCluster1.Spec.RetentionPeriod).To(Equal("1w")) - - Expect(k8sClient.Get(ctx, types.NamespacedName{Name: vmClusters[1].Name, Namespace: namespace}, &updatedCluster2)).ToNot(HaveOccurred()) - Expect(updatedCluster2.Spec.RetentionPeriod).To(Equal("2w")) - }) - It("should apply global overrides before cluster-specific overrides", func() { By("creating initial VMClusters") nsn.Name = "vmd-global-override"