From ea6f6552c6bd365773f6d046b314e7fe51e679e8 Mon Sep 17 00:00:00 2001 From: SungJin1212 Date: Thu, 21 May 2026 15:38:13 +0900 Subject: [PATCH 1/3] add continue on error in user discovery in regex resolver Signed-off-by: SungJin1212 --- pkg/querier/tenantfederation/regex_resolver.go | 1 + 1 file changed, 1 insertion(+) diff --git a/pkg/querier/tenantfederation/regex_resolver.go b/pkg/querier/tenantfederation/regex_resolver.go index 64eee70fa8..9fb2bc33ae 100644 --- a/pkg/querier/tenantfederation/regex_resolver.go +++ b/pkg/querier/tenantfederation/regex_resolver.go @@ -125,6 +125,7 @@ func (r *RegexResolver) running(ctx context.Context) error { active, deleting, _, err := r.userScanner.ScanUsers(ctx) if err != nil { level.Error(r.logger).Log("msg", "failed to discover users from bucket", "err", err) + continue } newUsers := append(active, deleting...) From fa8d29744eaf9101489cecacb102ac8d1788e10d Mon Sep 17 00:00:00 2001 From: SungJin1212 Date: Thu, 21 May 2026 15:40:02 +0900 Subject: [PATCH 2/3] changelog Signed-off-by: SungJin1212 --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9d5e891243..c539eb4973 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -38,6 +38,7 @@ * [BUGFIX] Compactor: Fix stale `cortex_bucket_index_last_successful_update_timestamp_seconds` metric not being cleaned up when tenant ownership changes due to ring rebalancing. This caused false alarms on bucket index update rate when a tenant moved between compactors. #7485 * [BUGFIX] Security: Fix stored XSS vulnerability in Alertmanager and Store Gateway status pages by replacing `text/template` with `html/template`. #7512 * [BUGFIX] Security: Limit decompressed gzip output in `ParseProtoReader` and OTLP ingestion path. The decompressed body is now capped by `-distributor.otlp-max-recv-msg-size`. #7515 +* [BUGFIX] Tenant Federation: Fix regex resolver clearing known users list when user scan fails. #7534 ## 1.21.0 2026-04-24 From e53b021bd4611f1660a1939db0812fd72cf7e9fb Mon Sep 17 00:00:00 2001 From: SungJin1212 Date: Thu, 21 May 2026 21:54:12 +0900 Subject: [PATCH 3/3] add unit test Signed-off-by: SungJin1212 --- .../tenantfederation/regex_resolver_test.go | 114 ++++++++++++++++++ 1 file changed, 114 insertions(+) diff --git a/pkg/querier/tenantfederation/regex_resolver_test.go b/pkg/querier/tenantfederation/regex_resolver_test.go index f8d08d67e0..92285e835e 100644 --- a/pkg/querier/tenantfederation/regex_resolver_test.go +++ b/pkg/querier/tenantfederation/regex_resolver_test.go @@ -5,6 +5,7 @@ import ( "errors" "fmt" "strings" + "sync" "testing" "time" @@ -417,6 +418,119 @@ func Test_RegexResolver_CacheNotPurgedWhenUsersUnchanged(t *testing.T) { require.Equal(t, 1, cacheLen, "cache should not be purged when the set of users has not changed") } +// toggleableScanner wraps a real scanner and can be flipped into a failure mode +// to simulate transient bucket-scan errors observed by RegexResolver.running(). +type toggleableScanner struct { + inner users.Scanner + + mu sync.Mutex + failing bool + callsErr int +} + +func (t *toggleableScanner) setFailing(v bool) { + t.mu.Lock() + defer t.mu.Unlock() + t.failing = v +} + +func (t *toggleableScanner) failedCalls() int { + t.mu.Lock() + defer t.mu.Unlock() + return t.callsErr +} + +func (t *toggleableScanner) ScanUsers(ctx context.Context) ([]string, []string, []string, error) { + t.mu.Lock() + failing := t.failing + if failing { + t.callsErr++ + } + t.mu.Unlock() + + if failing { + return nil, nil, nil, errors.New("simulated scan failure") + } + return t.inner.ScanUsers(ctx) +} + +func Test_RegexResolver_ScanFailurePreservesState(t *testing.T) { + reg := prometheus.NewRegistry() + existingTenants := []string{"user-1", "user-2"} + + bucketClient := &bucket.ClientMock{} + bucketClient.MockIter("", existingTenants, nil) + bucketClient.MockIter("__markers__", []string{}, nil) + for _, tenant := range existingTenants { + bucketClient.MockExists(users.GetGlobalDeletionMarkPath(tenant), false, nil) + bucketClient.MockExists(users.GetLocalDeletionMarkPath(tenant), false, nil) + } + + bucketClientFactory := func(ctx context.Context) (objstore.InstrumentedBucket, error) { + return bucketClient, nil + } + + scannerCfg := users.UsersScannerConfig{Strategy: users.UserScanStrategyList} + cfg := Config{UserSyncInterval: 100 * time.Millisecond, MaxTenant: 0, RegexCacheSize: 10} + + regexResolver, err := NewRegexResolver(scannerCfg, cfg, reg, bucketClientFactory, log.NewNopLogger()) + require.NoError(t, err) + + // Swap the scanner BEFORE starting the service so there's no data race with running(). + toggle := &toggleableScanner{inner: regexResolver.userScanner} + regexResolver.userScanner = toggle + + require.NoError(t, services.StartAndAwaitRunning(context.Background(), regexResolver)) + defer services.StopAndAwaitTerminated(context.Background(), regexResolver) //nolint:errcheck + + // Wait for the first successful sync to populate knownUsers + metrics. + test.Poll(t, 10*time.Second, true, func() any { + return testutil.ToFloat64(regexResolver.lastUpdateUserRun) > 0 && + testutil.ToFloat64(regexResolver.discoveredUsers) == float64(len(existingTenants)) + }) + + // Populate the matched cache. + ctx := user.InjectOrgID(context.Background(), "user-.+") + orgIDs, err := regexResolver.TenantIDs(ctx) + require.NoError(t, err) + require.Equal(t, []string{"user-1", "user-2"}, orgIDs) + require.Equal(t, 1, regexResolver.matchedCache.Len()) + + // Flip the scanner into failure mode, then wait until at least one failed + // scan has actually been observed by the loop. Snapshotting the metrics + // *after* this point avoids a race where a successful tick could fire + // between the snapshot and the toggle flip. + toggle.setFailing(true) + test.Poll(t, 10*time.Second, true, func() any { + return toggle.failedCalls() >= 1 + }) + lastRunAtFirstFailure := testutil.ToFloat64(regexResolver.lastUpdateUserRun) + discoveredAtFirstFailure := testutil.ToFloat64(regexResolver.discoveredUsers) + + // Wait for additional failed scans to confirm the loop iterated past the + // failure point multiple times. + test.Poll(t, 10*time.Second, true, func() any { + return toggle.failedCalls() >= 3 + }) + + // The fix under test: knownUsers and the matched cache must be preserved, + // and success-only metrics must not be updated. + regexResolver.RLock() + require.Equal(t, existingTenants, regexResolver.knownUsers, "knownUsers must be preserved on scan failure") + require.Equal(t, 1, regexResolver.matchedCache.Len(), "matched cache must not be purged on scan failure") + regexResolver.RUnlock() + + require.Equal(t, lastRunAtFirstFailure, testutil.ToFloat64(regexResolver.lastUpdateUserRun), + "lastUpdateUserRun must not advance on scan failure") + require.Equal(t, discoveredAtFirstFailure, testutil.ToFloat64(regexResolver.discoveredUsers), + "discoveredUsers must not change on scan failure") + + // Repeating the query must still resolve from cached state. + orgIDs, err = regexResolver.TenantIDs(ctx) + require.NoError(t, err) + require.Equal(t, []string{"user-1", "user-2"}, orgIDs) +} + func BenchmarkRegexResolver_TenantIDs(b *testing.B) { numUsers := 1000 existingTenants := make([]string, numUsers)