Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
* [BUGFIX] Compactor: Fix stale `cortex_bucket_index_last_successful_update_timestamp_seconds` metric not being cleaned up when tenant ownership changes due to ring rebalancing. This caused false alarms on bucket index update rate when a tenant moved between compactors. #7485
* [BUGFIX] Security: Fix stored XSS vulnerability in Alertmanager and Store Gateway status pages by replacing `text/template` with `html/template`. #7512
* [BUGFIX] Security: Limit decompressed gzip output in `ParseProtoReader` and OTLP ingestion path. The decompressed body is now capped by `-distributor.otlp-max-recv-msg-size`. #7515
* [BUGFIX] Tenant Federation: Fix regex resolver clearing known users list when user scan fails. #7534

## 1.21.0 2026-04-24

Expand Down
1 change: 1 addition & 0 deletions pkg/querier/tenantfederation/regex_resolver.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ func (r *RegexResolver) running(ctx context.Context) error {
active, deleting, _, err := r.userScanner.ScanUsers(ctx)
if err != nil {
level.Error(r.logger).Log("msg", "failed to discover users from bucket", "err", err)
continue
}
Comment thread
friedrichg marked this conversation as resolved.

newUsers := append(active, deleting...)
Expand Down
114 changes: 114 additions & 0 deletions pkg/querier/tenantfederation/regex_resolver_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"errors"
"fmt"
"strings"
"sync"
"testing"
"time"

Expand Down Expand Up @@ -417,6 +418,119 @@ func Test_RegexResolver_CacheNotPurgedWhenUsersUnchanged(t *testing.T) {
require.Equal(t, 1, cacheLen, "cache should not be purged when the set of users has not changed")
}

// toggleableScanner wraps a real scanner and can be flipped into a failure mode
// to simulate transient bucket-scan errors observed by RegexResolver.running().
//
// The failure flag and the failure counter are only accessed while holding mu.
type toggleableScanner struct {
// inner is the wrapped scanner that ScanUsers delegates to when the
// failure mode is off.
inner users.Scanner

// mu guards failing and callsErr.
mu sync.Mutex
// failing, when true, makes ScanUsers return a simulated error instead of
// delegating to inner.
failing bool
// callsErr counts the ScanUsers calls that returned the simulated error.
callsErr int
}

// setFailing turns the simulated failure mode on (v == true) or off
// (v == false). The new mode applies to ScanUsers calls that read the flag
// after this method returns.
func (t *toggleableScanner) setFailing(v bool) {
	t.mu.Lock()
	t.failing = v
	t.mu.Unlock()
}

// failedCalls reports how many ScanUsers calls have returned the simulated
// error so far. The counter is read while holding the mutex.
func (t *toggleableScanner) failedCalls() int {
	t.mu.Lock()
	count := t.callsErr
	t.mu.Unlock()
	return count
}

// ScanUsers delegates to the wrapped scanner unless the failure mode is on, in
// which case it counts the call and returns a simulated error with nil result
// slices.
//
// The flag is read (and the counter bumped) under the mutex, but the mutex is
// released before the wrapped scanner is invoked.
func (t *toggleableScanner) ScanUsers(ctx context.Context) ([]string, []string, []string, error) {
	simulateFailure := func() bool {
		t.mu.Lock()
		defer t.mu.Unlock()

		if !t.failing {
			return false
		}
		t.callsErr++
		return true
	}()

	if !simulateFailure {
		return t.inner.ScanUsers(ctx)
	}
	return nil, nil, nil, errors.New("simulated scan failure")
}

// Test_RegexResolver_ScanFailurePreservesState checks that when the periodic
// user scan starts failing, the resolver keeps the state from its last
// successful scan: knownUsers and the matched cache are left intact, the
// success-only metrics stop advancing, and a repeated query still resolves to
// the previously discovered tenants.
func Test_RegexResolver_ScanFailurePreservesState(t *testing.T) {
	reg := prometheus.NewRegistry()
	existingTenants := []string{"user-1", "user-2"}

	bucketClient := &bucket.ClientMock{}
	bucketClient.MockIter("", existingTenants, nil)
	bucketClient.MockIter("__markers__", []string{}, nil)
	for _, tenant := range existingTenants {
		bucketClient.MockExists(users.GetGlobalDeletionMarkPath(tenant), false, nil)
		bucketClient.MockExists(users.GetLocalDeletionMarkPath(tenant), false, nil)
	}

	bucketClientFactory := func(ctx context.Context) (objstore.InstrumentedBucket, error) {
		return bucketClient, nil
	}

	scannerCfg := users.UsersScannerConfig{Strategy: users.UserScanStrategyList}
	cfg := Config{UserSyncInterval: 100 * time.Millisecond, MaxTenant: 0, RegexCacheSize: 10}

	regexResolver, err := NewRegexResolver(scannerCfg, cfg, reg, bucketClientFactory, log.NewNopLogger())
	require.NoError(t, err)

	// Swap the scanner BEFORE starting the service so there's no data race with running().
	toggle := &toggleableScanner{inner: regexResolver.userScanner}
	regexResolver.userScanner = toggle

	require.NoError(t, services.StartAndAwaitRunning(context.Background(), regexResolver))
	defer services.StopAndAwaitTerminated(context.Background(), regexResolver) //nolint:errcheck

	// Wait for the first successful sync to populate knownUsers + metrics.
	test.Poll(t, 10*time.Second, true, func() any {
		return testutil.ToFloat64(regexResolver.lastUpdateUserRun) > 0 &&
			testutil.ToFloat64(regexResolver.discoveredUsers) == float64(len(existingTenants))
	})

	// Populate the matched cache.
	ctx := user.InjectOrgID(context.Background(), "user-.+")
	orgIDs, err := regexResolver.TenantIDs(ctx)
	require.NoError(t, err)
	require.Equal(t, []string{"user-1", "user-2"}, orgIDs)
	require.Equal(t, 1, regexResolver.matchedCache.Len())

	// Flip the scanner into failure mode, then wait until at least one failed
	// scan has actually been observed by the loop. Snapshotting the metrics
	// *after* this point avoids a race where a successful tick could fire
	// between the snapshot and the toggle flip.
	toggle.setFailing(true)
	test.Poll(t, 10*time.Second, true, func() any {
		return toggle.failedCalls() >= 1
	})
	lastRunAtFirstFailure := testutil.ToFloat64(regexResolver.lastUpdateUserRun)
	discoveredAtFirstFailure := testutil.ToFloat64(regexResolver.discoveredUsers)

	// Wait for additional failed scans to confirm the loop iterated past the
	// failure point multiple times.
	test.Poll(t, 10*time.Second, true, func() any {
		return toggle.failedCalls() >= 3
	})

	// Snapshot the lock-guarded state and release the read lock BEFORE
	// asserting. A failing require.* aborts the test goroutine via
	// runtime.Goexit; if that happened while the read lock was still held, the
	// deferred StopAndAwaitTerminated could wait forever on a sync loop that is
	// blocked acquiring the write lock, turning a test failure into a hang.
	regexResolver.RLock()
	knownUsers := append([]string(nil), regexResolver.knownUsers...)
	cacheLen := regexResolver.matchedCache.Len()
	regexResolver.RUnlock()

	// The fix under test: knownUsers and the matched cache must be preserved,
	// and success-only metrics must not be updated.
	require.Equal(t, existingTenants, knownUsers, "knownUsers must be preserved on scan failure")
	require.Equal(t, 1, cacheLen, "matched cache must not be purged on scan failure")

	require.Equal(t, lastRunAtFirstFailure, testutil.ToFloat64(regexResolver.lastUpdateUserRun),
		"lastUpdateUserRun must not advance on scan failure")
	require.Equal(t, discoveredAtFirstFailure, testutil.ToFloat64(regexResolver.discoveredUsers),
		"discoveredUsers must not change on scan failure")

	// Repeating the query must still resolve from cached state.
	orgIDs, err = regexResolver.TenantIDs(ctx)
	require.NoError(t, err)
	require.Equal(t, []string{"user-1", "user-2"}, orgIDs)
}

func BenchmarkRegexResolver_TenantIDs(b *testing.B) {
numUsers := 1000
existingTenants := make([]string, numUsers)
Expand Down
Loading