Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,15 @@ All notable changes to this project will be documented in this file.

## Unreleased

- e2e/qa: remove client-side capacity pre-filtering from `ValidDevices`, because the QA user pubkey bypasses capacity limits via the serviceability global-config qa-allowlist. Individual device failures no longer fail the test immediately; instead, failure rates are evaluated after all batches, and the test fails only if the overall device failure rate exceeds `--failure-threshold` (default 10%) or any single host's failure rate exceeds `--per-host-failure-threshold` (default 20%).

## [v0.24.0](https://github.com/malbeclabs/doublezero/compare/client/v0.23.0...client/v0.24.0) - 2026-05-22

### Breaking

### Changes


- Smartcontract
- Deprecate the 13 contributor-side program instructions whose only client was the now-deleted activator: `ActivateDevice` (21), `RejectDevice` (22), `CloseAccountDevice` (27), `ActivateLink` (29), `RejectLink` (30), `CloseAccountLink` (35), `ActivateMulticastGroup` (47), `RejectMulticastGroup` (48), `DeactivateMulticastGroup` (53), `ActivateDeviceInterface` (72), `RemoveDeviceInterface` (75), `UnlinkDeviceInterface` (77), and `RejectDeviceInterface` (78). Dispatch arms now short-circuit to `DoubleZeroError::Deprecated` (custom code 67); processor files and argument structs are removed. Borsh variant tags are preserved (unit variants) so the wire format is unchanged — old clients receive a deterministic deprecation error rather than an unknown-instruction decode failure. Bumps `MIN_COMPATIBLE_VERSION` to `0.15.0` (the `client/v0.14.1` git tag was a patch release built from a commit whose workspace Cargo version was still `0.14.0`, so the v0.14.1 binary self-reports as 0.14.0 in its startup version check; v0.15.0 is the first release whose embedded version actually satisfies the intended ≥ 0.14.1 gate). Gated on onchain `ProgramConfig.min_compatible_version ≥ 0.15.0` ([#3623](https://github.com/malbeclabs/doublezero/issues/3623))
- Deprecate the `ActivateUser`, `RejectUser`, `CloseAccountUser`, and `BanUser` user-lifecycle program instructions: dispatch arms now return `DoubleZeroError::Deprecated` (custom code 67), and the processor files / argument structs are removed. Borsh variant tags 37/38/43/45 are preserved so the wire format is unchanged. The activator was the only client of all four — `CreateUser` has been atomic-to-`Activated` since RFC-11, `closeaccount` was activator-driven only, and `RequestBanUser` is now atomic. Gated on onchain `min_compatible_version ≥ 0.12.0` ([#3622](https://github.com/malbeclabs/doublezero/issues/3622))
Expand Down
17 changes: 16 additions & 1 deletion e2e/internal/qa/client_unicast.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,10 +65,17 @@ func (c *Client) ConnectUserUnicast(ctx context.Context, deviceCode string, wait
ClientIp: c.ClientIP,
})
if err != nil {
if isCapacityError(err.Error()) {
return fmt.Errorf("failed to connect on host %s: %w — this may mean the QA user pubkey is not on the onchain qa-allowlist; verify with 'doublezero global-config qa-allowlist list'", c.Host, err)
}
return fmt.Errorf("failed to connect on host %s: %w", c.Host, err)
}
if !resp.GetSuccess() {
return fmt.Errorf("connection failed on host %s: %s", c.Host, resp.GetOutput())
output := strings.Join(resp.GetOutput(), "\n")
if isCapacityError(output) {
return fmt.Errorf("connection failed on host %s: %s — this may mean the QA user pubkey is not on the onchain qa-allowlist; verify with 'doublezero global-config qa-allowlist list'", c.Host, output)
}
return fmt.Errorf("connection failed on host %s: %s", c.Host, output)
}
c.log.Debug("Unicast user connected", "host", c.Host, "device", deviceCode)

Expand Down Expand Up @@ -282,6 +289,14 @@ type Hop struct {
Raw string
}

// isCapacityError reports whether s contains any of the substrings treated
// as a capacity rejection: the phrase "user limit" or one of the
// Max*Exceeded error names listed below. Matching is case-sensitive.
func isCapacityError(s string) bool {
	markers := [...]string{
		"user limit",
		"MaxUsersExceeded",
		"MaxUnicastUsersExceeded",
		"MaxMulticastPublishersExceeded",
		"MaxMulticastSubscribersExceeded",
	}
	for _, marker := range markers {
		if strings.Contains(s, marker) {
			return true
		}
	}
	return false
}

func parseMTR(input string) ([]Hop, error) {
re := regexp.MustCompile(`^\s*(\d+)\.\|\-\-\s+(\S+)\s+(\d+(?:\.\d+)?)(?:%)?\s+(\d+)\b`)

Expand Down
74 changes: 5 additions & 69 deletions e2e/internal/qa/test.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,82 +87,18 @@ func (t *Test) Devices() map[string]*Device {
return t.devices
}

// DeviceUserType selects one of the per-type user counters kept on a device:
// unicast, multicast publisher, or multicast subscriber.
type DeviceUserType int

// The known user types, numbered consecutively from zero.
const (
	DeviceUserTypeUnicast DeviceUserType = iota
	DeviceUserTypeMulticastPublisher
	DeviceUserTypeMulticastSubscriber
)

// String returns the snake_case label for d, or "unknown(N)" when d is not
// one of the declared constants.
func (d DeviceUserType) String() string {
	// Lookup table indexed by the constant values themselves.
	labels := [...]string{
		DeviceUserTypeUnicast:             "unicast",
		DeviceUserTypeMulticastPublisher:  "multicast_publisher",
		DeviceUserTypeMulticastSubscriber: "multicast_subscriber",
	}
	if d < 0 || int(d) >= len(labels) {
		return fmt.Sprintf("unknown(%d)", int(d))
	}
	return labels[d]
}

// capacityFor reads the pair of counters on d that corresponds to userType
// and returns them as (current, max). A user type that is not one of the
// declared DeviceUserType constants yields (0, 0).
func (d *Device) capacityFor(userType DeviceUserType) (current, max int) {
	// The named results start at zero, which is also the answer for an
	// unrecognized user type.
	switch userType {
	case DeviceUserTypeUnicast:
		current, max = d.UnicastUsersCount, d.MaxUnicastUsers
	case DeviceUserTypeMulticastPublisher:
		current, max = d.MulticastPublishersCount, d.MaxMulticastPublishers
	case DeviceUserTypeMulticastSubscriber:
		current, max = d.MulticastSubscribersCount, d.MaxMulticastSubscribers
	}
	return current, max
}

// ValidDevices returns devices that pass filtering criteria for the given
// user type. A device is considered valid when it has at least minCapacity
// free slots in the type-specific bucket (e.g. unicast) AND in the aggregate
// users bucket — both are enforced onchain independently.
//
// If skipCapacityCheck is true (e.g., when using a QA identity that bypasses
// on-chain capacity checks), devices are not filtered by available capacity.
func (t *Test) ValidDevices(userType DeviceUserType, minCapacity int, skipCapacityCheck bool) []*Device {
// ValidDevices returns all activated devices except those whose code contains
// "test" (typically not real hardware). Capacity is not checked here — the QA
// user pubkey should be on the onchain qa_allowlist so that the smart contract
// bypasses capacity limits for QA connections.
func (t *Test) ValidDevices() []*Device {
devices := make([]*Device, 0, len(t.devices))

for _, device := range t.Devices() {
// Skip devices with "test" in the code as these are typically not real hardware
if strings.Contains(strings.ToLower(device.Code), "test") {
t.log.Debug("Skipping test device", "device", device.Code)
continue
}

// Skip capacity check if using QA identity (bypasses on-chain max_users check)
if !skipCapacityCheck {
typeCount, typeMax := device.capacityFor(userType)
if typeMax-typeCount < minCapacity {
t.log.Debug("Skipping device with insufficient type-specific capacity",
"device", device.Code,
"userType", userType,
"count", typeCount,
"max", typeMax,
)
continue
}
if device.MaxUsers-device.UsersCount < minCapacity {
t.log.Debug("Skipping device with insufficient aggregate capacity",
"device", device.Code,
"users", device.UsersCount,
"maxUsers", device.MaxUsers,
)
continue
}
}
devices = append(devices, device)
}

Expand Down
81 changes: 70 additions & 11 deletions e2e/qa_alldevices_unicast_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,10 @@ import (
)

var (
devicesFlag = flag.String("devices", "", "comma separated list of devices to run tests against")
allocateAddrHosts = flag.String("allocate-addr-hosts", "", "comma separated list of hosts that will have `--allocate-addr` passed to `doublezero connect ibrl`")
skipCapacityCheckFlag = flag.Bool("skip-capacity-check", false, "skip device capacity checks (use when running with QA identity that bypasses on-chain max_users)")
devicesFlag = flag.String("devices", "", "comma separated list of devices to run tests against")
allocateAddrHosts = flag.String("allocate-addr-hosts", "", "comma separated list of hosts that will have `--allocate-addr` passed to `doublezero connect ibrl`")
failureThreshold = flag.Float64("failure-threshold", 0.1, "maximum allowed overall device failure rate (0.0-1.0) before the test is marked as failed")
perHostFailureThreshold = flag.Float64("per-host-failure-threshold", 0.2, "maximum allowed per-host device failure rate (0.0-1.0) before the test is marked as failed")
)

func TestQA_AllDevices_UnicastConnectivity(t *testing.T) {
Expand Down Expand Up @@ -53,11 +54,9 @@ func TestQA_AllDevices_UnicastConnectivity(t *testing.T) {
clients := test.Clients()
require.GreaterOrEqual(t, len(clients), 2, "At least 2 clients are required for connectivity testing")

// Filter devices to only include those with sufficient unicast capacity and skip test devices
// When using a QA identity (--skip-capacity-check), all devices are included regardless of capacity
devices := test.ValidDevices(qa.DeviceUserTypeUnicast, 2, *skipCapacityCheckFlag)
devices := test.ValidDevices()
if len(devices) == 0 {
t.Skip("No valid devices found with sufficient capacity")
t.Skip("No valid devices found")
}

// Filter out transit devices - they don't participate in unicast connectivity tests
Expand Down Expand Up @@ -209,6 +208,65 @@ func TestQA_AllDevices_UnicastConnectivity(t *testing.T) {
}
log.Debug("Test summary", "packetsReceived", totalReceived, "packetsSent", totalSent, "batchesWithLoss", batchesWithLoss, "totalBatches", batchCount)

// Evaluate failure rates against threshold
totalDevices := len(deviceResults)
failedDevices := 0
var failedDeviceCodes []string
for code, result := range deviceResults {
if !result.Success {
failedDevices++
failedDeviceCodes = append(failedDeviceCodes, code)
}
}

overallRate := float64(failedDevices) / float64(totalDevices)
log.Debug("Overall failure rate",
"failed", failedDevices,
"total", totalDevices,
"rate", fmt.Sprintf("%.1f%%", overallRate*100),
"threshold", fmt.Sprintf("%.1f%%", *failureThreshold*100),
)
if overallRate > *failureThreshold {
slices.Sort(failedDeviceCodes)
t.Errorf("Overall device failure rate %.1f%% (%d/%d) exceeds threshold %.1f%%. Failed devices: %s",
overallRate*100, failedDevices, totalDevices, *failureThreshold*100,
strings.Join(failedDeviceCodes, ", "))
}

type hostStats struct {
total int
failed int
failedDevices []string
}
perHost := make(map[string]*hostStats)
for _, batch := range batchData {
for host, assignment := range batch {
if perHost[host] == nil {
perHost[host] = &hostStats{}
}
perHost[host].total++
if !assignment.Success() {
perHost[host].failed++
perHost[host].failedDevices = append(perHost[host].failedDevices, assignment.Device.Code)
}
}
}
for host, stats := range perHost {
hostRate := float64(stats.failed) / float64(stats.total)
log.Debug("Per-host failure rate",
"host", host,
"failed", stats.failed,
"total", stats.total,
"rate", fmt.Sprintf("%.1f%%", hostRate*100),
)
if hostRate > *perHostFailureThreshold {
slices.Sort(stats.failedDevices)
t.Errorf("Host %s failure rate %.1f%% (%d/%d) exceeds threshold %.1f%%. Failed devices: %s",
host, hostRate*100, stats.failed, stats.total, *perHostFailureThreshold*100,
strings.Join(stats.failedDevices, ", "))
}
}

results := make([]qa.DeviceTestResult, 0, len(deviceResults))
for _, result := range deviceResults {
results = append(results, *result)
Expand Down Expand Up @@ -315,7 +373,7 @@ func connectClientsAndWaitForRoutes(
log.Error("Failed to start connection", "client", c.Host, "device", device.Code, "error", err)
batch[c.Host].FailedTests++
if device.Status == serviceability.DeviceStatusActivated && device.MaxUsers > 0 {
t.Errorf("failed to connect client %s to device %s: %v", c.Host, device.Code, err)
t.Logf("DEVICE FAILURE: failed to connect client %s to device %s: %v", c.Host, device.Code, err)
} else {
log.Warn("Ignoring connection failure for device not ready for users", "device", device.Code, "status", device.Status, "maxUsers", device.MaxUsers)
}
Expand All @@ -330,7 +388,7 @@ func connectClientsAndWaitForRoutes(
log.Error("Client failed to reach status up", "client", c.Host, "error", err)
batch[c.Host].FailedTests++
if device.Status == serviceability.DeviceStatusActivated && device.MaxUsers > 0 {
t.Errorf("failed to wait for status for client %s: %v", c.Host, err)
t.Logf("DEVICE FAILURE: failed to wait for status for client %s: %v", c.Host, err)
} else {
log.Warn("Ignoring status failure for device not ready for users", "device", device.Code, "status", device.Status, "maxUsers", device.MaxUsers)
}
Expand Down Expand Up @@ -365,7 +423,7 @@ func connectClientsAndWaitForRoutes(
log.Error("Failed to wait for routes", "client", c.Host, "error", err)
batch[c.Host].FailedTests++
if device.Status == serviceability.DeviceStatusActivated && device.MaxUsers > 0 {
t.Errorf("failed to wait for routes on client %s: %v", c.Host, err)
t.Logf("DEVICE FAILURE: failed to wait for routes on client %s: %v", c.Host, err)
} else {
log.Warn("Ignoring route failure for device not ready for users", "device", device.Code, "status", device.Status, "maxUsers", device.MaxUsers)
}
Expand Down Expand Up @@ -417,7 +475,8 @@ func runConnectivitySubtests(
srcReady := srcDevice.Status == serviceability.DeviceStatusActivated && srcDevice.MaxUsers > 0
dstReady := dstDevice.Status == serviceability.DeviceStatusActivated && dstDevice.MaxUsers > 0
if srcReady && dstReady {
assert.NoError(t, err, "failed to test connectivity")
t.Logf("DEVICE FAILURE: connectivity test failed from %s to %s (device %s -> %s): %v",
src.Host, target.Host, srcDevice.Code, dstDevice.Code, err)
} else {
log.Warn("Ignoring connectivity failure involving device not ready for users",
"sourceDevice", srcDevice.Code, "sourceStatus", srcDevice.Status, "sourceMaxUsers", srcDevice.MaxUsers,
Expand Down
Loading