diff --git a/CHANGELOG.md b/CHANGELOG.md index 75e0f25ffc..c25cddc766 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,12 +4,15 @@ All notable changes to this project will be documented in this file. ## Unreleased +- e2e/qa: remove client-side capacity pre-filtering from `ValidDevices`, because the QA user pubkey bypasses capacity limits using the serviceability global-config qa-allowlist. Individual device failures no longer fail the test; instead, overall and per-host failure rates are evaluated after all batches and the test only fails if either exceeds `--failure-threshold` (default 10%) or `--per-host-failure-threshold` (default 20%). + ## [v0.24.0](https://github.com/malbeclabs/doublezero/compare/client/v0.23.0...client/v0.24.0) - 2026-05-22 ### Breaking ### Changes + - Smartcontract - Deprecate the 13 contributor-side program instructions whose only client was the now-deleted activator: `ActivateDevice` (21), `RejectDevice` (22), `CloseAccountDevice` (27), `ActivateLink` (29), `RejectLink` (30), `CloseAccountLink` (35), `ActivateMulticastGroup` (47), `RejectMulticastGroup` (48), `DeactivateMulticastGroup` (53), `ActivateDeviceInterface` (72), `RemoveDeviceInterface` (75), `UnlinkDeviceInterface` (77), and `RejectDeviceInterface` (78). Dispatch arms now short-circuit to `DoubleZeroError::Deprecated` (custom code 67); processor files and argument structs are removed. Borsh variant tags are preserved (unit variants) so the wire format is unchanged — old clients receive a deterministic deprecation error rather than an unknown-instruction decode failure. Bumps `MIN_COMPATIBLE_VERSION` to `0.15.0` (the `client/v0.14.1` git tag was a patch release built from a commit whose workspace Cargo version was still `0.14.0`, so the v0.14.1 binary self-reports as 0.14.0 in its startup version check; v0.15.0 is the first release whose embedded version actually satisfies the intended ≥ 0.14.1 gate). 
// isCapacityError reports whether s contains any of the substrings this
// package treats as the signature of a capacity-limit rejection: the phrase
// "user limit" or one of the Max*Exceeded error names listed below.
// Matching is a plain, case-sensitive substring search.
func isCapacityError(s string) bool {
	markers := [...]string{
		"user limit",
		"MaxUsersExceeded",
		"MaxUnicastUsersExceeded",
		"MaxMulticastPublishersExceeded",
		"MaxMulticastSubscribersExceeded",
	}
	for _, marker := range markers {
		if strings.Contains(s, marker) {
			return true
		}
	}
	return false
}
A device is considered valid when it has at least minCapacity -// free slots in the type-specific bucket (e.g. unicast) AND in the aggregate -// users bucket — both are enforced onchain independently. -// -// If skipCapacityCheck is true (e.g., when using a QA identity that bypasses -// on-chain capacity checks), devices are not filtered by available capacity. -func (t *Test) ValidDevices(userType DeviceUserType, minCapacity int, skipCapacityCheck bool) []*Device { +// ValidDevices returns all activated devices except those whose code contains +// "test" (typically not real hardware). Capacity is not checked here — the QA +// user pubkey should be on the onchain qa_allowlist so that the smart contract +// bypasses capacity limits for QA connections. +func (t *Test) ValidDevices() []*Device { devices := make([]*Device, 0, len(t.devices)) for _, device := range t.Devices() { - // Skip devices with "test" in the code as these are typically not real hardware if strings.Contains(strings.ToLower(device.Code), "test") { t.log.Debug("Skipping test device", "device", device.Code) continue } - - // Skip capacity check if using QA identity (bypasses on-chain max_users check) - if !skipCapacityCheck { - typeCount, typeMax := device.capacityFor(userType) - if typeMax-typeCount < minCapacity { - t.log.Debug("Skipping device with insufficient type-specific capacity", - "device", device.Code, - "userType", userType, - "count", typeCount, - "max", typeMax, - ) - continue - } - if device.MaxUsers-device.UsersCount < minCapacity { - t.log.Debug("Skipping device with insufficient aggregate capacity", - "device", device.Code, - "users", device.UsersCount, - "maxUsers", device.MaxUsers, - ) - continue - } - } devices = append(devices, device) } diff --git a/e2e/qa_alldevices_unicast_test.go b/e2e/qa_alldevices_unicast_test.go index de15f548c1..95ebb8a39f 100644 --- a/e2e/qa_alldevices_unicast_test.go +++ b/e2e/qa_alldevices_unicast_test.go @@ -22,9 +22,10 @@ import ( ) var ( - 
devicesFlag = flag.String("devices", "", "comma separated list of devices to run tests against") - allocateAddrHosts = flag.String("allocate-addr-hosts", "", "comma separated list of hosts that will have `--allocate-addr` passed to `doublezero connect ibrl`") - skipCapacityCheckFlag = flag.Bool("skip-capacity-check", false, "skip device capacity checks (use when running with QA identity that bypasses on-chain max_users)") + devicesFlag = flag.String("devices", "", "comma separated list of devices to run tests against") + allocateAddrHosts = flag.String("allocate-addr-hosts", "", "comma separated list of hosts that will have `--allocate-addr` passed to `doublezero connect ibrl`") + failureThreshold = flag.Float64("failure-threshold", 0.1, "maximum allowed overall device failure rate (0.0-1.0) before the test is marked as failed") + perHostFailureThreshold = flag.Float64("per-host-failure-threshold", 0.2, "maximum allowed per-host device failure rate (0.0-1.0) before the test is marked as failed") ) func TestQA_AllDevices_UnicastConnectivity(t *testing.T) { @@ -53,11 +54,9 @@ func TestQA_AllDevices_UnicastConnectivity(t *testing.T) { clients := test.Clients() require.GreaterOrEqual(t, len(clients), 2, "At least 2 clients are required for connectivity testing") - // Filter devices to only include those with sufficient unicast capacity and skip test devices - // When using a QA identity (--skip-capacity-check), all devices are included regardless of capacity - devices := test.ValidDevices(qa.DeviceUserTypeUnicast, 2, *skipCapacityCheckFlag) + devices := test.ValidDevices() if len(devices) == 0 { - t.Skip("No valid devices found with sufficient capacity") + t.Skip("No valid devices found") } // Filter out transit devices - they don't participate in unicast connectivity tests @@ -209,6 +208,65 @@ func TestQA_AllDevices_UnicastConnectivity(t *testing.T) { } log.Debug("Test summary", "packetsReceived", totalReceived, "packetsSent", totalSent, "batchesWithLoss", 
batchesWithLoss, "totalBatches", batchCount) + // Evaluate failure rates against threshold + totalDevices := len(deviceResults) + failedDevices := 0 + var failedDeviceCodes []string + for code, result := range deviceResults { + if !result.Success { + failedDevices++ + failedDeviceCodes = append(failedDeviceCodes, code) + } + } + + overallRate := float64(failedDevices) / float64(totalDevices) + log.Debug("Overall failure rate", + "failed", failedDevices, + "total", totalDevices, + "rate", fmt.Sprintf("%.1f%%", overallRate*100), + "threshold", fmt.Sprintf("%.1f%%", *failureThreshold*100), + ) + if overallRate > *failureThreshold { + slices.Sort(failedDeviceCodes) + t.Errorf("Overall device failure rate %.1f%% (%d/%d) exceeds threshold %.1f%%. Failed devices: %s", + overallRate*100, failedDevices, totalDevices, *failureThreshold*100, + strings.Join(failedDeviceCodes, ", ")) + } + + type hostStats struct { + total int + failed int + failedDevices []string + } + perHost := make(map[string]*hostStats) + for _, batch := range batchData { + for host, assignment := range batch { + if perHost[host] == nil { + perHost[host] = &hostStats{} + } + perHost[host].total++ + if !assignment.Success() { + perHost[host].failed++ + perHost[host].failedDevices = append(perHost[host].failedDevices, assignment.Device.Code) + } + } + } + for host, stats := range perHost { + hostRate := float64(stats.failed) / float64(stats.total) + log.Debug("Per-host failure rate", + "host", host, + "failed", stats.failed, + "total", stats.total, + "rate", fmt.Sprintf("%.1f%%", hostRate*100), + ) + if hostRate > *perHostFailureThreshold { + slices.Sort(stats.failedDevices) + t.Errorf("Host %s failure rate %.1f%% (%d/%d) exceeds threshold %.1f%%. 
Failed devices: %s", + host, hostRate*100, stats.failed, stats.total, *perHostFailureThreshold*100, + strings.Join(stats.failedDevices, ", ")) + } + } + results := make([]qa.DeviceTestResult, 0, len(deviceResults)) for _, result := range deviceResults { results = append(results, *result) @@ -315,7 +373,7 @@ func connectClientsAndWaitForRoutes( log.Error("Failed to start connection", "client", c.Host, "device", device.Code, "error", err) batch[c.Host].FailedTests++ if device.Status == serviceability.DeviceStatusActivated && device.MaxUsers > 0 { - t.Errorf("failed to connect client %s to device %s: %v", c.Host, device.Code, err) + t.Logf("DEVICE FAILURE: failed to connect client %s to device %s: %v", c.Host, device.Code, err) } else { log.Warn("Ignoring connection failure for device not ready for users", "device", device.Code, "status", device.Status, "maxUsers", device.MaxUsers) } @@ -330,7 +388,7 @@ func connectClientsAndWaitForRoutes( log.Error("Client failed to reach status up", "client", c.Host, "error", err) batch[c.Host].FailedTests++ if device.Status == serviceability.DeviceStatusActivated && device.MaxUsers > 0 { - t.Errorf("failed to wait for status for client %s: %v", c.Host, err) + t.Logf("DEVICE FAILURE: failed to wait for status for client %s: %v", c.Host, err) } else { log.Warn("Ignoring status failure for device not ready for users", "device", device.Code, "status", device.Status, "maxUsers", device.MaxUsers) } @@ -365,7 +423,7 @@ func connectClientsAndWaitForRoutes( log.Error("Failed to wait for routes", "client", c.Host, "error", err) batch[c.Host].FailedTests++ if device.Status == serviceability.DeviceStatusActivated && device.MaxUsers > 0 { - t.Errorf("failed to wait for routes on client %s: %v", c.Host, err) + t.Logf("DEVICE FAILURE: failed to wait for routes on client %s: %v", c.Host, err) } else { log.Warn("Ignoring route failure for device not ready for users", "device", device.Code, "status", device.Status, "maxUsers", device.MaxUsers) } @@ 
-417,7 +475,8 @@ func runConnectivitySubtests( srcReady := srcDevice.Status == serviceability.DeviceStatusActivated && srcDevice.MaxUsers > 0 dstReady := dstDevice.Status == serviceability.DeviceStatusActivated && dstDevice.MaxUsers > 0 if srcReady && dstReady { - assert.NoError(t, err, "failed to test connectivity") + t.Logf("DEVICE FAILURE: connectivity test failed from %s to %s (device %s -> %s): %v", + src.Host, target.Host, srcDevice.Code, dstDevice.Code, err) } else { log.Warn("Ignoring connectivity failure involving device not ready for users", "sourceDevice", srcDevice.Code, "sourceStatus", srcDevice.Status, "sourceMaxUsers", srcDevice.MaxUsers,