diff --git a/src/underworld3/cython/petsc_generic_snes_solvers.pyx b/src/underworld3/cython/petsc_generic_snes_solvers.pyx
index 4848c719..ee30c5d1 100644
--- a/src/underworld3/cython/petsc_generic_snes_solvers.pyx
+++ b/src/underworld3/cython/petsc_generic_snes_solvers.pyx
@@ -1008,6 +1008,8 @@ class SolverBaseClass(uw_object):
                 self._stokes_nullspace = None
             if hasattr(self, "_stokes_nullspace_basis"):
                 self._stokes_nullspace_basis = ()
+            if hasattr(self, "_velocity_rotation_nullspace"):
+                self._velocity_rotation_nullspace = None
             if hasattr(self, "_constant_nullspace_obj"):
                 self._constant_nullspace_obj = None
 
@@ -2168,6 +2170,45 @@ class SNES_Scalar(SolverBaseClass):
         cdef DS ds =  self.dm.getDS()
         cdef PtrContainer ext = self.compiled_extensions
 
+        # TODO(BUG): natural BC on an INTERNAL surface is partition-dependent.
+        # An interior facet has two support cells; the FE-assembly DM is kept
+        # non-overlapped on purpose (overlap double-counts volume assembly via
+        # LocalToGlobal+ADD — see discretisation_mesh.py ~L760). At a partition
+        # seam ~few interior facets have only ONE local support cell, and PETSc's
+        # DMPlexComputeBdResidual_Single_Internal attributes the whole per-facet
+        # elemVec to support[0]'s cell closure (plexfem.c ~L4928) with the facet
+        # normal oriented outward from support[0] (dmfieldds.c ~L790). When the
+        # local-only support cell is the OPPOSITE side from the serial support[0],
+        # the integral is the same value but is scattered through a different
+        # cell closure; closure DOFs that are non-owned on this rank are added
+        # locally then dropped at global assembly → the assembled load UNDER-counts
+        # by ~0.027% (the force *function* integral / RMS is identical to machine
+        # precision). In an ill-conditioned solve (augmented Stokes_Constrained)
+        # this amplifies to ~0.1% velocity.
+        #
+        # Three cheap workarounds were TESTED and do NOT work:
+        #   * editing the boundary LABEL (owner-only / ghost-strip): no-op — the
+        #     ghost facet copies already contribute nothing (PETSc integrates each
+        #     facet once), so stripping them is bit-identical;
+        #   * a 1-cell partition OVERLAP on the assembly DM: no-op — verified the
+        #     overlap propagates (+ghost cells) yet F0 is bit-identical, so overlap
+        #     does NOT recover the dropped DOFs (and it double-counts volume anyway);
+        #   * assembling the surface load on a codim-1 SUBMESH (extract_surface):
+        #     blocked — UW3 FE on an embedded surface fails at setup because the
+        #     gradient/flux term has cdim components but is reshaped to dim
+        #     (the manifold vector-FE gap).
+        # A deterministic support[0] cannot be imposed from UW3 (support order
+        # follows the DMPlex point partition). Genuine fixes, all substantial:
+        #   (a) MANUALLY assemble the boundary load (per-facet ∫ f·φ keyed by GLOBAL
+        #       velocity DOF, each facet once) into a partition-independent vector
+        #       and inject it via a guarded SNES function wrap — reimplements the bd
+        #       residual but sidesteps PETSc's support[0] scatter;
+        #   (b) fix UW3 manifold vector-FE, then assemble the load on the surface
+        #       submesh and map to the parent by coincident-node coordinate;
+        #   (c) a PETSc patch making the interior-facet bd residual scatter to the
+        #       OWNED support cell.
+        # See planning file (underworld.md, Bugs) and
+        # project_stokes_constrained_parallel_session.
         for index,bc in enumerate(self.natural_bcs):
 
             components = bc.components
@@ -4532,7 +4573,11 @@ class SNES_Stokes_SaddlePt(SolverBaseClass):
         # interior h DOFs directly in the fine local PetscSection
         # (_constrain_interior_multipliers_in_section) — lossless (correct
         # constraint-boundary closure) and refined/FMG-safe (no DMAddBoundary
-        # ordering restriction). Default ON.
+        # ordering restriction). Default ON: it is both a correctness-neutral DOF
+        # reduction AND a conditioning win — leaving the interior h DOFs in place
+        # keeps the near-singular ε-screened interior block in the solved system,
+        # which an iterative ([p,h] Schur) solve handles poorly. See that method's
+        # docstring for why disabling it is ill-conditioned, not "more accurate".
         self._reduce_interior_multiplier = True
 
         self._degree = degree
@@ -4654,6 +4699,7 @@ class SNES_Stokes_SaddlePt(SolverBaseClass):
         self._petsc_velocity_nullspace_basis = ()
         self._stokes_nullspace = None
         self._stokes_nullspace_basis = ()
+        self._velocity_rotation_nullspace = None
 
         # Construct strainrate tensor for future usage.
         # Grab gradients, and let's switch out to sympy.Matrix notation
@@ -5295,6 +5341,10 @@ class SNES_Stokes_SaddlePt(SolverBaseClass):
     def _reset_stokes_nullspace(self):
         self._stokes_nullspace = None
         self._stokes_nullspace_basis = ()
+        # the cached velocity rotation nullspace is built from node coordinates,
+        # so it must be invalidated whenever the nullspace config / geometry
+        # changes (e.g. mesh deformation) — see _build_velocity_rotation_nullspace.
+        self._velocity_rotation_nullspace = None
 
     def _pressure_dirichlet_bcs(self):
         """Return essential boundary conditions applied to the pressure field."""
@@ -5515,6 +5565,79 @@ class SNES_Stokes_SaddlePt(SolverBaseClass):
                 flush=True,
             )
 
+    def _build_velocity_rotation_nullspace(self):
+        """Cache a velocity-rotation-only ``MatNullSpace`` (zero in pressure /
+        multiplier blocks). Used to project the rigid-rotation gauge out of the
+        converged solution — see ``_remove_velocity_rotation_gauge``. Cached;
+        the cache is invalidated by ``_reset_stokes_nullspace`` / solver rebuild
+        because the modes depend on node coordinates."""
+        if self._velocity_rotation_nullspace is not None:
+            return self._velocity_rotation_nullspace
+        if not self._petsc_velocity_nullspace_basis:
+            return None
+
+        vecs = [self._build_velocity_nullspace_vector(m)
+                for m in self._petsc_velocity_nullspace_basis]
+        orthonormal = []
+        for bvec in vecs:
+            for ovec in orthonormal:
+                bvec.axpy(-ovec.dot(bvec), ovec)
+            bnorm = bvec.norm()
+            if np.isclose(bnorm, 0.0):
+                bvec.destroy()
+                continue
+            bvec.scale(1.0 / bnorm)
+            orthonormal.append(bvec)
+        if not orthonormal:
+            return None
+
+        self._velocity_rotation_nullspace = PETSc.NullSpace().create(
+            constant=False, vectors=tuple(orthonormal), comm=self.dm.comm)
+        return self._velocity_rotation_nullspace
+
+    def _remove_velocity_rotation_gauge(self, gvec):
+        """Project the rigid-body rotation gauge out of the converged solution.
+
+        For a free-slip (non-Dirichlet) velocity, the rigid rotations are a true
+        nullspace of the velocity block (A_uu·rotation = 0). The monolithic
+        nullspace attached for the solve is NOT removed inside a fieldsplit/Schur
+        iteration — the inner velocity KSP has no rotation nullspace, so it leaves
+        an unconstrained rigid rotation in the velocity. Because the operator is
+        blind to it, the true residual still converges to machine precision, but
+        the rotation amplitude is PARTITION-DEPENDENT (the tangential velocity then
+        differs serial-vs-parallel by ~0.1-1% while the physical / radial flow is
+        partition-clean to round-off). Since the rotation is a genuine nullspace,
+        removing it from the converged ``gvec`` does NOT change the residual and
+        yields the same rotation-free solution on any decomposition. This is the
+        velocity analogue of the constant-pressure gauge (see set_pressure_gauge).
+        No-op when there are no velocity rotation modes.
+
+        Why a post-solve projection rather than attaching the rotation nullspace
+        to the fieldsplit velocity SUB-BLOCK so the inner KSP removes it in-solve:
+        the in-solve route was implemented and MEASURED, and it does not fix the
+        gauge. A ``DMSetNullSpaceConstructor(dm, 0, ...)`` (rotations-only, built
+        with ``DMPlexCreateRigidBody``) DOES attach the rotation nullspace to the
+        fieldsplit velocity block — verified: the velocity sub-block operator
+        carries the 1 (2-D) / 3 (3-D) rotation modes after PC setup — yet the
+        converged GLOBAL velocity still retained the full rotation gauge (rotation
+        coefficient ~0.18, i.e. unchanged). Removing a nullspace gauge is a
+        projection on the converged GLOBAL solution; attaching the nullspace to a
+        sub-block operator only constrains the inner Krylov solves, and with
+        ``fgmres`` + a variable (fieldsplit/Schur) preconditioner that reintroduces
+        rotation, the assembled outer solution is not gauge-free. (PETSc also
+        dispatches ``DMCreateSubDM`` nullspace constructors by SUB-DM-LOCAL field
+        index, so a field-0 constructor needs a block-sniffing guard to avoid
+        leaking onto the ``[p,h]`` Schur block — but even with that, the global
+        gauge persists.) So the explicit post-solve projection is the correct AND
+        necessary mechanism, not a stopgap: it is mathematically exact (the rotation
+        is a true nullspace, so it does not change the residual) and robust to
+        operator rebuild. Building only the rotation modes (zero in the pressure /
+        multiplier blocks) keeps it from touching the pressure/multiplier gauge,
+        which is handled by the monolithic nullspace."""
+        rot_ns = self._build_velocity_rotation_nullspace()
+        if rot_ns is not None:
+            rot_ns.remove(gvec)
+
 
     ## F0, F1 should be f0 and F1, (pf0 for Saddles can be added here)
     ## don't add new ones uf0, uF1 are redundant
@@ -6224,6 +6347,28 @@ class SNES_Stokes_SaddlePt(SolverBaseClass):
         appends the local offset of every h-bearing point regardless of
         constraint, so the copy back into the full-domain ``h`` MeshVariable is
         size-correct without change.
+
+        Why this is lossless (and why disabling it is NOT a "more accurate"
+        reference). The interior ``h`` rows are the screening block alone,
+        ``ε M_ii h_i + ε M_ib h_b = 0`` (no constraint or buoyancy source reaches
+        the interior), so the interior multiplier is fully determined by the
+        boundary trace, ``h_i = -M_ii^{-1} M_ib h_b`` — a *bounded* O(h_b)
+        quantity that feeds back into the boundary row only at O(ε). At ε=1e-6 its
+        effect on the physical solution is negligible: pinning ``h_i = 0`` instead
+        moves a converged solve by ~1e-8 in velocity and ~1e-5 in topography
+        (measured, 2-D annulus). So the pinned DOFs carry no physical signal.
+
+        Disabling the reduction (``_reduce_interior_multiplier = False``) does NOT
+        recover lost physics; it re-admits ~ndof/3 interior DOFs whose only
+        coupling is the near-singular ``ε M_ii`` block, enlarging and
+        ill-conditioning the ``[p,h]`` Schur complement. On a direct/well-converged
+        solve the answer is essentially unchanged (lossless, as above); on an
+        iterative grouped-Schur solve (e.g. ``snes_type=ksponly`` on the 3-D shell)
+        the extra ill-conditioned DOFs degrade convergence and the solve can land
+        on a different representative — which is the origin of the "answer moves a
+        few percent / parallel spread worsens when off" behaviour. That is a
+        conditioning effect, not evidence the knockout drops information. Keep it
+        ON; it is the recommended, validated path.
         """
         from petsc4py import PETSc
         import numpy as np
@@ -7245,6 +7390,13 @@ class SNES_Stokes_SaddlePt(SolverBaseClass):
             self._attach_stokes_nullspace()
             self._snes_solve_with_retries(gvec, divergence_retries, verbose)
 
+        # Project the rigid-body rotation gauge out of the converged solution.
+        # The fieldsplit/Schur inner velocity solve leaves an unconstrained (and
+        # partition-dependent) rigid rotation that the true residual is blind to;
+        # removing this genuine velocity nullspace makes the tangential velocity
+        # partition-independent to round-off without changing the residual.
+        self._remove_velocity_rotation_gauge(gvec)
+
         # SNESSetUpdate fires only at the START of each iteration, so the final
         # converged iterate is otherwise un-hooked. Apply the callbacks once more
         # to it (e.g. so a pressure gauge pins the FINAL pressure, and an
diff --git a/src/underworld3/systems/solvers.py b/src/underworld3/systems/solvers.py
index 081d552f..abc81502 100644
--- a/src/underworld3/systems/solvers.py
+++ b/src/underworld3/systems/solvers.py
@@ -2069,8 +2069,19 @@ class SNES_Stokes_Constrained(SNES_Stokes):
     The constraint is enforced in one coupled solve (no outer iteration). The
     augmented-Lagrangian term conditions the :math:`[p,h]` Schur complement
     without biasing the multiplier, and the interior multiplier DOFs are reduced
-    away so the solved block is boundary-sized. Serial only (the boundary mask is
-    not yet MPI-decomposed). See
+    away so the solved block is boundary-sized.
+
+    Runs in parallel: the interior-multiplier reduction is rank-local section
+    surgery so the global system, velocity solve, and gauge-fixed topography are
+    partition-independent (validated in
+    ``tests/parallel/test_1063_constrained_freeslip_parallel.py``). On an
+    **enclosed** problem (a pressure null space is active) the constant pressure
+    and constant multiplier are gauge-free, and the solver lands on a
+    partition-dependent level for each. To keep the raw **pressure** reproducible
+    across ranks the solver pins the surface-mean pressure automatically (see
+    :attr:`auto_pressure_gauge`); the raw **multiplier** keeps its own gauge level,
+    so read dynamic topography through :meth:`topography` with ``reference="mean"``
+    (gauge-invariant). See
     ``docs/developer/design/CONSTRAINED_FREESLIP_MULTIPLIER.md``.
 
     See Also
@@ -2104,6 +2115,17 @@ def __init__(
         # ordinary Stokes solve, so the base assembly is unaffected.
         self._block_constraint_bcs = []
 
+        # Guarded automatic pressure gauge (see _maybe_install_auto_gauge). On an
+        # enclosed constrained problem the constant pressure and constant
+        # multiplier are gauge-free; the solver lands on a partition-dependent
+        # level for each, so the raw fields are not reproducible across ranks.
+        # When a pressure null space is active we pin the surface-mean pressure,
+        # making the raw PRESSURE reproducible at zero cost to the velocity (the
+        # raw multiplier keeps its own gauge — read topography via reference="mean").
+        # Set to False to opt out.
+        self.auto_pressure_gauge = True
+        self._auto_gauge_callback = None
+
         # Default the grouped [p, lambda] Schur preconditioner to `selfp` (the base
         # Stokes default is `a11`). The constraint Schur complement
         # S_lambda = C A^-1 C^T needs a preconditioner built from the actual operator
@@ -2114,8 +2136,158 @@ def __init__(
         # multiplier (= dynamic topography) clean at small augmentation. Override via
         # `solver.petsc_options["pc_fieldsplit_schur_precondition"] = "a11"` if desired.
         self.petsc_options["pc_fieldsplit_schur_precondition"] = "selfp"
+
+        # Flexible outer Krylov (fgmres) is REQUIRED, not a tuning choice. The
+        # grouped [p, lambda] Schur preconditioner runs inner *iterative* sub-solves
+        # (velocity multigrid + the Schur KSP), so the preconditioner operator
+        # varies from one outer iteration to the next — a variable/nonlinear
+        # preconditioner. A non-flexible Krylov method (the base's gmres) is
+        # invalid for that: the preconditioned residual it minimises false-converges
+        # (it reported CONVERGED_RTOL in ~2 iterations while the TRUE residual
+        # ||r||/||b|| blew up to ~10^3), landing on a wrong, partition-dependent
+        # answer (the ~0.4% serial-vs-parallel velocity spread, and the failure to
+        # reproduce the Zhong response). fgmres handles the variable preconditioner
+        # correctly: the true residual converges and the solve is
+        # partition-independent. Override via petsc_options["ksp_type"] if needed.
+        self.petsc_options["ksp_type"] = "fgmres"
+
+        # Judge outer convergence on the TRUE (unpreconditioned) residual. With a
+        # variable preconditioner the preconditioned-residual norm is not a fixed
+        # inner product and false-converges (it can report convergence while the
+        # true residual is still large), so the default preconditioned-norm test
+        # would stop the solve early even with a tight rtol — re-introducing the
+        # partition-dependent velocity. The unpreconditioned norm costs one extra
+        # matvec per iteration and makes the stopping test honest.
+        self.petsc_options["ksp_norm_type"] = "unpreconditioned"
+
+        # Bound the outer iteration count. The well-set-up solve converges in a
+        # handful of outer iterations (~6 on the 3-D shell at cellSize 1/8); this
+        # generous cap just guarantees a pathological case fails loudly
+        # (DIVERGED_ITS) rather than silently grinding. Raise it for genuinely
+        # hard problems via petsc_options["ksp_max_it"].
+        self.petsc_options["ksp_max_it"] = 100
+
+        # Tighten the Eisenstat-Walker adaptive linear tolerance (the base saddle
+        # point enables snes_ksp_ew with the PETSc default initial/max rtol 0.3).
+        # EW's loose-early heuristic is right for a NONLINEAR Newton solve, but the
+        # constrained solver is normally run linear (snes_type=ksponly), where EW
+        # then caps the single outer solve at rtol 0.3 — one outer iteration, true
+        # residual ~1e-5 relative. The augmented saddle point is ill-conditioned
+        # (augmentation ~1e4), so that loose residual is amplified into a ~0.1%
+        # partition-dependent velocity (GAMG's per-partition aggregation makes the
+        # preconditioner partition-dependent, which only cancels once the TRUE
+        # residual is driven down). Pinning EW's initial = max rtol to the solver
+        # tolerance makes the outer fgmres iterate until genuinely converged, so
+        # the velocity is partition-independent to round-off. Kept in sync by the
+        # `tolerance` setter below.
+        self.petsc_options["snes_ksp_ew_rtol0"] = self._tolerance * 1.0e-1
+        self.petsc_options["snes_ksp_ew_rtolmax"] = self._tolerance * 1.0e-1
         return
 
+    @property
+    def tolerance(self):
+        """Solver tolerance (see :class:`SNES_Stokes_SaddlePt.tolerance`).
+
+        Overridden so that, in addition to ``snes_rtol`` / ``ksp_rtol`` /
+        ``ksp_atol``, the Eisenstat-Walker initial and max relative tolerances are
+        pinned to ``tolerance * 0.1`` — otherwise EW's default (0.3) under-solves
+        the ill-conditioned augmented constrained system on a linear solve and the
+        velocity becomes partition-dependent (see ``__init__``).
+        """
+        return self._tolerance
+
+    @tolerance.setter
+    def tolerance(self, value):
+        self._tolerance = value
+        self.petsc_options["snes_rtol"] = value
+        self.petsc_options["ksp_rtol"] = value * 1.0e-1
+        self.petsc_options["ksp_atol"] = value * 1.0e-6
+        self.petsc_options["snes_ksp_ew_rtol0"] = value * 1.0e-1
+        self.petsc_options["snes_ksp_ew_rtolmax"] = value * 1.0e-1
+
+    def solve(self, *args, **kwargs):
+        """Solve the constrained Stokes system (see :meth:`SNES_Stokes.solve`).
+
+        Installs the guarded automatic pressure gauge (if eligible) before
+        delegating to the base solve, so the raw pressure and multiplier are
+        partition-reproducible by construction on enclosed problems.
+        """
+        self._warn_if_monolithic_direct()
+        self._maybe_install_auto_gauge()
+        return super().solve(*args, **kwargs)
+
+    def _warn_if_monolithic_direct(self):
+        """Warn that a monolithic direct solve of the constrained system is a
+        serial diagnostic only.
+
+        Setting ``pc_type = lu``/``cholesky`` factorises the whole
+        :math:`[\\,u, p, h\\,]` saddle point as one block. On the constrained
+        system this is a KNOWN-BAD path: it does not reproduce the validated
+        grouped-Schur (``selfp``) velocity response (e.g. surface velocity ~4e-3
+        vs the ~1e-2 Zhong reference on the 3-D shell) AND it segfaults in
+        parallel (the indefinite KKT factorisation is not robust here). The
+        grouped :math:`\\mathbf{u}\\,|\\,[p,h]` field-split is the supported path;
+        use ``pc_type = lu`` only as a serial cross-check, knowing the response is
+        not the benchmark reference.
+        """
+        if not self._block_constraint_bcs:
+            return
+        try:
+            pc_type = self.petsc_options["pc_type"]
+        except KeyError:
+            return
+        if pc_type in ("lu", "cholesky"):
+            import warnings
+            warnings.warn(
+                "Stokes_Constrained: a monolithic direct solve "
+                f"(pc_type='{pc_type}') of the constrained saddle point is a "
+                "serial diagnostic only — it does not reproduce the validated "
+                "grouped-Schur velocity response and segfaults in parallel. Use "
+                "the default field-split (u | [p,h]) path for production runs.",
+                stacklevel=2,
+            )
+
+    def _maybe_install_auto_gauge(self):
+        """Pin the (p, h) gauge consistently on an enclosed constrained problem.
+
+        Conservative — installs ``set_pressure_gauge`` on the first constraint
+        boundary (``reference=0``) only when ALL of the following hold, otherwise
+        it is a no-op and the solve path is unchanged:
+
+        * ``auto_pressure_gauge`` is left True (the default),
+        * we have not already installed it (idempotent across re-solves),
+        * at least one multiplier constraint is registered,
+        * a pressure null space is active (``petsc_use_nullspace`` / enclosed) —
+          this is the gauge-freedom flag; with it off the pressure is determined
+          and must not be shifted,
+        * no pressure Dirichlet BC already pins the gauge,
+        * the user has registered no update callback of their own (e.g. an
+          explicit ``set_pressure_gauge`` or a smoother) — we never clobber it.
+
+        Pinning the surface-mean pressure makes the raw **pressure** field
+        partition-reproducible (measured: the enclosed-shell mean pressure goes
+        from a ~10% serial-vs-np8 spread to the assembly floor, ~0.4%). It is
+        physics-neutral: the velocity is bit-identical with and without the pin.
+        It does NOT fix the raw **multiplier** ``h`` — the constant multiplier is
+        an independent gauge freedom the pressure pin does not touch — so dynamic
+        topography must still be read with ``topography(..., reference="mean")``,
+        which is gauge-invariant by construction.
+        """
+        if not self.auto_pressure_gauge:
+            return
+        if self._auto_gauge_callback is not None:
+            return
+        if not self._block_constraint_bcs:
+            return
+        if not self._petsc_use_pressure_nullspace:
+            return
+        if self._pressure_dirichlet_bcs():
+            return
+        if self._snes_update_callbacks:
+            return
+        boundary = self._block_constraint_bcs[0].boundary
+        self._auto_gauge_callback = self.set_pressure_gauge(boundary, 0.0)
+
     @property
     def saddle_preconditioner(self):
         """Not used by ``Stokes_Constrained`` — the Schur preconditioner is built
@@ -2194,10 +2366,12 @@ def add_constraint_bc(self, boundary, g=0.0, normal=None, screening=None,
         # Validated bit-identical at np=1/2/4 (velocity L2 and mean-stripped
         # boundary topography) in
         # tests/parallel/test_1063_constrained_freeslip_parallel.py.
-        # NOTE: on enclosed problems the raw multiplier h carries the [p,λ] gauge
-        # constant, of which the solver lands on a partition-dependent
-        # representative — strip its boundary mean for a reproducible topography
-        # (`topography(..., reference="mean")`).
+        # NOTE: on enclosed problems the constant pressure and constant multiplier
+        # are gauge-free and land on a partition-dependent level. The automatic
+        # pressure gauge (auto_pressure_gauge, see _maybe_install_auto_gauge) pins
+        # the raw PRESSURE reproducibly, but the raw multiplier h keeps its own
+        # gauge level — read dynamic topography via topography(..., reference="mean"),
+        # which is gauge-invariant and partition-reproducible by construction.
 
         if not hasattr(self.mesh.boundaries, boundary):
             raise ValueError(
@@ -2284,6 +2458,13 @@ def topography(self, boundary, buoyancy_scale=1.0, reference=None):
         **no** gauge freedom (e.g. an open boundary), where the mean of :math:`h` is
         the physical mean traction and must NOT be removed.
 
+        Note that the automatic pressure gauge (:attr:`auto_pressure_gauge`) fixes
+        the raw *pressure* level but NOT the raw *multiplier* level (the constant
+        multiplier is an independent gauge freedom). So on an enclosed problem the
+        raw multiplier (``reference=None``) is still partition-dependent —
+        ``reference="mean"`` is the gauge-invariant, partition-reproducible read
+        for dynamic topography and is the recommended path.
+
         Parameters
         ----------
         boundary : str
diff --git a/tests/parallel/test_1063_constrained_freeslip_parallel.py b/tests/parallel/test_1063_constrained_freeslip_parallel.py
index 789e1845..0d1c75b7 100644
--- a/tests/parallel/test_1063_constrained_freeslip_parallel.py
+++ b/tests/parallel/test_1063_constrained_freeslip_parallel.py
@@ -76,6 +76,76 @@ def _solve_diagnostics(kind):
     return L2, topo
 
 
+# Serial (np=1) reference for the gauge-reproducibility test below. The enclosed
+# iso problem with an ACTIVE pressure null space exercises the automatic pressure
+# gauge (auto_pressure_gauge, default on): the constant pressure and constant
+# multiplier are both gauge-free, so without a pin the solver lands on a
+# partition-dependent level for each. The auto gauge pins the raw PRESSURE
+# reproducibly (the raw multiplier keeps its own gauge freedom — topography is
+# read gauge-invariantly via reference="mean"). Velocity is physics-neutral.
+# (velocity L2, raw meanP, MEAN-STRIPPED boundary topography).
+# Recompute with `python <thisfile> gauge`.
+GOLDEN_GAUGE = (6.194547487092e-01, -3.847657609797e-10, 3.781793823254e+01)
+
+
+def _solve_gauge_diagnostics():
+    """Enclosed constrained annulus with an active pressure null space (so the
+    automatic pressure gauge fires); return the gauge-relevant diagnostics."""
+    mesh = uw.meshing.Annulus(radiusOuter=1.0, radiusInner=0.5,
+                              cellSize=0.12, qdegree=4)
+    v = uw.discretisation.MeshVariable("Ug", mesh, 2, degree=2)
+    p = uw.discretisation.MeshVariable("Pg", mesh, 1, degree=1)
+    X = mesh.CoordinateSystem.X
+    unit_r = mesh.CoordinateSystem.unit_e_0
+
+    st = uw.systems.Stokes_Constrained(mesh, velocityField=v, pressureField=p)
+    st.constitutive_model = uw.constitutive_models.ViscousFlowModel
+    st.constitutive_model.Parameters.shear_viscosity_0 = 1.0
+    st.tolerance = 1.0e-10
+    st.petsc_use_nullspace = True            # enclosed -> pressure null space active
+    st.add_essential_bc((0.0, 0.0), "Lower")  # kill the velocity rotation null space
+    st.add_constraint_bc("Upper", g=0.0, normal=unit_r)
+    st.bodyforce = 1.0e2 * sympy.sin(3 * sympy.atan2(X[1], X[0])) * unit_r
+    st.solve(zero_init_guess=True)
+    assert st._auto_gauge_callback is not None, "auto pressure gauge should have fired"
+
+    L2 = float(np.sqrt(uw.maths.Integral(mesh, v.sym.dot(v.sym)).evaluate()))
+    vol = float(uw.maths.Integral(mesh, sympy.Integer(1)).evaluate())
+    # RAW mean pressure — pinned to ~0 by the auto gauge, partition-reproducible.
+    meanP = float(uw.maths.Integral(mesh, p.sym[0]).evaluate()) / vol
+    # MEAN-STRIPPED topography (reference="mean") — gauge-invariant, the supported
+    # partition-reproducible read of the multiplier.
+    topo_fn = st.topography("Upper", reference="mean")
+    topoL2 = float(np.sqrt(uw.maths.BdIntegral(
+        mesh=mesh, fn=topo_fn ** 2, boundary="Upper").evaluate()))
+    return L2, meanP, topoL2
+
+
+def test_constrained_raw_gauge_partition_independent():
+    """With the automatic pressure gauge on (default), the RAW mean pressure is
+    partition-independent (pinned to ~0), the velocity stays bit-identical (the
+    gauge is physics-neutral), and the mean-stripped topography is reproducible.
+    The raw multiplier level is NOT asserted reproducible — it keeps an
+    independent gauge freedom the pressure pin does not touch (use
+    reference="mean")."""
+    L2, meanP, topo = _solve_gauge_diagnostics()
+    L2_ref, meanP_ref, topo_ref = GOLDEN_GAUGE
+    # Velocity is physics-neutral under the gauge; the residual ~1e-9 spread is
+    # parallel round-off in the pressure-null-space projection (this enclosed
+    # config carries the null space, unlike the half-pinned case above), well
+    # below any real partition effect.
+    assert np.isclose(L2, L2_ref, rtol=1e-8, atol=0), (
+        f"velocity L2 differs serial vs np={uw.mpi.size}: {L2_ref} vs {L2}")
+    # meanP is pinned to ~0 on the gauge boundary; compare on an absolute scale
+    # (a relative tolerance is meaningless near machine zero).
+    assert np.isclose(meanP, meanP_ref, rtol=0, atol=1e-6), (
+        f"raw mean pressure differs serial vs np={uw.mpi.size}: "
+        f"{meanP_ref} vs {meanP}")
+    assert np.isclose(topo, topo_ref, rtol=1e-6, atol=0), (
+        f"mean-stripped topography differs serial vs np={uw.mpi.size}: "
+        f"{topo_ref} vs {topo}")
+
+
 @pytest.mark.parametrize("kind", ["iso", "ti"])
 def test_constrained_freeslip_partition_independent(kind):
     """The parallel solve must reproduce the serial reference: velocity bit-
@@ -91,9 +161,14 @@ def test_constrained_freeslip_partition_independent(kind):
 
 
 if __name__ == "__main__":
-    # Recompute the serial GOLDEN reference: `python <thisfile> {iso,ti}`.
+    # Recompute the serial GOLDEN reference: `python <thisfile> {iso,ti,gauge}`.
     import sys
     _kind = sys.argv[1] if len(sys.argv) > 1 else "iso"
-    _L2, _topo = _solve_diagnostics(_kind)
-    if uw.mpi.rank == 0:
-        print(f"DIAG {_L2:.12e} {_topo:.12e}")
+    if _kind == "gauge":
+        _L2, _meanP, _topo = _solve_gauge_diagnostics()
+        if uw.mpi.rank == 0:
+            print(f"DIAG_GAUGE {_L2:.12e} {_meanP:.12e} {_topo:.12e}")
+    else:
+        _L2, _topo = _solve_diagnostics(_kind)
+        if uw.mpi.rank == 0:
+            print(f"DIAG {_L2:.12e} {_topo:.12e}")
diff --git a/tests/test_1064_constrained_spherical_shell_response.py b/tests/test_1064_constrained_spherical_shell_response.py
index 56acda5a..72af086b 100644
--- a/tests/test_1064_constrained_spherical_shell_response.py
+++ b/tests/test_1064_constrained_spherical_shell_response.py
@@ -6,9 +6,23 @@
   scale for this low-resolution response case;
 * a direct-LU diagnostic path gives matching Nitsche and constrained responses,
   but it does not reproduce the validated Nitsche/default response and should
-  not be treated as the benchmark reference;
-* the practical fast grouped-Schur constrained path currently does not reproduce
-  the Zhong velocity response and remains an expected failure.
+  not be treated as the benchmark reference. A monolithic direct factorisation
+  of the *constrained* saddle point is a SERIAL diagnostic only: it gives the
+  wrong velocity response and segfaults in parallel (the indefinite KKT
+  factorisation is not robust here), so ``Stokes_Constrained`` now emits a
+  warning when ``pc_type`` is ``lu``/``cholesky``. Use the grouped
+  ``u | [p,h]`` field-split for production;
+* the DEFAULT constrained field-split path now reproduces the Zhong velocity
+  response: ``Stokes_Constrained`` defaults to a flexible outer Krylov (fgmres)
+  with an unpreconditioned (true-residual) convergence test and a tightened
+  Eisenstat-Walker tolerance — required because the grouped ``u | [p,h]`` Schur
+  preconditioner uses inner *iterative* sub-solves (a variable preconditioner),
+  which a non-flexible method false-converged on, giving a wrong, partition-
+  dependent answer;
+* two *alternative* sub-solver choices remain expected failures for this load:
+  the fast grouped-Schur variant with a ``gasm`` Schur block, and the
+  direct-LU-sub-solve field-split (LU on the indefinite KKT blocks is not robust
+  and segfaults in parallel).
 
 Run:
     pixi run -e amr-dev pytest -q tests/test_1064_constrained_spherical_shell_response.py
@@ -176,6 +190,24 @@ def test_default_nitsche_matches_zhong_velocity_response():
     assert abs(cmb_velocity - ZHONG_CMB_VELOCITY) / ZHONG_CMB_VELOCITY < 0.05
 
 
+def test_default_constrained_matches_zhong_velocity_response():
+    """The DEFAULT constrained field-split path reproduces the Zhong response.
+
+    This is the headline of the fgmres + unpreconditioned-norm + tightened-EW
+    solver defaults: the grouped u | [p,h] Schur solve now genuinely converges
+    (it previously false-converged on the variable preconditioner) and matches
+    the validated Nitsche/default velocity scale to within the benchmark tol.
+    """
+    surface_velocity, cmb_velocity, snes_reason = solve_response(
+        "constrained",
+        "default",
+    )
+
+    assert snes_reason > 0
+    assert abs(surface_velocity - ZHONG_SURFACE_VELOCITY) / ZHONG_SURFACE_VELOCITY < 0.05
+    assert abs(cmb_velocity - ZHONG_CMB_VELOCITY) / ZHONG_CMB_VELOCITY < 0.05
+
+
 def test_direct_lu_diagnostic_constrained_matches_direct_lu_diagnostic_nitsche():
     nitsche_surface, nitsche_cmb, nitsche_reason = solve_response(
         "nitsche",
diff --git a/tests/test_1065_rotation_gauge_freeslip.py b/tests/test_1065_rotation_gauge_freeslip.py
new file mode 100644
index 00000000..a7ac3706
--- /dev/null
+++ b/tests/test_1065_rotation_gauge_freeslip.py
@@ -0,0 +1,79 @@
+"""Regression test for the free-slip velocity rigid-rotation gauge fix.
+
+On a fully free-slip (no Dirichlet velocity BC) domain the rigid-body rotations
+are a true nullspace of the velocity block (A_uu·rotation = 0). PETSc's
+fieldsplit/Schur inner velocity solve has no rotation nullspace attached, so it
+leaves an UNCONSTRAINED rigid rotation in the converged velocity. The operator is
+blind to it (the true residual still converges to machine precision), but the
+rotation amplitude is partition-dependent — so the tangential velocity differs
+serial-vs-parallel even though the physical (radial) flow does not.
+
+``SNES_Stokes_SaddlePt.solve`` projects this rotation gauge out of the converged
+solution (``_remove_velocity_rotation_gauge``). This test pins that behaviour: on
+a fully free-slip annulus driven by a PURELY RADIAL body force (which does zero
+work against a rigid rotation, so the physical solution carries no net rotation),
+the converged velocity's projection onto the rigid-rotation mode must be ~0.
+
+Without the fix the solve leaves a non-trivial rotation component (~1e-3 of the
+flow); with the fix it is at round-off. Serial, fast — no MPI required.
+"""
+
+import numpy as np
+import sympy
+import pytest
+
+import underworld3 as uw
+
+pytestmark = [pytest.mark.timeout(120)]
+
+
+def _freeslip_rotation_coefficient():
+    """Solve a fully free-slip annulus with a radial body force and return the
+    normalised projection of the velocity onto the rigid-rotation mode
+    t(x,y) = (-y, x): |<v, t>| / (|v| |t|).  ~0 iff the rotation gauge is removed."""
+    mesh = uw.meshing.Annulus(radiusOuter=1.0, radiusInner=0.5,
+                              cellSize=0.12, qdegree=4)
+    v = uw.discretisation.MeshVariable("U", mesh, 2, degree=2)
+    p = uw.discretisation.MeshVariable("P", mesh, 1, degree=1)
+    X = mesh.CoordinateSystem.X
+    unit_r = mesh.CoordinateSystem.unit_e_0
+
+    st = uw.systems.Stokes_Constrained(mesh, velocityField=v, pressureField=p)
+    st.constitutive_model = uw.constitutive_models.ViscousFlowModel
+    st.constitutive_model.Parameters.shear_viscosity_0 = 1.0
+    st.tolerance = 1.0e-10
+    st.petsc_use_nullspace = True            # build the rotation (+ pressure) nullspace
+    # Fully free-slip: BOTH boundaries are constraint BCs, NO essential velocity
+    # BC -> the rigid-rotation nullspace is ACTIVE (the case the fix targets).
+    st.add_constraint_bc("Upper", g=0.0, normal=unit_r)
+    st.add_constraint_bc("Lower", g=0.0, normal=unit_r)
+    # purely radial drive: does zero work against a rigid rotation, so the
+    # physical solution has no net rotation component.
+    st.bodyforce = 1.0e2 * sympy.sin(3 * sympy.atan2(X[1], X[0])) * unit_r
+    st.solve(zero_init_guess=True)
+
+    # rigid-rotation field t = (-y, x); project the converged velocity onto it.
+    rot = sympy.Matrix([-X[1], X[0]])
+    v_dot_rot = float(uw.maths.Integral(mesh, v.sym.dot(rot)).evaluate())
+    rot_dot_rot = float(uw.maths.Integral(mesh, rot.dot(rot)).evaluate())
+    v_dot_v = float(uw.maths.Integral(mesh, v.sym.dot(v.sym)).evaluate())
+    # normalised rotation coefficient: |<v,t>| / (|v| |t|)
+    return abs(v_dot_rot) / (np.sqrt(v_dot_v) * np.sqrt(rot_dot_rot))
+
+
+@pytest.mark.tier_a
+@pytest.mark.level_1
+def test_freeslip_velocity_has_no_rotation_gauge():
+    """The converged free-slip velocity must carry no rigid-rotation component:
+    the solver projects the rotation gauge out. Guards
+    ``_remove_velocity_rotation_gauge`` — without it this coefficient is ~1e-3."""
+    coeff = _freeslip_rotation_coefficient()
+    assert coeff < 1.0e-8, (
+        "free-slip velocity retains a rigid-rotation gauge component "
+        f"(|<v,rot>|/(|v||rot|) = {coeff:.3e}); the post-solve rotation "
+        "projection (_remove_velocity_rotation_gauge) did not run / is ineffective"
+    )
+
+
+if __name__ == "__main__":
+    print(f"rotation coefficient = {_freeslip_rotation_coefficient():.6e}")