From 5ac12bff06c920b4c355cc4177ff034b71ee3367 Mon Sep 17 00:00:00 2001 From: Jonathan Davies Date: Wed, 17 Jun 2026 13:46:51 +0000 Subject: [PATCH 1/2] arm64: Add float/double cast optimisation tests --- .../JIT/opt/SVE/PredicateInstructions.cs | 182 ++++++++++++++++++ 1 file changed, 182 insertions(+) diff --git a/src/tests/JIT/opt/SVE/PredicateInstructions.cs b/src/tests/JIT/opt/SVE/PredicateInstructions.cs index 787e4c00a50f88..5e27e94a475f89 100644 --- a/src/tests/JIT/opt/SVE/PredicateInstructions.cs +++ b/src/tests/JIT/opt/SVE/PredicateInstructions.cs @@ -11,6 +11,9 @@ public class PredicateInstructions { + private static readonly float[] s_floatValues = new float[64]; + private static readonly double[] s_doubleValues = new double[64]; + [MethodImpl(MethodImplOptions.NoInlining)] [Fact] public static void TestPredicateInstructions() @@ -40,6 +43,17 @@ public static void TestPredicateInstructions() UnzipEvenZipLowMask(vecs, vecs); TransposeEvenAndMask(vecs, vecs, vecs); + PredicateCastFloatLoad(s_floatValues, 0, s_floatValues.Length); + PredicateCastFloatLocalLoad(s_floatValues, 0, s_floatValues.Length); + PointerCastFloatLoad(s_floatValues, 0, s_floatValues.Length); + WhileLessThanSingleFloatLoad(s_floatValues, 0, s_floatValues.Length); + PredicateCastFloatLoop(s_floatValues, s_floatValues, s_floatValues.Length); + + PredicateCastDoubleLoad(s_doubleValues, 0, s_doubleValues.Length); + PredicateCastDoubleLocalLoad(s_doubleValues, 0, s_doubleValues.Length); + PointerCastDoubleLoad(s_doubleValues, 0, s_doubleValues.Length); + WhileLessThanDoubleLoad(s_doubleValues, 0, s_doubleValues.Length); + PredicateCastDoubleLoop(s_doubleValues, s_doubleValues, s_doubleValues.Length); } } @@ -179,4 +193,172 @@ static Vector TransposeEvenAndMask(Vector v, Vector a, Vect Sve.And(Sve.CompareGreaterThan(a, b), Sve.CompareEqual(a, b)), Sve.CompareLessThan(a, b))); } + + [MethodImpl(MethodImplOptions.NoInlining)] + static unsafe Vector PredicateCastFloatLoad(float[] values, int index, int length) + { + //ARM64-FULL-LINE: whilelt {{p[0-9]+}}.s, {{w[0-9]+}}, {{w[0-9]+}} + //ARM64-NOT: mov {{z[0-9]+}}.s, {{p[0-9]+}}/z, #1 + //ARM64-NOT: cmpne {{p[0-9]+}}.s, {{p[0-9]+}}/z, {{z[0-9]+}}.s, #0 + //ARM64-FULL-LINE: ld1w { {{z[0-9]+}}.s }, {{p[0-9]+}}/z, [{{x[0-9]+}}] + fixed (float* ptr = values) + { + Vector mask = Sve.CreateWhileLessThanMaskUInt32(index, length); + return Sve.LoadVector((Vector)mask, ptr + index); + } + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static unsafe Vector PointerCastFloatLoad(float[] values, int index, int length) + { + //ARM64-FULL-LINE: whilelt {{p[0-9]+}}.s, {{w[0-9]+}}, {{w[0-9]+}} + //ARM64-NOT: mov {{z[0-9]+}}.s, {{p[0-9]+}}/z, #1 + //ARM64-NOT: cmpne {{p[0-9]+}}.s, {{p[0-9]+}}/z, {{z[0-9]+}}.s, #0 + //ARM64-FULL-LINE: ld1w { {{z[0-9]+}}.s }, {{p[0-9]+}}/z, [{{x[0-9]+}}] + fixed (float* ptr = values) + { + Vector mask = Sve.CreateWhileLessThanMaskUInt32(index, length); + return (Vector)Sve.LoadVector(mask, (uint*)(ptr + index)); + } + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static unsafe Vector PredicateCastFloatLocalLoad(float[] values, int index, int length) + { + //ARM64-FULL-LINE: whilelt {{p[0-9]+}}.s, {{w[0-9]+}}, {{w[0-9]+}} + //ARM64-NOT: mov {{z[0-9]+}}.s, {{p[0-9]+}}/z, #1 + //ARM64-NOT: cmpne {{p[0-9]+}}.s, {{p[0-9]+}}/z, {{z[0-9]+}}.s, #0 + //ARM64-FULL-LINE: ld1w { {{z[0-9]+}}.s }, {{p[0-9]+}}/z, [{{x[0-9]+}}] + fixed (float* ptr = values) + { + Vector uintMask = Sve.CreateWhileLessThanMaskUInt32(index, length); + Vector floatMask = (Vector)uintMask; + return Sve.LoadVector(floatMask, ptr + index); + } + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static unsafe Vector WhileLessThanSingleFloatLoad(float[] values, int index, int length) + { + //ARM64-FULL-LINE: whilelt {{p[0-9]+}}.s, {{w[0-9]+}}, {{w[0-9]+}} + //ARM64-NOT: mov {{z[0-9]+}}.s, {{p[0-9]+}}/z, #1 + //ARM64-NOT: cmpne {{p[0-9]+}}.s, {{p[0-9]+}}/z, {{z[0-9]+}}.s, #0 + //ARM64-FULL-LINE: ld1w { {{z[0-9]+}}.s }, {{p[0-9]+}}/z, [{{x[0-9]+}}] + fixed (float* ptr = values) + { + Vector mask = Sve.CreateWhileLessThanMaskSingle(index, length); + return Sve.LoadVector(mask, ptr + index); + } + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static unsafe void PredicateCastFloatLoop(float[] input, float[] output, int length) + { + //ARM64-FULL-LINE: whilelt {{p[0-9]+}}.s, {{w[0-9]+}}, {{w[0-9]+}} + //ARM64-NOT: mov {{z[0-9]+}}.s, {{p[0-9]+}}/z, #1 + //ARM64-NOT: cmpne {{p[0-9]+}}.s, {{p[0-9]+}}/z, {{z[0-9]+}}.s, #0 + //ARM64-FULL-LINE: ld1w { {{z[0-9]+}}.s }, {{p[0-9]+}}/z, [{{x[0-9]+}}] + //ARM64-NOT: mov {{z[0-9]+}}.s, {{p[0-9]+}}/z, #1 + //ARM64-NOT: cmpne {{p[0-9]+}}.s, {{p[0-9]+}}/z, {{z[0-9]+}}.s, #0 + //ARM64-FULL-LINE: st1w { {{z[0-9]+}}.s }, {{p[0-9]+}}, [{{x[0-9]+}}] + fixed (float* inputPtr = input, outputPtr = output) + { + int i = 0; + int count = (int)Sve.Count32BitElements(); + + while (i < length) + { + Vector loopMask = Sve.CreateWhileLessThanMaskUInt32(i, length); + Vector floatMask = (Vector)loopMask; + Vector value = Sve.LoadVector(floatMask, inputPtr + i); + Sve.StoreAndZip(floatMask, outputPtr + i, value); + + i += count; + } + } + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static unsafe Vector PredicateCastDoubleLoad(double[] values, int index, int length) + { + //ARM64-FULL-LINE: whilelt {{p[0-9]+}}.d, {{w[0-9]+}}, {{w[0-9]+}} + //ARM64-NOT: mov {{z[0-9]+}}.d, {{p[0-9]+}}/z, #1 + //ARM64-NOT: cmpne {{p[0-9]+}}.d, {{p[0-9]+}}/z, {{z[0-9]+}}.d, #0 + //ARM64-FULL-LINE: ld1d { {{z[0-9]+}}.d }, {{p[0-9]+}}/z, [{{x[0-9]+}}] + fixed (double* ptr = values) + { + Vector mask = Sve.CreateWhileLessThanMaskUInt64(index, length); + return Sve.LoadVector((Vector)mask, ptr + index); + } + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static unsafe Vector PointerCastDoubleLoad(double[] values, int index, int length) + { + //ARM64-FULL-LINE: whilelt {{p[0-9]+}}.d, {{w[0-9]+}}, {{w[0-9]+}} + //ARM64-NOT: mov {{z[0-9]+}}.d, {{p[0-9]+}}/z, #1 + //ARM64-NOT: cmpne {{p[0-9]+}}.d, {{p[0-9]+}}/z, {{z[0-9]+}}.d, #0 + //ARM64-FULL-LINE: ld1d { {{z[0-9]+}}.d }, {{p[0-9]+}}/z, [{{x[0-9]+}}] + fixed (double* ptr = values) + { + Vector mask = Sve.CreateWhileLessThanMaskUInt64(index, length); + return (Vector)Sve.LoadVector(mask, (ulong*)(ptr + index)); + } + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static unsafe Vector PredicateCastDoubleLocalLoad(double[] values, int index, int length) + { + //ARM64-FULL-LINE: whilelt {{p[0-9]+}}.d, {{w[0-9]+}}, {{w[0-9]+}} + //ARM64-NOT: mov {{z[0-9]+}}.d, {{p[0-9]+}}/z, #1 + //ARM64-NOT: cmpne {{p[0-9]+}}.d, {{p[0-9]+}}/z, {{z[0-9]+}}.d, #0 + //ARM64-FULL-LINE: ld1d { {{z[0-9]+}}.d }, {{p[0-9]+}}/z, [{{x[0-9]+}}] + fixed (double* ptr = values) + { + Vector ulongMask = Sve.CreateWhileLessThanMaskUInt64(index, length); + Vector doubleMask = (Vector)ulongMask; + return Sve.LoadVector(doubleMask, ptr + index); + } + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static unsafe Vector WhileLessThanDoubleLoad(double[] values, int index, int length) + { + //ARM64-FULL-LINE: whilelt {{p[0-9]+}}.d, {{w[0-9]+}}, {{w[0-9]+}} + //ARM64-NOT: mov {{z[0-9]+}}.d, {{p[0-9]+}}/z, #1 + //ARM64-NOT: cmpne {{p[0-9]+}}.d, {{p[0-9]+}}/z, {{z[0-9]+}}.d, #0 + //ARM64-FULL-LINE: ld1d { {{z[0-9]+}}.d }, {{p[0-9]+}}/z, [{{x[0-9]+}}] + fixed (double* ptr = values) + { + Vector mask = Sve.CreateWhileLessThanMaskDouble(index, length); + return Sve.LoadVector(mask, ptr + index); + } + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static unsafe void PredicateCastDoubleLoop(double[] input, double[] output, int length) + { + //ARM64-FULL-LINE: whilelt {{p[0-9]+}}.d, {{w[0-9]+}}, {{w[0-9]+}} + //ARM64-NOT: mov {{z[0-9]+}}.d, {{p[0-9]+}}/z, #1 + //ARM64-NOT: cmpne {{p[0-9]+}}.d, {{p[0-9]+}}/z, {{z[0-9]+}}.d, #0 + //ARM64-FULL-LINE: ld1d { {{z[0-9]+}}.d }, {{p[0-9]+}}/z, [{{x[0-9]+}}] + //ARM64-NOT: mov {{z[0-9]+}}.d, {{p[0-9]+}}/z, #1 + //ARM64-NOT: cmpne {{p[0-9]+}}.d, {{p[0-9]+}}/z, {{z[0-9]+}}.d, #0 + //ARM64-FULL-LINE: st1d { {{z[0-9]+}}.d }, {{p[0-9]+}}, [{{x[0-9]+}}] + fixed (double* inputPtr = input, outputPtr = output) + { + int i = 0; + int count = (int)Sve.Count64BitElements(); + + while (i < length) + { + Vector loopMask = Sve.CreateWhileLessThanMaskUInt64(i, length); + Vector doubleMask = (Vector)loopMask; + Vector value = Sve.LoadVector(doubleMask, inputPtr + i); + Sve.StoreAndZip(doubleMask, outputPtr + i, value); + + i += count; + } + } + } } From 1c415a094d14d5dafd3ed3adb1d61ccf0dca8a1a Mon Sep 17 00:00:00 2001 From: Jonathan Davies Date: Wed, 24 Jun 2026 08:28:17 +0000 Subject: [PATCH 2/2] Check for any cmp instruction in SVE predicate tests Change-Id: I03f182f358444c8eb2f7775a62454c38c881bed4 --- .../JIT/opt/SVE/PredicateInstructions.cs | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/tests/JIT/opt/SVE/PredicateInstructions.cs b/src/tests/JIT/opt/SVE/PredicateInstructions.cs index 5e27e94a475f89..03f82eda1b0bce 100644 --- a/src/tests/JIT/opt/SVE/PredicateInstructions.cs +++ b/src/tests/JIT/opt/SVE/PredicateInstructions.cs @@ -199,7 +199,7 @@ static unsafe Vector PredicateCastFloatLoad(float[] values, int index, in { //ARM64-FULL-LINE: whilelt {{p[0-9]+}}.s, {{w[0-9]+}}, {{w[0-9]+}} //ARM64-NOT: mov {{z[0-9]+}}.s, {{p[0-9]+}}/z, #1 - //ARM64-NOT: cmpne {{p[0-9]+}}.s, {{p[0-9]+}}/z, {{z[0-9]+}}.s, #0 + //ARM64-NOT: {{^ *}}cmp //ARM64-FULL-LINE: ld1w { {{z[0-9]+}}.s }, {{p[0-9]+}}/z, [{{x[0-9]+}}] fixed (float* ptr = values) { @@ -213,7 +213,7 @@ static unsafe Vector PointerCastFloatLoad(float[] values, int index, int { //ARM64-FULL-LINE: whilelt {{p[0-9]+}}.s, {{w[0-9]+}}, {{w[0-9]+}} //ARM64-NOT: mov {{z[0-9]+}}.s, {{p[0-9]+}}/z, #1 - //ARM64-NOT: cmpne {{p[0-9]+}}.s, {{p[0-9]+}}/z, {{z[0-9]+}}.s, #0 + //ARM64-NOT: {{^ *}}cmp //ARM64-FULL-LINE: ld1w { {{z[0-9]+}}.s }, {{p[0-9]+}}/z, [{{x[0-9]+}}] fixed (float* ptr = values) { @@ -227,7 +227,7 @@ static unsafe Vector PredicateCastFloatLocalLoad(float[] values, int inde { //ARM64-FULL-LINE: whilelt {{p[0-9]+}}.s, {{w[0-9]+}}, {{w[0-9]+}} //ARM64-NOT: mov {{z[0-9]+}}.s, {{p[0-9]+}}/z, #1 - //ARM64-NOT: cmpne {{p[0-9]+}}.s, {{p[0-9]+}}/z, {{z[0-9]+}}.s, #0 + //ARM64-NOT: {{^ *}}cmp //ARM64-FULL-LINE: ld1w { {{z[0-9]+}}.s }, {{p[0-9]+}}/z, [{{x[0-9]+}}] fixed (float* ptr = values) { @@ -242,7 +242,7 @@ static unsafe Vector WhileLessThanSingleFloatLoad(float[] values, int ind { //ARM64-FULL-LINE: whilelt {{p[0-9]+}}.s, {{w[0-9]+}}, {{w[0-9]+}} //ARM64-NOT: mov {{z[0-9]+}}.s, {{p[0-9]+}}/z, #1 - //ARM64-NOT: cmpne {{p[0-9]+}}.s, {{p[0-9]+}}/z, {{z[0-9]+}}.s, #0 + //ARM64-NOT: {{^ *}}cmp //ARM64-FULL-LINE: ld1w { {{z[0-9]+}}.s }, {{p[0-9]+}}/z, [{{x[0-9]+}}] fixed (float* ptr = values) { @@ -256,10 +256,10 @@ static unsafe void PredicateCastFloatLoop(float[] input, float[] output, int len { //ARM64-FULL-LINE: whilelt {{p[0-9]+}}.s, {{w[0-9]+}}, {{w[0-9]+}} //ARM64-NOT: mov {{z[0-9]+}}.s, {{p[0-9]+}}/z, #1 - //ARM64-NOT: cmpne {{p[0-9]+}}.s, {{p[0-9]+}}/z, {{z[0-9]+}}.s, #0 + //ARM64-NOT: {{^ *}}cmp //ARM64-FULL-LINE: ld1w { {{z[0-9]+}}.s }, {{p[0-9]+}}/z, [{{x[0-9]+}}] //ARM64-NOT: mov {{z[0-9]+}}.s, {{p[0-9]+}}/z, #1 - //ARM64-NOT: cmpne {{p[0-9]+}}.s, {{p[0-9]+}}/z, {{z[0-9]+}}.s, #0 + //ARM64-NOT: {{^ *}}cmp //ARM64-FULL-LINE: st1w { {{z[0-9]+}}.s }, {{p[0-9]+}}, [{{x[0-9]+}}] fixed (float* inputPtr = input, outputPtr = output) { @@ -283,7 +283,7 @@ static unsafe Vector PredicateCastDoubleLoad(double[] values, int index, { //ARM64-FULL-LINE: whilelt {{p[0-9]+}}.d, {{w[0-9]+}}, {{w[0-9]+}} //ARM64-NOT: mov {{z[0-9]+}}.d, {{p[0-9]+}}/z, #1 - //ARM64-NOT: cmpne {{p[0-9]+}}.d, {{p[0-9]+}}/z, {{z[0-9]+}}.d, #0 + //ARM64-NOT: {{^ *}}cmp //ARM64-FULL-LINE: ld1d { {{z[0-9]+}}.d }, {{p[0-9]+}}/z, [{{x[0-9]+}}] fixed (double* ptr = values) { @@ -297,7 +297,7 @@ static unsafe Vector PointerCastDoubleLoad(double[] values, int index, i { //ARM64-FULL-LINE: whilelt {{p[0-9]+}}.d, {{w[0-9]+}}, {{w[0-9]+}} //ARM64-NOT: mov {{z[0-9]+}}.d, {{p[0-9]+}}/z, #1 - //ARM64-NOT: cmpne {{p[0-9]+}}.d, {{p[0-9]+}}/z, {{z[0-9]+}}.d, #0 + //ARM64-NOT: {{^ *}}cmp //ARM64-FULL-LINE: ld1d { {{z[0-9]+}}.d }, {{p[0-9]+}}/z, [{{x[0-9]+}}] fixed (double* ptr = values) { @@ -311,7 +311,7 @@ static unsafe Vector PredicateCastDoubleLocalLoad(double[] values, int i { //ARM64-FULL-LINE: whilelt {{p[0-9]+}}.d, {{w[0-9]+}}, {{w[0-9]+}} //ARM64-NOT: mov {{z[0-9]+}}.d, {{p[0-9]+}}/z, #1 - //ARM64-NOT: cmpne {{p[0-9]+}}.d, {{p[0-9]+}}/z, {{z[0-9]+}}.d, #0 + //ARM64-NOT: {{^ *}}cmp //ARM64-FULL-LINE: ld1d { {{z[0-9]+}}.d }, {{p[0-9]+}}/z, [{{x[0-9]+}}] fixed (double* ptr = values) { @@ -326,7 +326,7 @@ static unsafe Vector WhileLessThanDoubleLoad(double[] values, int index, { //ARM64-FULL-LINE: whilelt {{p[0-9]+}}.d, {{w[0-9]+}}, {{w[0-9]+}} //ARM64-NOT: mov {{z[0-9]+}}.d, {{p[0-9]+}}/z, #1 - //ARM64-NOT: cmpne {{p[0-9]+}}.d, {{p[0-9]+}}/z, {{z[0-9]+}}.d, #0 + //ARM64-NOT: {{^ *}}cmp //ARM64-FULL-LINE: ld1d { {{z[0-9]+}}.d }, {{p[0-9]+}}/z, [{{x[0-9]+}}] fixed (double* ptr = values) { @@ -340,10 +340,10 @@ static unsafe void PredicateCastDoubleLoop(double[] input, double[] output, int { //ARM64-FULL-LINE: whilelt {{p[0-9]+}}.d, {{w[0-9]+}}, {{w[0-9]+}} //ARM64-NOT: mov {{z[0-9]+}}.d, {{p[0-9]+}}/z, #1 - //ARM64-NOT: cmpne {{p[0-9]+}}.d, {{p[0-9]+}}/z, {{z[0-9]+}}.d, #0 + //ARM64-NOT: {{^ *}}cmp //ARM64-FULL-LINE: ld1d { {{z[0-9]+}}.d }, {{p[0-9]+}}/z, [{{x[0-9]+}}] //ARM64-NOT: mov {{z[0-9]+}}.d, {{p[0-9]+}}/z, #1 - //ARM64-NOT: cmpne {{p[0-9]+}}.d, {{p[0-9]+}}/z, {{z[0-9]+}}.d, #0 + //ARM64-NOT: {{^ *}}cmp //ARM64-FULL-LINE: st1d { {{z[0-9]+}}.d }, {{p[0-9]+}}, [{{x[0-9]+}}] fixed (double* inputPtr = input, outputPtr = output) {