Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 3 additions & 27 deletions src/implementation/aarch64/neon.rs
Original file line number Diff line number Diff line change
Expand Up @@ -86,33 +86,8 @@ impl SimdU8Value {
}

#[inline]
#[flexpect::e(clippy::too_many_arguments)]
unsafe fn lookup_16(
self,
v0: u8,
v1: u8,
v2: u8,
v3: u8,
v4: u8,
v5: u8,
v6: u8,
v7: u8,
v8: u8,
v9: u8,
v10: u8,
v11: u8,
v12: u8,
v13: u8,
v14: u8,
v15: u8,
) -> Self {
Self::from(vqtbl1q_u8(
Self::repeat_16(
v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
)
.0,
self.0,
))
unsafe fn lookup_16(self, tbl: Self) -> Self {
Self::from(vqtbl1q_u8(tbl.0, self.0))
}

#[inline]
Expand Down Expand Up @@ -210,3 +185,4 @@ const PREFETCH: bool = false;
use crate::implementation::helpers::TempSimdChunkA16 as TempSimdChunk;
simd_input_128_bit!();
algorithm_simd!();
algorithm_simd_default_special_case_fns!();
102 changes: 66 additions & 36 deletions src/implementation/algorithm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,18 @@ macro_rules! algorithm_simd {

$(#[$feat])*
#[inline]
unsafe fn check_special_cases(input: SimdU8Value, prev1: SimdU8Value) -> SimdU8Value {
unsafe fn has_error(&self) -> bool {
self.error.any_bit_set()
}

/// Lookup tables used by `check_special_cases`, returns
/// `(byte_1_high, byte_1_low, byte_2_high)`
///
/// In optimized builds the returned values compile to constants, which are
/// directly loaded into SIMD registers.
$(#[$feat])*
#[inline]
unsafe fn special_case_tables() -> (SimdU8Value, SimdU8Value, SimdU8Value) {
const TOO_SHORT: u8 = 1 << 0;
const TOO_LONG: u8 = 1 << 1;
const OVERLONG_3: u8 = 1 << 2;
Expand All @@ -76,7 +87,7 @@ macro_rules! algorithm_simd {
const OVERLONG_4: u8 = 1 << 6;
const CARRY: u8 = TOO_SHORT | TOO_LONG | TWO_CONTS;

let byte_1_high = prev1.shr4().lookup_16(
let byte_1_high = SimdU8Value::repeat_16(
TOO_LONG,
TOO_LONG,
TOO_LONG,
Expand All @@ -95,7 +106,7 @@ macro_rules! algorithm_simd {
TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4,
);

let byte_1_low = prev1.and(SimdU8Value::splat(0x0F)).lookup_16(
let byte_1_low = SimdU8Value::repeat_16(
CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
CARRY | OVERLONG_2,
CARRY,
Expand All @@ -114,7 +125,7 @@ macro_rules! algorithm_simd {
CARRY | TOO_LARGE | TOO_LARGE_1000,
);

let byte_2_high = input.shr4().lookup_16(
let byte_2_high = SimdU8Value::repeat_16(
TOO_SHORT,
TOO_SHORT,
TOO_SHORT,
Expand All @@ -133,45 +144,15 @@ macro_rules! algorithm_simd {
TOO_SHORT,
);

byte_1_high.and(byte_1_low).and(byte_2_high)
}

$(#[$feat])*
#[inline]
unsafe fn must_be_2_3_continuation(prev2: SimdU8Value, prev3: SimdU8Value) -> SimdU8Value {
let is_third_byte = prev2.saturating_sub(SimdU8Value::splat(0xe0 - 0x80));
let is_fourth_byte = prev3.saturating_sub(SimdU8Value::splat(0xf0 - 0x80));
is_third_byte.or(is_fourth_byte)
}

$(#[$feat])*
#[inline]
unsafe fn check_multibyte_lengths(
input: SimdU8Value,
prev: SimdU8Value,
special_cases: SimdU8Value,
) -> SimdU8Value {
let prev2 = input.prev2(prev);
let prev3 = input.prev3(prev);
let must23 = Self::must_be_2_3_continuation(prev2, prev3);
let must23_80 = must23.and(SimdU8Value::splat(0x80));
must23_80.xor(special_cases)
}

$(#[$feat])*
#[inline]
unsafe fn has_error(&self) -> bool {
self.error.any_bit_set()
(byte_1_high, byte_1_low, byte_2_high)
}

$(#[$feat])*
#[inline]
unsafe fn check_bytes(&mut self, input: SimdU8Value) {
let prev1 = input.prev1(self.prev);
let sc = Self::check_special_cases(input, prev1);
self.error = self
.error
.or(Self::check_multibyte_lengths(input, self.prev, sc));
self.error = Self::check_multibyte_lengths(input, self.prev, sc, self.error);
self.prev = input;
}

Expand Down Expand Up @@ -511,6 +492,55 @@ macro_rules! algorithm_simd {
};
}

/// Generates the default `check_special_cases`, `must_be_2_3_continuation` and
/// `check_multibyte_lengths` functions on `Utf8CheckAlgorithm<SimdU8Value>`.
///
/// Every architecture except AVX-512 invokes this macro. AVX-512 provides its own
/// micro-optimized versions.
///
/// Any attributes passed to the macro are applied to each generated function.
macro_rules! algorithm_simd_default_special_case_fns {
    ($(#[$feat:meta])*) => {
        impl Utf8CheckAlgorithm<SimdU8Value> {
            // Performs three 16-entry table lookups (high nibble of `prev1`, low nibble of
            // `prev1`, high nibble of `input`) using the tables from `special_case_tables`
            // and ANDs the three results together.
            $(#[$feat])*
            #[inline]
            unsafe fn check_special_cases(input: SimdU8Value, prev1: SimdU8Value) -> SimdU8Value {
                let (tbl_1_high, tbl_1_low, tbl_2_high) = Self::special_case_tables();

                prev1
                    .shr4()
                    .lookup_16(tbl_1_high)
                    .and(prev1.and(SimdU8Value::splat(0x0F)).lookup_16(tbl_1_low))
                    .and(input.shr4().lookup_16(tbl_2_high))
            }

            // ORs `prev2 - 0x60` with `prev3 - 0x70` (saturating subtraction). The only
            // caller in this macro keeps just bit 0x80 of the result.
            $(#[$feat])*
            #[inline]
            unsafe fn must_be_2_3_continuation(prev2: SimdU8Value, prev3: SimdU8Value) -> SimdU8Value {
                let third_byte_offset = SimdU8Value::splat(0xe0 - 0x80);
                let fourth_byte_offset = SimdU8Value::splat(0xf0 - 0x80);
                prev2
                    .saturating_sub(third_byte_offset)
                    .or(prev3.saturating_sub(fourth_byte_offset))
            }

            // Slightly different from the original algorithm: the accumulated `error` is
            // passed in and or-ed with the result here, which allows the AVX-512 ternlog
            // optimization.
            $(#[$feat])*
            #[inline]
            unsafe fn check_multibyte_lengths(
                input: SimdU8Value,
                prev: SimdU8Value,
                special_cases: SimdU8Value,
                error: SimdU8Value,
            ) -> SimdU8Value {
                let expected_continuations =
                    Self::must_be_2_3_continuation(input.prev2(prev), input.prev3(prev))
                        .and(SimdU8Value::splat(0x80));
                error.or(expected_continuations.xor(special_cases))
            }
        }
    };
}

macro_rules! simd_input_128_bit {
($(#[$feat:meta])*) => {
#[repr(C)]
Expand Down
28 changes: 3 additions & 25 deletions src/implementation/armv7/neon.rs
Original file line number Diff line number Diff line change
Expand Up @@ -122,31 +122,8 @@ impl SimdU8Value {

#[inline]
#[target_feature(enable = "neon")]
#[flexpect::e(clippy::too_many_arguments)]
unsafe fn lookup_16(
self,
v0: u8,
v1: u8,
v2: u8,
v3: u8,
v4: u8,
v5: u8,
v6: u8,
v7: u8,
v8: u8,
v9: u8,
v10: u8,
v11: u8,
v12: u8,
v13: u8,
v14: u8,
v15: u8,
) -> Self {
let rep = Self::repeat_16(
v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
)
.0;
Self(vqtbl1q_u8(rep, self.0))
unsafe fn lookup_16(self, tbl: Self) -> Self {
Self(vqtbl1q_u8(tbl.0, self.0))
}

#[inline]
Expand Down Expand Up @@ -244,3 +221,4 @@ const PREFETCH: bool = false;
use crate::implementation::helpers::TempSimdChunkA16 as TempSimdChunk;
simd_input_128_bit!(#[target_feature(enable = "neon")]);
algorithm_simd!(#[target_feature(enable = "neon")]);
algorithm_simd_default_special_case_fns!(#[target_feature(enable = "neon")]);
26 changes: 3 additions & 23 deletions src/implementation/portable/simd128.rs
Original file line number Diff line number Diff line change
Expand Up @@ -80,32 +80,11 @@ impl SimdU8Value {
}

#[inline]
fn lookup_16(
self,
v0: u8,
v1: u8,
v2: u8,
v3: u8,
v4: u8,
v5: u8,
v6: u8,
v7: u8,
v8: u8,
v9: u8,
v10: u8,
v11: u8,
v12: u8,
v13: u8,
v14: u8,
v15: u8,
) -> Self {
fn lookup_16(self, tbl: Self) -> Self {
// 'self' must only contain the lower 4 bits of each byte. Unlike with the avx
// instruction, anything else will lead to bad results here.
let idx: u8x16 = self.0;
let src: u8x16 = Self::repeat_16(
v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
)
.0;
let src: u8x16 = tbl.0;
let res = src.swizzle_dyn(idx);
Self::from(res)
}
Expand Down Expand Up @@ -221,3 +200,4 @@ const PREFETCH: bool = false;
use crate::implementation::helpers::TempSimdChunkA16 as TempSimdChunk;
simd_input_128_bit!();
algorithm_simd!();
algorithm_simd_default_special_case_fns!();
27 changes: 3 additions & 24 deletions src/implementation/portable/simd256.rs
Original file line number Diff line number Diff line change
Expand Up @@ -82,33 +82,11 @@ impl SimdU8Value {
}

#[inline]
fn lookup_16(
self,
v0: u8,
v1: u8,
v2: u8,
v3: u8,
v4: u8,
v5: u8,
v6: u8,
v7: u8,
v8: u8,
v9: u8,
v10: u8,
v11: u8,
v12: u8,
v13: u8,
v14: u8,
v15: u8,
) -> Self {
fn lookup_16(self, tbl: Self) -> Self {
// 'self' must only contain the lower 4 bits of each byte. Unlike with the avx
// instruction, anything else will lead to bad results here.
let idx: u8x32 = self.0.cast();
let src: u8x32 = Self::repeat_16(
v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
)
.0
.cast();
let src: u8x32 = tbl.0.cast();
let res = src.swizzle_dyn(idx);
Self::from(res.cast())
}
Expand Down Expand Up @@ -214,3 +192,4 @@ const PREFETCH: bool = false;
use crate::implementation::helpers::TempSimdChunkA32 as TempSimdChunk;
simd_input_256_bit!();
algorithm_simd!();
algorithm_simd_default_special_case_fns!();
30 changes: 3 additions & 27 deletions src/implementation/wasm32/simd128.rs
Original file line number Diff line number Diff line change
Expand Up @@ -91,33 +91,8 @@ impl SimdU8Value {
}

#[inline]
#[flexpect::e(clippy::too_many_arguments)]
unsafe fn lookup_16(
self,
v0: u8,
v1: u8,
v2: u8,
v3: u8,
v4: u8,
v5: u8,
v6: u8,
v7: u8,
v8: u8,
v9: u8,
v10: u8,
v11: u8,
v12: u8,
v13: u8,
v14: u8,
v15: u8,
) -> Self {
Self::from(u8x16_swizzle(
Self::repeat_16(
v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
)
.0,
self.0,
))
unsafe fn lookup_16(self, tbl: Self) -> Self {
Self::from(u8x16_swizzle(tbl.0, self.0))
}

#[inline]
Expand Down Expand Up @@ -255,3 +230,4 @@ const PREFETCH: bool = false;
use crate::implementation::helpers::TempSimdChunkA16 as TempSimdChunk;
simd_input_128_bit!(#[target_feature(enable = "simd128")]);
algorithm_simd!(#[target_feature(enable = "simd128")]);
algorithm_simd_default_special_case_fns!(#[target_feature(enable = "simd128")]);
30 changes: 3 additions & 27 deletions src/implementation/x86/avx2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -102,35 +102,10 @@ impl SimdU8Value {
Self::from(_mm256_loadu_si256(ptr.cast::<__m256i>()))
}

#[flexpect::e(clippy::too_many_arguments)]
#[target_feature(enable = "avx2")]
#[inline]
unsafe fn lookup_16(
self,
v0: u8,
v1: u8,
v2: u8,
v3: u8,
v4: u8,
v5: u8,
v6: u8,
v7: u8,
v8: u8,
v9: u8,
v10: u8,
v11: u8,
v12: u8,
v13: u8,
v14: u8,
v15: u8,
) -> Self {
Self::from(_mm256_shuffle_epi8(
Self::repeat_16(
v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
)
.0,
self.0,
))
unsafe fn lookup_16(self, tbl: Self) -> Self {
Self::from(_mm256_shuffle_epi8(tbl.0, self.0))
}

#[flexpect::e(clippy::cast_possible_wrap)]
Expand Down Expand Up @@ -240,3 +215,4 @@ const PREFETCH: bool = true;
use crate::implementation::helpers::TempSimdChunkA32 as TempSimdChunk;
simd_input_256_bit!(#[target_feature(enable = "avx2")]);
algorithm_simd!(#[target_feature(enable = "avx2")]);
algorithm_simd_default_special_case_fns!(#[target_feature(enable = "avx2")]);
Loading
Loading