From c1605f9faab99ee12d10277422f5186f4b36ac66 Mon Sep 17 00:00:00 2001 From: Brady Date: Sun, 14 Jun 2026 03:23:01 -0500 Subject: [PATCH 01/37] Initial Neon implementation using sse2neon --- CMakeLists.txt | 16 ++++- include/libhat/scanner.hpp | 1 + include/libhat/system.hpp | 5 +- src/Scanner.cpp | 7 +- src/arch/arm/Neon.cpp | 136 +++++++++++++++++++++++++++++++++++++ src/arch/arm/System.cpp | 8 ++- test/tests/Scanner.cpp | 8 +++ 7 files changed, 177 insertions(+), 4 deletions(-) create mode 100644 src/arch/arm/Neon.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index b667fe5..ca229c4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -68,11 +68,25 @@ set(LIBHAT_SRC src/arch/x86/AVX512.cpp src/arch/x86/System.cpp - src/arch/arm/System.cpp) + src/arch/arm/Neon.cpp + src/arch/arm/System.cpp +) add_library(libhat STATIC ${LIBHAT_SRC}) add_library(libhat::libhat ALIAS libhat) +include(FetchContent) +FetchContent_Declare( + sse2neon + GIT_REPOSITORY https://github.com/DLTcollab/sse2neon + GIT_TAG 92f6de174717aef09033ad21568d5bb9e5470404 # v1.9.1 +) +FetchContent_MakeAvailable(sse2neon) +if (MSVC) + set_source_files_properties(src/arch/arm/Neon.cpp PROPERTIES COMPILE_FLAGS "/Zc:preprocessor") +endif() +target_include_directories(libhat PRIVATE ${sse2neon_SOURCE_DIR}) + if(UNIX) set_target_properties(libhat PROPERTIES POSITION_INDEPENDENT_CODE ON) endif() diff --git a/include/libhat/scanner.hpp b/include/libhat/scanner.hpp index 1a53668..3d62343 100644 --- a/include/libhat/scanner.hpp +++ b/include/libhat/scanner.hpp @@ -149,6 +149,7 @@ namespace hat::detail { SSE, // x86/x64 SSE 4.1 AVX2, // x86/x64 AVX2 AVX512, // x64 AVX512 + Neon, // ARMv7+ Neon }; class scan_context { diff --git a/include/libhat/system.hpp b/include/libhat/system.hpp index 80c14e8..f13a6cc 100644 --- a/include/libhat/system.hpp +++ b/include/libhat/system.hpp @@ -52,8 +52,11 @@ LIBHAT_EXPORT namespace hat { LIBHAT_EXPORT namespace hat { struct system_info_arm : hat::system_info { + struct { + bool neon; + } extensions{}; private: - system_info_arm() = default; + system_info_arm(); friend const system_info_arm& get_system(); static const system_info_arm instance; }; diff --git a/src/Scanner.cpp b/src/Scanner.cpp index fb3e112..04917ba 100644 --- a/src/Scanner.cpp +++ b/src/Scanner.cpp @@ -66,8 +66,8 @@ namespace hat::detail { template<> scan_function_t resolve_scanner(scan_context& context) { -#if defined(LIBHAT_X86) || defined(LIBHAT_X86_64) const auto& ext = get_system().extensions; +#if defined(LIBHAT_X86) || defined(LIBHAT_X86_64) if (ext.bmi) { #if defined(LIBHAT_X86_64) && !defined(LIBHAT_DISABLE_AVX512) if (ext.avx512f && ext.avx512bw) { @@ -83,6 +83,11 @@ namespace hat::detail { return resolve_scanner(context); } #endif +#endif +#if defined(LIBHAT_ARM) || defined(LIBHAT_AARCH64) + if (ext.neon) { + return resolve_scanner(context); + } #endif // If none of the vectorized implementations are available/supported, then fallback to scanning per-byte return resolve_scanner(context); diff --git a/src/arch/arm/Neon.cpp b/src/arch/arm/Neon.cpp new file mode 100644 index 0000000..0968c69 --- /dev/null +++ b/src/arch/arm/Neon.cpp @@ -0,0 +1,136 @@ +#include + +#if defined(LIBHAT_ARM) || defined(LIBHAT_AARCH64) + +#include + +#include + +#ifdef _MSC_VER + namespace hat::detail { + inline unsigned long bsf(unsigned long num) noexcept { + unsigned long offset; + _BitScanForward(&offset, num); + return offset; + } + } + +#define LIBHAT_BSF32(num) hat::detail::bsf(num) +#else +#define LIBHAT_BSF32(num) __builtin_ctz(num) +#endif + +namespace hat::detail { + + inline void load_signature_128(const signature_view signature, __m128i& bytes, __m128i& mask) { + std::byte byteBuffer[16]{}; // The remaining signature bytes + std::byte maskBuffer[16]{}; // A bitmask for the signature bytes we care about + for (size_t i = 0; i < signature.size(); i++) { + byteBuffer[i] = signature[i].value(); + maskBuffer[i] = signature[i].mask(); + } + bytes = _mm_loadu_si128(reinterpret_cast<__m128i*>(&byteBuffer)); + mask = _mm_loadu_si128(reinterpret_cast<__m128i*>(&maskBuffer)); + } + + template + const_scan_result find_pattern_neon(const std::byte* begin, const std::byte* end, const scan_context& context) { + const auto signature = context.signature; + const auto cmpIndex = cmpeq2 ? *context.pairIndex : context.cmpIndex; + + // 128 bit vector containing first signature byte repeated + const auto firstByte = _mm_set1_epi8(static_cast(*signature[cmpIndex])); + + __m128i secondByte; + if constexpr (cmpeq2) { + secondByte = _mm_set1_epi8(static_cast(*signature[cmpIndex + 1])); + } + + __m128i signatureBytes, signatureMask; + if constexpr (veccmp) { + load_signature_128(signature, signatureBytes, signatureMask); + } + + auto [pre, vec, post] = segment_scan<__m128i, veccmp>(begin, end, signature.size(), cmpIndex); + + if (!pre.empty()) { + const auto result = find_pattern_single(pre.data(), pre.data() + pre.size(), context); + if (result.has_result()) { + return result; + } + } + + for (auto& it : vec) { + const auto cmp = _mm_cmpeq_epi8(firstByte, _mm_load_si128(&it)); + auto mask = static_cast(_mm_movemask_epi8(cmp)); + + if constexpr (cmpeq2) { + const auto cmp2 = _mm_cmpeq_epi8(secondByte, _mm_load_si128(&it)); + auto mask2 = static_cast(_mm_movemask_epi8(cmp2)); + mask &= (mask2 >> 1) | (0b1u << 15); + } + + if constexpr (alignment != scan_alignment::X1) { + mask &= std::rotl(create_alignment_mask(), static_cast(cmpIndex)); + } + + while (mask) { + const auto offset = LIBHAT_BSF32(mask); + const auto i = reinterpret_cast(&it) + offset - cmpIndex; + if constexpr (veccmp) { + const auto data = _mm_loadu_si128(reinterpret_cast(i)); + const auto neqBits = _mm_xor_si128(data, signatureBytes); + const auto match = _mm_testz_si128(neqBits, signatureMask); + if (match) LIBHAT_UNLIKELY { + return i; + } + } else { + const auto match = std::equal(signature.begin(), signature.end(), i); + if (match) LIBHAT_UNLIKELY { + return i; + } + } + mask &= (mask - 1); + } + } + + if (!post.empty()) { + return find_pattern_single(post.data(), post.data() + post.size(), context); + } + return {}; + } + + template<> + scan_function_t resolve_scanner(scan_context& context) { + context.apply_hints({.vectorSize = 16}); + + const auto alignment = context.alignment; + const auto signature = context.signature; + const bool cmpeq2 = context.pairIndex.has_value(); + const bool veccmp = signature.size() <= 16; + + if (alignment == scan_alignment::X1) { + if (cmpeq2 && veccmp) { + return &find_pattern_neon; + } else if (cmpeq2) { + return &find_pattern_neon; + } else if (veccmp) { + return &find_pattern_neon; + } else { + return &find_pattern_neon; + } + } else if (alignment == scan_alignment::X16) { + if (cmpeq2 && veccmp) { + return &find_pattern_neon; + } else if (cmpeq2) { + return &find_pattern_neon; + } else if (veccmp) { + return &find_pattern_neon; + } else { + return &find_pattern_neon; + } + } + LIBHAT_UNREACHABLE(); + } +} +#endif diff --git a/src/arch/arm/System.cpp b/src/arch/arm/System.cpp index ad76b8f..6c263dd 100644 --- a/src/arch/arm/System.cpp +++ b/src/arch/arm/System.cpp @@ -1,9 +1,15 @@ #include -#ifdef LIBHAT_ARM +#if defined(LIBHAT_ARM) || defined(LIBHAT_AARCH64) #include namespace hat { +#ifdef LIBHAT_WINDOWS + system_info_arm::system_info_arm() { + this->extensions.neon = true; + } +#endif + } #endif diff --git a/test/tests/Scanner.cpp b/test/tests/Scanner.cpp index 80f26e1..95c71b9 100644 --- a/test/tests/Scanner.cpp +++ b/test/tests/Scanner.cpp @@ -83,6 +83,14 @@ using FindPatternTestTypes = ::testing::Types< FindPatternParameters, FindPatternParameters, FindPatternParameters, +#endif +#if defined(LIBHAT_ARM) || defined(LIBHAT_AARCH64) + FindPatternParameters, + FindPatternParameters, + FindPatternParameters, + FindPatternParameters, + FindPatternParameters, + FindPatternParameters, #endif FindPatternParameters, FindPatternParameters, From 39f9201235c41f328a40cb17a5911c05314704d6 Mon Sep 17 00:00:00 2001 From: Brady Date: Sun, 14 Jun 2026 03:27:39 -0500 Subject: [PATCH 02/37] Ignore sse2neon warnings --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ca229c4..77fef70 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -85,7 +85,7 @@ FetchContent_MakeAvailable(sse2neon) if (MSVC) set_source_files_properties(src/arch/arm/Neon.cpp PROPERTIES COMPILE_FLAGS "/Zc:preprocessor") endif() -target_include_directories(libhat PRIVATE ${sse2neon_SOURCE_DIR}) +target_include_directories(libhat SYSTEM PRIVATE ${sse2neon_SOURCE_DIR}) if(UNIX) set_target_properties(libhat PROPERTIES POSITION_INDEPENDENT_CODE ON) From 8df8dfcecb3058818fdbf6424866ed2ebd7ac1e0 Mon Sep 17 00:00:00 2001 From: Brady Date: Sun, 14 Jun 2026 03:30:24 -0500 Subject: [PATCH 03/37] Missing Mode name --- test/tests/Scanner.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/test/tests/Scanner.cpp b/test/tests/Scanner.cpp index 95c71b9..483b2d6 100644 --- a/test/tests/Scanner.cpp +++ b/test/tests/Scanner.cpp @@ -114,6 +114,7 @@ class FindPatternTestNameGenerator { else if constexpr (Mode == hat::detail::scan_mode::SSE) return "SSE"; else if constexpr (Mode == hat::detail::scan_mode::AVX2) return "AVX2"; else if constexpr (Mode == hat::detail::scan_mode::AVX512) return "AVX512"; + else if constexpr (Mode == hat::detail::scan_mode::Neon) return "Neon"; else static_assert(sizeof(Mode) == 0); } }; From 17ae035ca3635f7b5e37940c3840856c57501d78 Mon Sep 17 00:00:00 2001 From: Brady Date: Mon, 15 Jun 2026 00:15:11 -0500 Subject: [PATCH 04/37] Implementation throughput comparison benchmark --- .github/workflows/benchmark.yml | 31 ++++++++++++++++++++++ test/CMakeLists.txt | 1 + test/benchmark/CompareImpl.cpp | 47 +++++++++++++++++++++++++++++++++ 3 files changed, 79 insertions(+) create mode 100644 .github/workflows/benchmark.yml create mode 100644 test/benchmark/CompareImpl.cpp diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml new file mode 100644 index 0000000..287a889 --- /dev/null +++ b/.github/workflows/benchmark.yml @@ -0,0 +1,31 @@ +name: Benchmark + +on: + push: + branches: [ "master" ] + pull_request: + branches: [ "master" ] + +jobs: + windows: + runs-on: windows-11-vs2026-arm + steps: + - uses: actions/checkout@v6 + + - name: Locate Visual Studio + run: | + $vsInstall = & "${env:ProgramFiles(x86)}\Microsoft Visual Studio\Installer\vswhere.exe" -latest -property installationPath + echo "VS_INSTALL_DIR=$vsInstall" >> $env:GITHUB_ENV + + - name: Configure + run: cmake -B ${{github.workspace}}/build -DCMAKE_CXX_STANDARD=23 -DLIBHAT_SHARED_C_LIB=ON -DLIBHAT_TESTING_SDE=OFF -DLIBHAT_TESTING_SAMPLE_BIN=OFF -A ARM64 -T v145 + + - name: Build + run: cmake --build ${{github.workspace}}/build -j 4 --config Release + + - name: Test + working-directory: ${{github.workspace}}/build + shell: cmd + run: | + call "${{env.VS_INSTALL_DIR}}\VC\Auxiliary\Build\vcvarsarm64.bat" + ctest --verbose -C Release -R libhat_benchmark_compare_impl diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index fda2f3a..9ef35d7 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -59,6 +59,7 @@ function(register_test NAME SOURCE) endfunction() register_test(libhat_benchmark_compare benchmark/Compare.cpp) +register_test(libhat_benchmark_compare_impl benchmark/CompareImpl.cpp) register_test(libhat_test_scanner tests/Scanner.cpp) register_test(libhat_test_process tests/Process.cpp) diff --git a/test/benchmark/CompareImpl.cpp b/test/benchmark/CompareImpl.cpp new file mode 100644 index 0000000..b49a863 --- /dev/null +++ b/test/benchmark/CompareImpl.cpp @@ -0,0 +1,47 @@ +#include + +#include +#include + +static constexpr std::string_view test_pattern = "01 02 03 04 05 06 07 08 09"; + +static auto gen_random_buffer(const size_t size) { + std::vector buffer(size); + std::default_random_engine generator(123); + std::uniform_int_distribution distribution(0, 0xFFFFFFFFFFFFFFFF); + for (size_t i = 0; i < buffer.size(); i += 8) { + uint64_t value = distribution(generator); + std::memcpy(&buffer[i], &value, sizeof(value)); + } + return buffer; +} + +template +static void BM_Throughput(benchmark::State& state) { + const size_t size = state.range(0); + const auto buf = gen_random_buffer(size); + const auto begin = std::to_address(buf.begin()); + const auto end = std::to_address(buf.end()); + + const auto sig = hat::parse_signature(test_pattern).value(); + const auto ctx = hat::detail::scan_context::create(sig, hat::scan_alignment::X1, hat::scan_hint::none); + for (auto _ : state) { + benchmark::DoNotOptimize(ctx.scan(begin, end)); + } + state.SetBytesProcessed(static_cast(state.iterations() * size)); +} + +static constexpr int64_t rangeStart = 1 << 22; // 4 MiB +static constexpr int64_t rangeLimit = 1 << 28; // 256 MiB + +#define LIBHAT_BENCHMARK(...) BENCHMARK(__VA_ARGS__) \ + ->Threads(1) \ + ->MinWarmUpTime(2) \ + ->MinTime(4) \ + ->Range(rangeStart, rangeLimit) \ + ->UseRealTime(); + +LIBHAT_BENCHMARK(BM_Throughput); +LIBHAT_BENCHMARK(BM_Throughput); + +BENCHMARK_MAIN(); From ee66f4349d1836c94f1a855f14de3f6911bccbb9 Mon Sep 17 00:00:00 2001 From: Brady Date: Mon, 15 Jun 2026 00:26:00 -0500 Subject: [PATCH 05/37] Log processor name --- .github/workflows/benchmark.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 287a889..fd7915a 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -12,6 +12,9 @@ jobs: steps: - uses: actions/checkout@v6 + - name: Check Hardware + run: (Get-CimInstance Win32_Processor).Name + - name: Locate Visual Studio run: | $vsInstall = & "${env:ProgramFiles(x86)}\Microsoft Visual Studio\Installer\vswhere.exe" -latest -property installationPath From 77fbf3ebd00525bbc2f0ced61201ab47b7f3a559 Mon Sep 17 00:00:00 2001 From: Brady Date: Mon, 15 Jun 2026 00:28:31 -0500 Subject: [PATCH 06/37] Only build `libhat_benchmark_compare_impl` --- .github/workflows/benchmark.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index fd7915a..dc9971d 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -24,7 +24,7 @@ jobs: run: cmake -B ${{github.workspace}}/build -DCMAKE_CXX_STANDARD=23 -DLIBHAT_SHARED_C_LIB=ON -DLIBHAT_TESTING_SDE=OFF -DLIBHAT_TESTING_SAMPLE_BIN=OFF -A ARM64 -T v145 - name: Build - run: cmake --build ${{github.workspace}}/build -j 4 --config Release + run: cmake --build ${{github.workspace}}/build -j 4 --config Release --target libhat_benchmark_compare_impl - name: Test working-directory: ${{github.workspace}}/build From 71fd67a7a9f32170b36f5f37c5c8d7239a541298 Mon Sep 17 00:00:00 2001 From: Brady Date: Mon, 15 Jun 2026 00:52:01 -0500 Subject: [PATCH 07/37] Neon substitutions --- src/arch/arm/Neon.cpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/arch/arm/Neon.cpp b/src/arch/arm/Neon.cpp index 0968c69..ac237ff 100644 --- a/src/arch/arm/Neon.cpp +++ b/src/arch/arm/Neon.cpp @@ -22,15 +22,15 @@ namespace hat::detail { - inline void load_signature_128(const signature_view signature, __m128i& bytes, __m128i& mask) { + inline void load_signature_128(const signature_view signature, int8x16_t& bytes, int8x16_t& mask) { std::byte byteBuffer[16]{}; // The remaining signature bytes std::byte maskBuffer[16]{}; // A bitmask for the signature bytes we care about for (size_t i = 0; i < signature.size(); i++) { byteBuffer[i] = signature[i].value(); maskBuffer[i] = signature[i].mask(); } - bytes = _mm_loadu_si128(reinterpret_cast<__m128i*>(&byteBuffer)); - mask = _mm_loadu_si128(reinterpret_cast<__m128i*>(&maskBuffer)); + bytes = vld1q_s8(&byteBuffer); + mask = vld1q_s8(&maskBuffer); } template @@ -39,19 +39,19 @@ namespace hat::detail { const auto cmpIndex = cmpeq2 ? *context.pairIndex : context.cmpIndex; // 128 bit vector containing first signature byte repeated - const auto firstByte = _mm_set1_epi8(static_cast(*signature[cmpIndex])); + const auto firstByte = vdupq_n_s8(static_cast(*signature[cmpIndex])); - __m128i secondByte; + int8x16_t secondByte; if constexpr (cmpeq2) { - secondByte = _mm_set1_epi8(static_cast(*signature[cmpIndex + 1])); + secondByte = vdupq_n_s8(static_cast(*signature[cmpIndex + 1])); } - __m128i signatureBytes, signatureMask; + int8x16_t signatureBytes, signatureMask; if constexpr (veccmp) { load_signature_128(signature, signatureBytes, signatureMask); } - auto [pre, vec, post] = segment_scan<__m128i, veccmp>(begin, end, signature.size(), cmpIndex); + auto [pre, vec, post] = segment_scan(begin, end, signature.size(), cmpIndex); if (!pre.empty()) { const auto result = find_pattern_single(pre.data(), pre.data() + pre.size(), context); @@ -61,11 +61,11 @@ namespace hat::detail { } for (auto& it : vec) { - const auto cmp = _mm_cmpeq_epi8(firstByte, _mm_load_si128(&it)); + const auto cmp = vceqq_s8(firstByte, vld1q_s8(&it)); auto mask = static_cast(_mm_movemask_epi8(cmp)); if constexpr (cmpeq2) { - const auto cmp2 = _mm_cmpeq_epi8(secondByte, _mm_load_si128(&it)); + const auto cmp2 = vceqq_s8(secondByte, vld1q_s8(&it)); auto mask2 = static_cast(_mm_movemask_epi8(cmp2)); mask &= (mask2 >> 1) | (0b1u << 15); } @@ -78,8 +78,8 @@ namespace hat::detail { const auto offset = LIBHAT_BSF32(mask); const auto i = reinterpret_cast(&it) + offset - cmpIndex; if constexpr (veccmp) { - const auto data = _mm_loadu_si128(reinterpret_cast(i)); - const auto neqBits = _mm_xor_si128(data, signatureBytes); + const auto data = vld1q_s8(i); + const auto neqBits = veorq_s32(data, signatureBytes); const auto match = _mm_testz_si128(neqBits, signatureMask); if (match) LIBHAT_UNLIKELY { return i; From d23f6da26bde6c62ea9bfd2bd9ee0a0f5f24bd9a Mon Sep 17 00:00:00 2001 From: Brady Date: Mon, 15 Jun 2026 01:05:30 -0500 Subject: [PATCH 08/37] Architecture guard + use time defaults --- test/benchmark/CompareImpl.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/benchmark/CompareImpl.cpp b/test/benchmark/CompareImpl.cpp index b49a863..7990a3b 100644 --- a/test/benchmark/CompareImpl.cpp +++ b/test/benchmark/CompareImpl.cpp @@ -36,12 +36,12 @@ static constexpr int64_t rangeLimit = 1 << 28; // 256 MiB #define LIBHAT_BENCHMARK(...) BENCHMARK(__VA_ARGS__) \ ->Threads(1) \ - ->MinWarmUpTime(2) \ - ->MinTime(4) \ ->Range(rangeStart, rangeLimit) \ ->UseRealTime(); LIBHAT_BENCHMARK(BM_Throughput); +#if defined(LIBHAT_AARCH64) || defined(LIBHAT_ARM) LIBHAT_BENCHMARK(BM_Throughput); +#endif BENCHMARK_MAIN(); From c002976932029971a93cea804881d288b1a89ca6 Mon Sep 17 00:00:00 2001 From: Brady Date: Mon, 15 Jun 2026 01:15:45 -0500 Subject: [PATCH 09/37] Avoid double movemask --- src/arch/arm/Neon.cpp | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/src/arch/arm/Neon.cpp b/src/arch/arm/Neon.cpp index ac237ff..36186b7 100644 --- a/src/arch/arm/Neon.cpp +++ b/src/arch/arm/Neon.cpp @@ -22,15 +22,15 @@ namespace hat::detail { - inline void load_signature_128(const signature_view signature, int8x16_t& bytes, int8x16_t& mask) { + inline void load_signature_128(const signature_view signature, uint8x16_t& bytes, uint8x16_t& mask) { std::byte byteBuffer[16]{}; // The remaining signature bytes std::byte maskBuffer[16]{}; // A bitmask for the signature bytes we care about for (size_t i = 0; i < signature.size(); i++) { byteBuffer[i] = signature[i].value(); maskBuffer[i] = signature[i].mask(); } - bytes = vld1q_s8(&byteBuffer); - mask = vld1q_s8(&maskBuffer); + bytes = vld1q_u8(&byteBuffer); + mask = vld1q_u8(&maskBuffer); } template @@ -39,19 +39,19 @@ namespace hat::detail { const auto cmpIndex = cmpeq2 ? *context.pairIndex : context.cmpIndex; // 128 bit vector containing first signature byte repeated - const auto firstByte = vdupq_n_s8(static_cast(*signature[cmpIndex])); + const auto firstByte = vdupq_n_u8(static_cast(*signature[cmpIndex])); - int8x16_t secondByte; + uint8x16_t secondByte; if constexpr (cmpeq2) { - secondByte = vdupq_n_s8(static_cast(*signature[cmpIndex + 1])); + secondByte = vdupq_n_u8(static_cast(*signature[cmpIndex + 1])); } - int8x16_t signatureBytes, signatureMask; + uint8x16_t signatureBytes, signatureMask; if constexpr (veccmp) { load_signature_128(signature, signatureBytes, signatureMask); } - auto [pre, vec, post] = segment_scan(begin, end, signature.size(), cmpIndex); + auto [pre, vec, post] = segment_scan(begin, end, signature.size(), cmpIndex); if (!pre.empty()) { const auto result = find_pattern_single(pre.data(), pre.data() + pre.size(), context); @@ -61,15 +61,16 @@ namespace hat::detail { } for (auto& it : vec) { - const auto cmp = vceqq_s8(firstByte, vld1q_s8(&it)); - auto mask = static_cast(_mm_movemask_epi8(cmp)); + auto cmp = vceqq_u8(firstByte, vld1q_s8(&it)); if constexpr (cmpeq2) { - const auto cmp2 = vceqq_s8(secondByte, vld1q_s8(&it)); - auto mask2 = static_cast(_mm_movemask_epi8(cmp2)); - mask &= (mask2 >> 1) | (0b1u << 15); + const auto cmp2 = vceqq_u8(secondByte, vld1q_s8(reinterpret_cast(&it) + 1)); + cmp = vandq_u8(cmp, cmp2); } + if (!std::bit_cast(vshrn_n_u16(cmp, 4))) continue; + + auto mask = static_cast(_mm_movemask_epi8(cmp)); if constexpr (alignment != scan_alignment::X1) { mask &= std::rotl(create_alignment_mask(), static_cast(cmpIndex)); } @@ -78,8 +79,8 @@ namespace hat::detail { const auto offset = LIBHAT_BSF32(mask); const auto i = reinterpret_cast(&it) + offset - cmpIndex; if constexpr (veccmp) { - const auto data = vld1q_s8(i); - const auto neqBits = veorq_s32(data, signatureBytes); + const auto data = vld1q_u8(i); + const auto neqBits = veorq_u8(data, signatureBytes); const auto match = _mm_testz_si128(neqBits, signatureMask); if (match) LIBHAT_UNLIKELY { return i; From aa6d5ccd9609957523695c25f642e5c5bbaac090 Mon Sep 17 00:00:00 2001 From: Brady Date: Mon, 15 Jun 2026 13:49:10 -0500 Subject: [PATCH 10/37] Use 4-bit mask --- src/arch/arm/Neon.cpp | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/src/arch/arm/Neon.cpp b/src/arch/arm/Neon.cpp index 36186b7..23f6143 100644 --- a/src/arch/arm/Neon.cpp +++ b/src/arch/arm/Neon.cpp @@ -8,16 +8,16 @@ #ifdef _MSC_VER namespace hat::detail { - inline unsigned long bsf(unsigned long num) noexcept { + inline unsigned long bsf(unsigned __int64 num) noexcept { unsigned long offset; - _BitScanForward(&offset, num); + _BitScanForward64(&offset, num); return offset; } } -#define LIBHAT_BSF32(num) hat::detail::bsf(num) +#define LIBHAT_BSF64(num) hat::detail::bsf(num) #else -#define LIBHAT_BSF32(num) __builtin_ctz(num) +#define LIBHAT_BSF64(num) __builtin_ctzll(num) #endif namespace hat::detail { @@ -33,6 +33,15 @@ namespace hat::detail { mask = vld1q_u8(&maskBuffer); } + template + LIBHAT_FORCEINLINE consteval uint64_t create_alignment_mask_neon() { + uint64_t mask{}; + for (size_t i = 0; i < 16; i += alignment_stride) { + mask |= (static_cast(0xF) << (i * 4)); + } + return mask; + } + template const_scan_result find_pattern_neon(const std::byte* begin, const std::byte* end, const scan_context& context) { const auto signature = context.signature; @@ -68,15 +77,13 @@ namespace hat::detail { cmp = vandq_u8(cmp, cmp2); } - if (!std::bit_cast(vshrn_n_u16(cmp, 4))) continue; - - auto mask = static_cast(_mm_movemask_epi8(cmp)); + auto mask = std::bit_cast(vshrn_n_u16(cmp, 4)); if constexpr (alignment != scan_alignment::X1) { - mask &= std::rotl(create_alignment_mask(), static_cast(cmpIndex)); + mask &= std::rotl(create_alignment_mask_neon(), static_cast(cmpIndex) * 4); } while (mask) { - const auto offset = LIBHAT_BSF32(mask); + const auto offset = LIBHAT_BSF64(mask) / 4; const auto i = reinterpret_cast(&it) + offset - cmpIndex; if constexpr (veccmp) { const auto data = vld1q_u8(i); @@ -91,7 +98,7 @@ namespace hat::detail { return i; } } - mask &= (mask - 1); + mask &= ~(0xF * (mask & -mask)); } } From c7a5b5e997a2a51e856c406e2b8400f4cba6a8e2 Mon Sep 17 00:00:00 2001 From: Brady Date: Mon, 15 Jun 2026 13:55:21 -0500 Subject: [PATCH 11/37] Fix warning --- src/arch/arm/Neon.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/arch/arm/Neon.cpp b/src/arch/arm/Neon.cpp index 23f6143..efd9ac8 100644 --- a/src/arch/arm/Neon.cpp +++ b/src/arch/arm/Neon.cpp @@ -98,7 +98,7 @@ namespace hat::detail { return i; } } - mask &= ~(0xF * (mask & -mask)); + mask &= ~(0xF * (mask & -static_cast(mask))); } } From 4ff162a87bda07a6c47a3118d0782eef2237f12c Mon Sep 17 00:00:00 2001 From: Brady Date: Mon, 15 Jun 2026 14:08:39 -0500 Subject: [PATCH 12/37] Increase benchmark time --- test/benchmark/CompareImpl.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/benchmark/CompareImpl.cpp b/test/benchmark/CompareImpl.cpp index 7990a3b..a7d1250 100644 --- a/test/benchmark/CompareImpl.cpp +++ b/test/benchmark/CompareImpl.cpp @@ -36,6 +36,8 @@ static constexpr int64_t rangeLimit = 1 << 28; // 256 MiB #define LIBHAT_BENCHMARK(...) BENCHMARK(__VA_ARGS__) \ ->Threads(1) \ + ->MinWarmUpTime(1) \ + ->MinTime(2) \ ->Range(rangeStart, rangeLimit) \ ->UseRealTime(); From 2013e929c5626eb1c04734e0cb6e01964962d1bb Mon Sep 17 00:00:00 2001 From: Brady Date: Mon, 15 Jun 2026 15:53:11 -0500 Subject: [PATCH 13/37] Remove sse2neon --- CMakeLists.txt | 12 ------------ src/arch/arm/Neon.cpp | 6 ++---- 2 files changed, 2 insertions(+), 16 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 77fef70..ad66703 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -75,18 +75,6 @@ set(LIBHAT_SRC add_library(libhat STATIC ${LIBHAT_SRC}) add_library(libhat::libhat ALIAS libhat) -include(FetchContent) -FetchContent_Declare( - sse2neon - GIT_REPOSITORY https://github.com/DLTcollab/sse2neon - GIT_TAG 92f6de174717aef09033ad21568d5bb9e5470404 # v1.9.1 -) -FetchContent_MakeAvailable(sse2neon) -if (MSVC) - set_source_files_properties(src/arch/arm/Neon.cpp PROPERTIES COMPILE_FLAGS "/Zc:preprocessor") -endif() -target_include_directories(libhat SYSTEM PRIVATE ${sse2neon_SOURCE_DIR}) - if(UNIX) set_target_properties(libhat PROPERTIES POSITION_INDEPENDENT_CODE ON) endif() diff --git a/src/arch/arm/Neon.cpp b/src/arch/arm/Neon.cpp index efd9ac8..87d5f3d 100644 --- a/src/arch/arm/Neon.cpp +++ b/src/arch/arm/Neon.cpp @@ -4,8 +4,6 @@ #include -#include - #ifdef _MSC_VER namespace hat::detail { inline unsigned long bsf(unsigned __int64 num) noexcept { @@ -88,8 +86,8 @@ namespace hat::detail { if constexpr (veccmp) { const auto data = vld1q_u8(i); const auto neqBits = veorq_u8(data, signatureBytes); - const auto match = _mm_testz_si128(neqBits, signatureMask); - if (match) LIBHAT_UNLIKELY { + const auto match = vandq_s64(neqBits, signatureMask); + if (!(vgetq_lane_s64(match, 0) | vgetq_lane_s64(match, 1))) LIBHAT_UNLIKELY { return i; } } else { From 2fa461d7421acff8ac5e9d372227489c12e1ba20 Mon Sep 17 00:00:00 2001 From: Brady Date: Mon, 15 Jun 2026 15:57:46 -0500 Subject: [PATCH 14/37] Missing neon header include --- src/arch/arm/Neon.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/arch/arm/Neon.cpp b/src/arch/arm/Neon.cpp index 87d5f3d..fe7549f 100644 --- a/src/arch/arm/Neon.cpp +++ b/src/arch/arm/Neon.cpp @@ -4,6 +4,8 @@ #include +#include + #ifdef _MSC_VER namespace hat::detail { inline unsigned long bsf(unsigned __int64 num) noexcept { From 1e42ef4d2e0d55d68e61a21e271a1749ffeded57 Mon Sep 17 00:00:00 2001 From: Brady Date: Mon, 15 Jun 2026 16:02:52 -0500 Subject: [PATCH 15/37] Improve neon intrinsic type conformance --- src/arch/arm/Neon.cpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/arch/arm/Neon.cpp b/src/arch/arm/Neon.cpp index fe7549f..f772a39 100644 --- a/src/arch/arm/Neon.cpp +++ b/src/arch/arm/Neon.cpp @@ -23,11 +23,11 @@ namespace hat::detail { inline void load_signature_128(const signature_view signature, uint8x16_t& bytes, uint8x16_t& mask) { - std::byte byteBuffer[16]{}; // The remaining signature bytes - std::byte maskBuffer[16]{}; // A bitmask for the signature bytes we care about + uint8_t byteBuffer[16]{}; // The remaining signature bytes + uint8_t maskBuffer[16]{}; // A bitmask for the signature bytes we care about for (size_t i = 0; i < signature.size(); i++) { - byteBuffer[i] = signature[i].value(); - maskBuffer[i] = signature[i].mask(); + byteBuffer[i] = std::to_integer(signature[i].value()); + maskBuffer[i] = std::to_integer(signature[i].mask()); } bytes = vld1q_u8(&byteBuffer); mask = vld1q_u8(&maskBuffer); @@ -48,11 +48,11 @@ namespace hat::detail { const auto cmpIndex = cmpeq2 ? *context.pairIndex : context.cmpIndex; // 128 bit vector containing first signature byte repeated - const auto firstByte = vdupq_n_u8(static_cast(*signature[cmpIndex])); + const auto firstByte = vdupq_n_u8(static_cast(*signature[cmpIndex])); uint8x16_t secondByte; if constexpr (cmpeq2) { - secondByte = vdupq_n_u8(static_cast(*signature[cmpIndex + 1])); + secondByte = vdupq_n_u8(static_cast(*signature[cmpIndex + 1])); } uint8x16_t signatureBytes, signatureMask; @@ -70,10 +70,10 @@ namespace hat::detail { } for (auto& it : vec) { - auto cmp = vceqq_u8(firstByte, vld1q_s8(&it)); + auto cmp = vceqq_u8(firstByte, vld1q_u8(reinterpret_cast(&it))); if constexpr (cmpeq2) { - const auto cmp2 = vceqq_u8(secondByte, vld1q_s8(reinterpret_cast(&it) + 1)); + const auto cmp2 = vceqq_u8(secondByte, vld1q_u8(reinterpret_cast(&it) + 1)); cmp = vandq_u8(cmp, cmp2); } @@ -86,10 +86,10 @@ namespace hat::detail { const auto offset = LIBHAT_BSF64(mask) / 4; const auto i = reinterpret_cast(&it) + offset - cmpIndex; if constexpr (veccmp) { - const auto data = vld1q_u8(i); + const auto data = vld1q_u8(reinterpret_cast(i)); const auto neqBits = veorq_u8(data, signatureBytes); - const auto match = vandq_s64(neqBits, signatureMask); - if (!(vgetq_lane_s64(match, 0) | vgetq_lane_s64(match, 1))) LIBHAT_UNLIKELY { + const auto match = vandq_u8(neqBits, signatureMask); + if (!(vgetq_lane_u64(match, 0) | vgetq_lane_u64(match, 1))) LIBHAT_UNLIKELY { return i; } } else { From 826b640edb083c9cde686b814bba0aaeda41df6f Mon Sep 17 00:00:00 2001 From: Brady Date: Mon, 15 Jun 2026 16:10:10 -0500 Subject: [PATCH 16/37] Hopefully resolve remaining macOS compile errors --- src/arch/arm/Neon.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/arch/arm/Neon.cpp b/src/arch/arm/Neon.cpp index f772a39..5924108 100644 --- a/src/arch/arm/Neon.cpp +++ b/src/arch/arm/Neon.cpp @@ -29,8 +29,8 @@ namespace hat::detail { byteBuffer[i] = std::to_integer(signature[i].value()); maskBuffer[i] = std::to_integer(signature[i].mask()); } - bytes = vld1q_u8(&byteBuffer); - mask = vld1q_u8(&maskBuffer); + bytes = vld1q_u8(static_cast(byteBuffer)); + mask = vld1q_u8(static_cast(maskBuffer)); } template @@ -98,7 +98,7 @@ namespace hat::detail { return i; } } - mask &= ~(0xF * (mask & -static_cast(mask))); + mask &= ~(0xF * (mask & (~mask + 1))); } } From 82a1517420acf733da06afb4fbf55af3cce78a64 Mon Sep 17 00:00:00 2001 From: Brady Date: Mon, 15 Jun 2026 16:25:56 -0500 Subject: [PATCH 17/37] Compile Windows on ARM benchmark using Clang --- .github/workflows/benchmark.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index dc9971d..771d0ea 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -8,6 +8,9 @@ on: jobs: windows: + strategy: + matrix: + toolset: [ v145, ClangCL ] runs-on: windows-11-vs2026-arm steps: - uses: actions/checkout@v6 @@ -21,7 +24,7 @@ jobs: echo "VS_INSTALL_DIR=$vsInstall" >> $env:GITHUB_ENV - name: Configure - run: cmake -B ${{github.workspace}}/build -DCMAKE_CXX_STANDARD=23 -DLIBHAT_SHARED_C_LIB=ON -DLIBHAT_TESTING_SDE=OFF -DLIBHAT_TESTING_SAMPLE_BIN=OFF -A ARM64 -T v145 + run: cmake -B ${{github.workspace}}/build -DCMAKE_CXX_STANDARD=23 -DLIBHAT_SHARED_C_LIB=ON -DLIBHAT_TESTING_SDE=OFF -DLIBHAT_TESTING_SAMPLE_BIN=OFF -A ARM64 -T ${{matrix.toolset}} - name: Build run: cmake --build ${{github.workspace}}/build -j 4 --config Release --target libhat_benchmark_compare_impl From ce4768ebb61ea5f9a11d402dcbcd0e7f14cd3c07 Mon Sep 17 00:00:00 2001 From: Brady Date: Mon, 15 Jun 2026 16:42:56 -0500 Subject: [PATCH 18/37] Test manual bitwise optimization --- src/arch/arm/Neon.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/arch/arm/Neon.cpp b/src/arch/arm/Neon.cpp index 5924108..668385a 100644 --- a/src/arch/arm/Neon.cpp +++ b/src/arch/arm/Neon.cpp @@ -98,7 +98,10 @@ namespace hat::detail { return i; } } - mask &= ~(0xF * (mask & (~mask + 1))); + // thanks msvc? + // mask &= ~(0xF * (mask & (~mask + 1))); + const auto lsb = (mask & static_cast(-static_cast(mask))); + mask &= ~((lsb << 4) - lsb); } } From 16f2eed35f00684db6fc7eacea4908f27c0c7917 Mon Sep 17 00:00:00 2001 From: Brady Date: Mon, 15 Jun 2026 17:44:10 -0500 Subject: [PATCH 19/37] Benchmark ARM Neon against Chromium --- .github/workflows/benchmark.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 771d0ea..c7599d4 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -24,14 +24,14 @@ jobs: echo "VS_INSTALL_DIR=$vsInstall" >> $env:GITHUB_ENV - name: Configure - run: cmake -B ${{github.workspace}}/build -DCMAKE_CXX_STANDARD=23 -DLIBHAT_SHARED_C_LIB=ON -DLIBHAT_TESTING_SDE=OFF -DLIBHAT_TESTING_SAMPLE_BIN=OFF -A ARM64 -T ${{matrix.toolset}} + run: cmake -B ${{github.workspace}}/build -DCMAKE_CXX_STANDARD=23 -DLIBHAT_SHARED_C_LIB=ON -DLIBHAT_TESTING_SDE=OFF -A ARM64 -T ${{matrix.toolset}} - name: Build - run: cmake --build ${{github.workspace}}/build -j 4 --config Release --target libhat_benchmark_compare_impl + run: cmake --build ${{github.workspace}}/build -j 4 --config Release --target libhat_benchmark_compare_impl libhat_benchmark_chromium - name: Test working-directory: ${{github.workspace}}/build shell: cmd run: | call "${{env.VS_INSTALL_DIR}}\VC\Auxiliary\Build\vcvarsarm64.bat" - ctest --verbose -C Release -R libhat_benchmark_compare_impl + ctest --verbose -C Release -R "(libhat_benchmark_compare_impl|libhat_benchmark_chromium)" From af7e14ff7871ff057bd9fbd12d66bb1910cc8363 Mon Sep 17 00:00:00 2001 From: Brady Date: Mon, 15 Jun 2026 17:45:58 -0500 Subject: [PATCH 20/37] Use cache for benchmark workflow --- .github/workflows/benchmark.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index c7599d4..ed4624b 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -15,6 +15,14 @@ jobs: steps: - uses: actions/checkout@v6 + - name: CPM Cache + uses: actions/cache@v5 + with: + path: ${{github.workspace}}/.cpmcache + key: cpm-${{runner.os}}-ARM64-${{hashFiles('test/CMakeLists.txt')}} + restore-keys: | + cpm-${{runner.os}}-ARM64- + - name: Check Hardware run: (Get-CimInstance Win32_Processor).Name From 45a85a2a03d408c626a77895f3b73585375c984b Mon Sep 17 00:00:00 2001 From: Brady Date: Mon, 15 Jun 2026 18:20:15 -0500 Subject: [PATCH 21/37] Perhaps --- src/arch/arm/Neon.cpp | 35 +++++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/src/arch/arm/Neon.cpp b/src/arch/arm/Neon.cpp index 668385a..b4ed8fa 100644 --- a/src/arch/arm/Neon.cpp +++ b/src/arch/arm/Neon.cpp @@ -60,7 +60,7 @@ namespace hat::detail { load_signature_128(signature, signatureBytes, signatureMask); } - auto [pre, vec, post] = segment_scan(begin, end, signature.size(), cmpIndex); + auto [pre, vec, post] = segment_scan(begin, end, signature.size(), cmpIndex); if (!pre.empty()) { const auto result = find_pattern_single(pre.data(), pre.data() + pre.size(), context); @@ -70,16 +70,21 @@ namespace hat::detail { } for (auto& it : vec) { - auto cmp = vceqq_u8(firstByte, vld1q_u8(reinterpret_cast(&it))); + auto data = vld1q_u8_x2(reinterpret_cast(&it)); + auto cmp = vceqq_u8(firstByte, data.val[0]); + auto cmp2 = vceqq_u8(firstByte, data.val[1]); if constexpr (cmpeq2) { - const auto cmp2 = vceqq_u8(secondByte, vld1q_u8(reinterpret_cast(&it) + 1)); - cmp = vandq_u8(cmp, cmp2); + auto data2 = vld1q_u8_x2(reinterpret_cast(&it) + 1); + cmp = vandq_u8(cmp, vceqq_u8(secondByte, data2.val[0])); + cmp2 = vandq_u8(cmp2, vceqq_u8(secondByte, data2.val[1])); } auto mask = std::bit_cast(vshrn_n_u16(cmp, 4)); + auto mask2 = std::bit_cast(vshrn_n_u16(cmp2, 4)); if constexpr (alignment != scan_alignment::X1) { mask &= std::rotl(create_alignment_mask_neon(), static_cast(cmpIndex) * 4); + mask2 &= std::rotl(create_alignment_mask_neon(), static_cast(cmpIndex) * 4); } while (mask) { @@ -103,6 +108,28 @@ namespace hat::detail { const auto lsb = (mask & static_cast(-static_cast(mask))); mask &= ~((lsb << 4) - lsb); } + + while (mask2) { + const auto offset = LIBHAT_BSF64(mask2) / 4 + 16; + const auto i = reinterpret_cast(&it) + offset - cmpIndex; + if constexpr (veccmp) { + const auto data = vld1q_u8(reinterpret_cast(i)); + const auto neqBits = veorq_u8(data, signatureBytes); + const auto match = vandq_u8(neqBits, signatureMask); + if (!(vgetq_lane_u64(match, 0) | vgetq_lane_u64(match, 1))) LIBHAT_UNLIKELY { + return i; + } + } else { + const auto match = std::equal(signature.begin(), signature.end(), i); + if (match) LIBHAT_UNLIKELY { + return i; + } + } + // thanks msvc? + // mask &= ~(0xF * (mask & (~mask + 1))); + const auto lsb = (mask2 & static_cast(-static_cast(mask2))); + mask2 &= ~((lsb << 4) - lsb); + } } if (!post.empty()) { From af24489bf1bcbd7d1d892dc9b87fc219a8841146 Mon Sep 17 00:00:00 2001 From: Brady Date: Wed, 17 Jun 2026 01:46:17 -0500 Subject: [PATCH 22/37] Extract masks using `vget_lane_u64` --- src/arch/arm/Neon.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/arch/arm/Neon.cpp b/src/arch/arm/Neon.cpp index b4ed8fa..6c58f7f 100644 --- a/src/arch/arm/Neon.cpp +++ b/src/arch/arm/Neon.cpp @@ -80,8 +80,8 @@ namespace hat::detail { cmp2 = vandq_u8(cmp2, vceqq_u8(secondByte, data2.val[1])); } - auto mask = std::bit_cast(vshrn_n_u16(cmp, 4)); - auto mask2 = std::bit_cast(vshrn_n_u16(cmp2, 4)); + auto mask = vget_lane_u64(vshrn_n_u16(cmp, 4), 0); + auto mask2 = vget_lane_u64(vshrn_n_u16(cmp2, 4), 0); if constexpr (alignment != scan_alignment::X1) { mask &= std::rotl(create_alignment_mask_neon(), static_cast(cmpIndex) * 4); mask2 &= std::rotl(create_alignment_mask_neon(), static_cast(cmpIndex) * 4); From 36991044e9143def4027f2ade72e87127d60c5cb Mon Sep 17 00:00:00 2001 From: Brady Date: Wed, 17 Jun 2026 04:08:53 -0500 Subject: [PATCH 23/37] Debug mode optimizations for NEON --- src/arch/arm/Neon.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/arch/arm/Neon.cpp b/src/arch/arm/Neon.cpp index 6c58f7f..bda7c6d 100644 --- a/src/arch/arm/Neon.cpp +++ b/src/arch/arm/Neon.cpp @@ -69,13 +69,15 @@ namespace hat::detail { } } - for (auto& it : vec) { - auto data = vld1q_u8_x2(reinterpret_cast(&it)); + const auto vec_begin = std::to_address(vec.begin()); + const auto vec_end = std::to_address(vec.end()); + for (auto it = vec_begin; it != vec_end; it++) { + auto data = vld1q_u8_x2(reinterpret_cast(it)); auto cmp = vceqq_u8(firstByte, data.val[0]); auto cmp2 = vceqq_u8(firstByte, data.val[1]); if constexpr (cmpeq2) { - auto data2 = vld1q_u8_x2(reinterpret_cast(&it) + 1); + auto data2 = vld1q_u8_x2(reinterpret_cast(it) + 1); cmp = vandq_u8(cmp, vceqq_u8(secondByte, data2.val[0])); cmp2 = vandq_u8(cmp2, vceqq_u8(secondByte, data2.val[1])); } @@ -89,7 +91,7 @@ namespace hat::detail { while (mask) { const auto offset = LIBHAT_BSF64(mask) / 4; - const auto i = reinterpret_cast(&it) + offset - cmpIndex; + const auto i = reinterpret_cast(it) + offset - cmpIndex; if constexpr (veccmp) { const auto data = vld1q_u8(reinterpret_cast(i)); const auto neqBits = veorq_u8(data, signatureBytes); @@ -111,7 +113,7 @@ namespace hat::detail { while (mask2) { const auto offset = LIBHAT_BSF64(mask2) / 4 + 16; - const auto i = reinterpret_cast(&it) + offset - cmpIndex; + const auto i = reinterpret_cast(it) + offset - cmpIndex; if constexpr (veccmp) { const auto data = vld1q_u8(reinterpret_cast(i)); const auto neqBits = veorq_u8(data, signatureBytes); From 21696888b9649a4cac365b3ba7f88d7e5c301c3b Mon Sep 17 00:00:00 2001 From: Brady Date: Wed, 17 Jun 2026 16:05:00 -0500 Subject: [PATCH 24/37] Revert unrolling --- src/arch/arm/Neon.cpp | 35 ++++------------------------------- 1 file changed, 4 insertions(+), 31 deletions(-) diff --git a/src/arch/arm/Neon.cpp b/src/arch/arm/Neon.cpp index bda7c6d..9d4c555 100644 --- a/src/arch/arm/Neon.cpp +++ b/src/arch/arm/Neon.cpp @@ -60,7 +60,7 @@ namespace hat::detail { load_signature_128(signature, signatureBytes, signatureMask); } - auto [pre, vec, post] = segment_scan(begin, end, signature.size(), cmpIndex); + auto [pre, vec, post] = segment_scan(begin, end, signature.size(), cmpIndex); if (!pre.empty()) { const auto result = find_pattern_single(pre.data(), pre.data() + pre.size(), context); @@ -72,21 +72,16 @@ namespace hat::detail { const auto vec_begin = std::to_address(vec.begin()); const auto vec_end = std::to_address(vec.end()); for (auto it = vec_begin; it != vec_end; it++) { - auto data = vld1q_u8_x2(reinterpret_cast(it)); - auto cmp = vceqq_u8(firstByte, data.val[0]); - auto cmp2 = vceqq_u8(firstByte, data.val[1]); + auto cmp = vceqq_u8(firstByte, vld1q_u8(reinterpret_cast(it))); if constexpr (cmpeq2) { - auto data2 = vld1q_u8_x2(reinterpret_cast(it) + 1); - cmp = vandq_u8(cmp, vceqq_u8(secondByte, data2.val[0])); - cmp2 = vandq_u8(cmp2, vceqq_u8(secondByte, data2.val[1])); + const auto cmp2 = vceqq_u8(secondByte, vld1q_u8(reinterpret_cast(it) + 1)); + cmp = vandq_u8(cmp, cmp2); } auto mask = vget_lane_u64(vshrn_n_u16(cmp, 4), 0); - auto mask2 = vget_lane_u64(vshrn_n_u16(cmp2, 4), 0); if constexpr (alignment != scan_alignment::X1) { mask &= std::rotl(create_alignment_mask_neon(), static_cast(cmpIndex) * 4); - mask2 &= std::rotl(create_alignment_mask_neon(), static_cast(cmpIndex) * 4); } while (mask) { @@ -110,28 +105,6 @@ namespace hat::detail { const auto lsb = (mask & static_cast(-static_cast(mask))); mask &= ~((lsb << 4) - lsb); } - - while (mask2) { - const auto offset = LIBHAT_BSF64(mask2) / 4 + 16; - const auto i = reinterpret_cast(it) + offset - cmpIndex; - if constexpr (veccmp) { - const auto data = vld1q_u8(reinterpret_cast(i)); - const auto neqBits = veorq_u8(data, signatureBytes); - const auto match = vandq_u8(neqBits, signatureMask); - if (!(vgetq_lane_u64(match, 0) | vgetq_lane_u64(match, 1))) LIBHAT_UNLIKELY { - return i; - } - } else { - const auto match = std::equal(signature.begin(), signature.end(), i); - if (match) LIBHAT_UNLIKELY { - return i; - } - } - // thanks msvc? - // mask &= ~(0xF * (mask & (~mask + 1))); - const auto lsb = (mask2 & static_cast(-static_cast(mask2))); - mask2 &= ~((lsb << 4) - lsb); - } } if (!post.empty()) { From df8bb55de589bae15713e19212ef72b062034f8b Mon Sep 17 00:00:00 2001 From: Brady Date: Wed, 17 Jun 2026 21:22:09 -0500 Subject: [PATCH 25/37] Add macOS to ARM64 testing --- .github/workflows/cmake.yml | 16 ++++++++++++++-- src/arch/arm/System.cpp | 2 +- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index 0b74308..20ea718 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -18,12 +18,24 @@ jobs: steps: - uses: actions/checkout@v6 + - name: CPM Cache + uses: actions/cache@v5 + with: + path: ${{github.workspace}}/.cpmcache + key: cpm-${{runner.os}}-${{hashFiles('test/CMakeLists.txt')}} + restore-keys: | + cpm-${{runner.os}}- + - name: Configure - run: cmake -B ${{github.workspace}}/build -DCMAKE_CXX_STANDARD=${{matrix.cxx_standard}} -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DLIBHAT_TESTING=OFF + run: cmake -B ${{github.workspace}}/build -DCPM_SOURCE_CACHE=${{github.workspace}}/.cpmcache -DCMAKE_CXX_STANDARD=${{matrix.cxx_standard}} -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DLIBHAT_TESTING_SAMPLE_BIN=OFF -DLIBHAT_TESTING_SDE=OFF - name: Build run: cmake --build ${{github.workspace}}/build -j 4 + - name: Test + working-directory: ${{github.workspace}}/build + run: ctest --verbose -C ${{env.BUILD_TYPE}} -R libhat_test_.* + linux: strategy: matrix: @@ -51,7 +63,7 @@ jobs: - name: Configure env: CXX: ${{matrix.compiler.exe}}-${{matrix.compiler.version}} - run: cmake -B ${{github.workspace}}/build -DCPM_SOURCE_CACHE=${{github.workspace}}/.cpmcache -DCMAKE_CXX_STANDARD=${{matrix.cxx_standard}} -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DLIBHAT_TESTING=ON -DLIBHAT_TESTING_SAMPLE_BIN=OFF + run: cmake -B ${{github.workspace}}/build -DCPM_SOURCE_CACHE=${{github.workspace}}/.cpmcache -DCMAKE_CXX_STANDARD=${{matrix.cxx_standard}} -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DLIBHAT_TESTING_SAMPLE_BIN=OFF - name: Build run: cmake --build ${{github.workspace}}/build -j 4 diff --git a/src/arch/arm/System.cpp b/src/arch/arm/System.cpp index 6c263dd..832e6f0 100644 --- a/src/arch/arm/System.cpp +++ b/src/arch/arm/System.cpp @@ -5,7 +5,7 @@ namespace hat { -#ifdef LIBHAT_WINDOWS +#if defined(LIBHAT_WINDOWS) || defined(LIBHAT_MAC) system_info_arm::system_info_arm() { this->extensions.neon = true; } From 36b05c7b6e9c516819b476b56800c221ef3310bd Mon Sep 17 00:00:00 2001 From: Imrglop <69129770+Imrglop@users.noreply.github.com> Date: Sun, 31 May 2026 07:46:09 -0700 Subject: [PATCH 26/37] Partial mach-o Process --- CMakeLists.txt | 2 + src/os/linux/Process.cpp | 8 ---- src/os/mac/Process.cpp | 95 ++++++++++++++++++++++++++++++++++++++++ src/os/unix/Process.cpp | 19 ++++++++ 4 files changed, 116 insertions(+), 8 deletions(-) create mode 100644 src/os/mac/Process.cpp create mode 100644 src/os/unix/Process.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index ad66703..c232b2b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -53,6 +53,8 @@ set(LIBHAT_SRC src/Scanner.cpp src/System.cpp + src/os/mac/Process.cpp + src/os/linux/MemoryProtector.cpp src/os/linux/Process.cpp diff --git a/src/os/linux/Process.cpp b/src/os/linux/Process.cpp index 7fc1eaa..053d7e2 100644 --- a/src/os/linux/Process.cpp +++ b/src/os/linux/Process.cpp @@ -13,14 +13,6 @@ namespace hat::process { - hat::process::module get_process_module() { - const auto module = get_module({}); - if (!module) { - std::abort(); - } - return *module; - } - std::span module::get_module_data() const { size_t max{}; diff --git a/src/os/mac/Process.cpp b/src/os/mac/Process.cpp new file mode 100644 index 0000000..b7718a8 --- /dev/null +++ b/src/os/mac/Process.cpp @@ -0,0 +1,95 @@ +#include +#ifdef LIBHAT_MAC + +#include + +#include +#include +#include + +#include + +#include + +namespace hat::process { + + // no-op on 32 bit binaries + void module::for_each_segment(const std::function, hat::protection)>& callback) const { + const uint32_t imageCount = _dyld_image_count(); + for (uint32_t i = 0; i < imageCount; i++) { + const auto* header = reinterpret_cast(_dyld_get_image_header(i)); + if (header == nullptr) { + continue; + } + if (std::bit_cast(header) != this->address()) { + continue; + } + + const auto slide = static_cast(_dyld_get_image_vmaddr_slide(i)); + const auto* cmd = reinterpret_cast( + reinterpret_cast(header) + sizeof(mach_header_64)); + + for (uint32_t j = 0; j < header->ncmds; j++) { + if (cmd->cmd == LC_SEGMENT_64) { + const auto* seg = reinterpret_cast(cmd); + + // skip __PAGEZERO and any unmapped segment + if (seg->vmsize != 0 && seg->initprot != 0) { + const std::span data{ + reinterpret_cast(seg->vmaddr + slide), + seg->vmsize + }; + + hat::protection prot{}; + if (seg->initprot & VM_PROT_READ) prot |= hat::protection::Read; + if (seg->initprot & VM_PROT_WRITE) prot |= hat::protection::Write; + if (seg->initprot & VM_PROT_EXECUTE) prot |= hat::protection::Execute; + + if (!callback(data, prot)) { + return; + } + } + } + cmd = reinterpret_cast( + reinterpret_cast(cmd) + cmd->cmdsize); + } + return; + } + } + + std::optional get_module(const std::string_view name) { + using Handle = std::unique_ptr; + + std::unique_ptr buffer; + + if (!name.empty()) { + buffer = std::make_unique(name.size() + 1); + std::ranges::copy(name, buffer.get()); + } + + const Handle handle{dlopen(buffer.get(), RTLD_LAZY | RTLD_NOLOAD)}; + if (!handle) { + return {}; + } + + std::optional module{}; + + uint32_t imageCount = _dyld_image_count(); + for (uint32_t i = 0; i < imageCount; i++) { + const auto* header = reinterpret_cast(_dyld_get_image_header(i)); + if (header == nullptr) + continue; + + const Handle h{dlopen(_dyld_get_image_name(i), RTLD_LAZY | RTLD_NOLOAD)}; + if (h == handle) { + module = hat::process::module{std::bit_cast(header)}; + } + } + + return module; + } +} + +#endif diff --git a/src/os/unix/Process.cpp b/src/os/unix/Process.cpp new file mode 100644 index 0000000..e3bd5e5 --- /dev/null +++ b/src/os/unix/Process.cpp @@ -0,0 +1,19 @@ +#include +#ifdef LIBHAT_UNIX + +#include + +#include + +namespace hat::process { + + hat::process::module get_process_module() { + const auto module = get_module({}); + if (!module) { + std::abort(); + } + return *module; + } +} + +#endif From ac4712e943587c99a1e16400c89602b6f8081194 Mon Sep 17 00:00:00 2001 From: Brady Date: Wed, 17 Jun 2026 22:13:10 -0500 Subject: [PATCH 27/37] Missing source file --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index c232b2b..733db1b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -58,6 +58,7 @@ set(LIBHAT_SRC src/os/linux/MemoryProtector.cpp src/os/linux/Process.cpp + src/os/unix/Process.cpp src/os/unix/System.cpp src/os/win32/MemoryProtector.cpp From 7e7c29d771cb012a7ccb739ab551ab21b4e5d8ea Mon Sep 17 00:00:00 2001 From: Brady Date: Wed, 17 Jun 2026 22:23:43 -0500 Subject: [PATCH 28/37] Missing include for `std::abort` --- src/os/unix/Process.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/os/unix/Process.cpp b/src/os/unix/Process.cpp index e3bd5e5..3307161 100644 --- a/src/os/unix/Process.cpp +++ b/src/os/unix/Process.cpp @@ -5,6 +5,8 @@ #include +#include + namespace hat::process { hat::process::module get_process_module() { From afc7c1d7832f95af11d8efecfeb9760d411d8d71 Mon Sep 17 00:00:00 2001 From: Brady Date: Thu, 18 Jun 2026 00:50:53 -0500 Subject: [PATCH 29/37] Possibly fix macOS process APIs --- CMakeLists.txt | 1 - src/os/linux/Process.cpp | 8 ++++++++ src/os/mac/Process.cpp | 35 +++++++++++++++++++++-------------- src/os/unix/Process.cpp | 21 --------------------- 4 files changed, 29 insertions(+), 36 deletions(-) delete mode 100644 src/os/unix/Process.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 733db1b..c232b2b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -58,7 +58,6 @@ set(LIBHAT_SRC src/os/linux/MemoryProtector.cpp src/os/linux/Process.cpp - src/os/unix/Process.cpp src/os/unix/System.cpp src/os/win32/MemoryProtector.cpp diff --git a/src/os/linux/Process.cpp b/src/os/linux/Process.cpp index 053d7e2..7fc1eaa 100644 --- a/src/os/linux/Process.cpp +++ b/src/os/linux/Process.cpp @@ -13,6 +13,14 @@ namespace hat::process { + hat::process::module get_process_module() { + const auto module = get_module({}); + if (!module) { + std::abort(); + } + return *module; + } + std::span module::get_module_data() const { size_t max{}; diff --git a/src/os/mac/Process.cpp b/src/os/mac/Process.cpp index b7718a8..cb931c4 100644 --- a/src/os/mac/Process.cpp +++ b/src/os/mac/Process.cpp @@ -13,6 +13,17 @@ namespace hat::process { + hat::process::module get_process_module() { + const uint32_t count = _dyld_image_count(); + for (uint32_t i = 0; i != count; i++) { + const auto* header = reinterpret_cast(_dyld_get_image_header(i)); + if (header && header->filetype == MH_EXECUTE) { + return hat::process::module{std::bit_cast(header)}; + } + } + std::abort(); + } + // no-op on 32 bit binaries void module::for_each_segment(const std::function, hat::protection)>& callback) const { const uint32_t imageCount = _dyld_image_count(); @@ -58,37 +69,33 @@ namespace hat::process { } std::optional get_module(const std::string_view name) { + if (name.empty()) { + return get_process_module(); + } + using Handle = std::unique_ptr; - std::unique_ptr buffer; - - if (!name.empty()) { - buffer = std::make_unique(name.size() + 1); - std::ranges::copy(name, buffer.get()); - } - - const Handle handle{dlopen(buffer.get(), RTLD_LAZY | RTLD_NOLOAD)}; + const std::string buffer{name}; + const Handle handle{dlopen(buffer.c_str(), RTLD_LAZY | RTLD_NOLOAD)}; if (!handle) { return {}; } - - std::optional module{}; - uint32_t imageCount = _dyld_image_count(); - for (uint32_t i = 0; i < imageCount; i++) { + const uint32_t count = _dyld_image_count(); + for (uint32_t i = 0; i < count; i++) { const auto* header = reinterpret_cast(_dyld_get_image_header(i)); if (header == nullptr) continue; const Handle h{dlopen(_dyld_get_image_name(i), RTLD_LAZY | RTLD_NOLOAD)}; if (h == handle) { - module = hat::process::module{std::bit_cast(header)}; + return hat::process::module{std::bit_cast(header)}; } } - return module; + return {}; } } diff --git a/src/os/unix/Process.cpp b/src/os/unix/Process.cpp deleted file mode 100644 index 3307161..0000000 --- a/src/os/unix/Process.cpp +++ /dev/null @@ -1,21 +0,0 @@ -#include -#ifdef LIBHAT_UNIX - -#include - -#include - -#include - -namespace hat::process { - - hat::process::module get_process_module() { - const auto module = get_module({}); - if (!module) { - std::abort(); - } - return *module; - } -} - -#endif From 3cec4918d2bb8f599ff9dfc5c645ad33af7ca152 Mon Sep 17 00:00:00 2001 From: Brady Date: Thu, 18 Jun 2026 04:18:50 -0500 Subject: [PATCH 30/37] Add Linux to ARM64 testing --- .github/workflows/cmake.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index 20ea718..e3cb6d5 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -39,11 +39,12 @@ jobs: linux: strategy: matrix: + os: [ ubuntu-26.04, ubuntu-26.04-arm ] compiler: - { pkg: g++, exe: g++, version: 14 } - { pkg: clang, exe: clang++, version: 18 } cxx_standard: [ 20, 23 ] - runs-on: ubuntu-24.04 + runs-on: ${{matrix.os}} steps: - uses: actions/checkout@v6 @@ -63,7 +64,7 @@ jobs: - name: Configure env: CXX: ${{matrix.compiler.exe}}-${{matrix.compiler.version}} - run: cmake -B ${{github.workspace}}/build -DCPM_SOURCE_CACHE=${{github.workspace}}/.cpmcache -DCMAKE_CXX_STANDARD=${{matrix.cxx_standard}} -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DLIBHAT_TESTING_SAMPLE_BIN=OFF + run: cmake -B ${{github.workspace}}/build -DCPM_SOURCE_CACHE=${{github.workspace}}/.cpmcache -DCMAKE_CXX_STANDARD=${{matrix.cxx_standard}} -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DLIBHAT_TESTING_SDE=${{contains(matrix.os, 'arm') && 'OFF' || 'ON'}} -DLIBHAT_TESTING_SAMPLE_BIN=OFF - name: Build run: cmake --build ${{github.workspace}}/build -j 4 From 7663ec0c42c2e4b3775c73020ef1a344a3a4b9af Mon Sep 17 00:00:00 2001 From: Brady Date: Thu, 18 Jun 2026 04:24:37 -0500 Subject: [PATCH 31/37] Fix Linux compile --- src/arch/arm/Neon.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/arch/arm/Neon.cpp b/src/arch/arm/Neon.cpp index 9d4c555..752aa1c 100644 --- a/src/arch/arm/Neon.cpp +++ b/src/arch/arm/Neon.cpp @@ -79,7 +79,7 @@ namespace hat::detail { cmp = vandq_u8(cmp, cmp2); } - auto mask = vget_lane_u64(vshrn_n_u16(cmp, 4), 0); + auto mask = vget_lane_u64(vshrn_n_u16(vreinterpretq_u16_u8(cmp), 4), 0); if constexpr (alignment != scan_alignment::X1) { mask &= std::rotl(create_alignment_mask_neon(), static_cast(cmpIndex) * 4); } @@ -90,7 +90,7 @@ namespace hat::detail { if constexpr (veccmp) { const auto data = vld1q_u8(reinterpret_cast(i)); const auto neqBits = veorq_u8(data, signatureBytes); - const auto match = vandq_u8(neqBits, signatureMask); + const auto match = vreinterpretq_u64_u8(vandq_u8(neqBits, signatureMask)); if (!(vgetq_lane_u64(match, 0) | vgetq_lane_u64(match, 1))) LIBHAT_UNLIKELY { return i; } From bf19f5314bf9f80930e3a77febfa388c3c42054e Mon Sep 17 00:00:00 2001 From: Brady Date: Thu, 18 Jun 2026 04:30:30 -0500 Subject: [PATCH 32/37] Actually fix compile + adjust workflow --- .github/workflows/cmake.yml | 11 ++++++++--- src/arch/arm/Neon.cpp | 2 +- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index e3cb6d5..d3e46ce 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -39,11 +39,16 @@ jobs: linux: strategy: matrix: - os: [ ubuntu-26.04, ubuntu-26.04-arm ] + target: [ x64, ARM64 ] + cxx_standard: [ 20, 23 ] compiler: - { pkg: g++, exe: g++, version: 14 } - { pkg: clang, exe: clang++, version: 18 } - cxx_standard: [ 20, 23 ] + include: + - target: x64 + os: ubuntu-26.04 + - target: ARM64 + os: ubuntu-26.04-arm runs-on: ${{matrix.os}} steps: - uses: actions/checkout@v6 @@ -64,7 +69,7 @@ jobs: - name: Configure env: CXX: ${{matrix.compiler.exe}}-${{matrix.compiler.version}} - run: cmake -B ${{github.workspace}}/build -DCPM_SOURCE_CACHE=${{github.workspace}}/.cpmcache -DCMAKE_CXX_STANDARD=${{matrix.cxx_standard}} -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DLIBHAT_TESTING_SDE=${{contains(matrix.os, 'arm') && 'OFF' || 'ON'}} -DLIBHAT_TESTING_SAMPLE_BIN=OFF + run: cmake -B ${{github.workspace}}/build -DCPM_SOURCE_CACHE=${{github.workspace}}/.cpmcache -DCMAKE_CXX_STANDARD=${{matrix.cxx_standard}} -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DLIBHAT_TESTING_SDE=${{startsWith(matrix.target, 'ARM') && 'OFF' || 'ON'}} -DLIBHAT_TESTING_SAMPLE_BIN=OFF - name: Build run: cmake --build ${{github.workspace}}/build -j 4 diff --git a/src/arch/arm/Neon.cpp b/src/arch/arm/Neon.cpp index 752aa1c..1a34551 100644 --- a/src/arch/arm/Neon.cpp +++ b/src/arch/arm/Neon.cpp @@ -79,7 +79,7 @@ namespace hat::detail { cmp = vandq_u8(cmp, cmp2); } - auto mask = vget_lane_u64(vshrn_n_u16(vreinterpretq_u16_u8(cmp), 4), 0); + auto mask = vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(cmp), 4)), 0); if constexpr (alignment != scan_alignment::X1) { mask &= std::rotl(create_alignment_mask_neon(), static_cast(cmpIndex) * 4); } From 3671c7f1e5ead424482125fab3abc9f4aab5c34d Mon Sep 17 00:00:00 2001 From: Brady Date: Thu, 18 Jun 2026 04:42:39 -0500 Subject: [PATCH 33/37] Implement `system_info_arm` on Linux --- src/arch/arm/System.cpp | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/src/arch/arm/System.cpp b/src/arch/arm/System.cpp index 832e6f0..f6c439a 100644 --- a/src/arch/arm/System.cpp +++ b/src/arch/arm/System.cpp @@ -3,13 +3,30 @@ #include -namespace hat { - #if defined(LIBHAT_WINDOWS) || defined(LIBHAT_MAC) +namespace hat { system_info_arm::system_info_arm() { this->extensions.neon = true; } +} #endif +#if defined(LIBHAT_LINUX) + +#include +#include + +namespace hat { + system_info_arm::system_info_arm() { +#if defined(LIBHAT_ARM) + unsigned long hwcap = getauxval(AT_HWCAP); + this->extensions.neon = (hwcap & HWCAP_NEON) != 0; +#else // AARCH64 + unsigned long hwcap = getauxval(AT_HWCAP); + this->extensions.neon = (hwcap & HWCAP_ASIMD) != 0; +#endif + } } #endif + +#endif From b747018cc68eb7c6937477b3bb9810194a165c46 Mon Sep 17 00:00:00 2001 From: Brady Date: Fri, 19 Jun 2026 18:04:31 -0500 Subject: [PATCH 34/37] Adjust mask iteration --- src/arch/arm/Neon.cpp | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/src/arch/arm/Neon.cpp b/src/arch/arm/Neon.cpp index 1a34551..175f11a 100644 --- a/src/arch/arm/Neon.cpp +++ b/src/arch/arm/Neon.cpp @@ -5,17 +5,10 @@ #include #include +#include #ifdef _MSC_VER - namespace hat::detail { - inline unsigned long bsf(unsigned __int64 num) noexcept { - unsigned long offset; - _BitScanForward64(&offset, num); - return offset; - } - } - -#define LIBHAT_BSF64(num) hat::detail::bsf(num) +#define LIBHAT_BSF64(num) _CountTrailingZeros64(num) #else #define LIBHAT_BSF64(num) __builtin_ctzll(num) #endif @@ -85,8 +78,8 @@ namespace hat::detail { } while (mask) { - const auto offset = LIBHAT_BSF64(mask) / 4; - const auto i = reinterpret_cast(it) + offset - cmpIndex; + const auto offset = LIBHAT_BSF64(mask); + const auto i = reinterpret_cast(it) + (offset >> 2) - cmpIndex; if constexpr (veccmp) { const auto data = vld1q_u8(reinterpret_cast(i)); const auto neqBits = veorq_u8(data, signatureBytes); @@ -102,8 +95,7 @@ namespace hat::detail { } // thanks msvc? // mask &= ~(0xF * (mask & (~mask + 1))); - const auto lsb = (mask & static_cast(-static_cast(mask))); - mask &= ~((lsb << 4) - lsb); + mask ^= (uint64_t{0xF} << offset); } } From 9ae6cfc2d55316a8b43a761112b0da82dfecdddd Mon Sep 17 00:00:00 2001 From: Brady Date: Fri, 19 Jun 2026 18:10:15 -0500 Subject: [PATCH 35/37] ALLEGEDLY I can't use `_CountTrailingZeros64` --- src/arch/arm/Neon.cpp | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/arch/arm/Neon.cpp b/src/arch/arm/Neon.cpp index 175f11a..8d65bcf 100644 --- a/src/arch/arm/Neon.cpp +++ b/src/arch/arm/Neon.cpp @@ -5,10 +5,19 @@ #include #include -#include #ifdef _MSC_VER -#define LIBHAT_BSF64(num) _CountTrailingZeros64(num) +#include + + namespace hat::detail { + inline unsigned long bsf(unsigned __int64 num) noexcept { + unsigned long offset; + _BitScanForward64(&offset, num); + return offset; + } + } + +#define LIBHAT_BSF64(num) hat::detail::bsf(num) #else #define LIBHAT_BSF64(num) __builtin_ctzll(num) #endif From 5ddba36ec0ea8770c2e9c6d2dd6e3e5ca6cac536 Mon Sep 17 00:00:00 2001 From: Brady Date: Fri, 19 Jun 2026 19:34:01 -0500 Subject: [PATCH 36/37] Optimize `frequency.py` using NumPy --- scripts/frequency.py | 11 ++++++++--- scripts/requirements.txt | Bin 40 -> 68 bytes 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/scripts/frequency.py b/scripts/frequency.py index 9f75422..0bc9cef 100644 --- a/scripts/frequency.py +++ b/scripts/frequency.py @@ -1,13 +1,14 @@ import itertools import sys +import numpy as np import pefile def main(): files = [pefile.PE(path, fast_load=True) for path in sys.argv[1:]] - pair_counts = [0 for _ in range(2 ** 16)] + pair_counts = np.zeros(1 << 16, dtype=np.int64) total_pairs_count = 0 for pe in files: @@ -16,8 +17,12 @@ def main(): continue data = section.get_data() total_pairs_count += len(data) - 1 - for a, b in zip(data[:], data[1:]): - pair_counts[a * 0x100 + b] += 1 + + a = np.frombuffer(data[:-1], dtype=np.uint8).astype(np.uint16) + b = np.frombuffer(data[1:], dtype=np.uint8) + pairs = (a << 8) | b + count = np.bincount(pairs, minlength=1 << 16) + pair_counts += count top_n_pairs = 512 sorted_pairs = sorted( diff --git a/scripts/requirements.txt b/scripts/requirements.txt index 04a2c9b797c42e060d1f7d6cf6957371ae4f708d..54ea379ae8bcb104806a7089af3ad9055c472606 100644 GIT binary patch literal 68 zcmezWFOQ*=A(x?mp_0KC2#px@7)*fJjDeSd3n-e(kj9Y7kOQP4vIamEAbATQHiD`G E0MdsFiU0rr delta 8 PcmZ?KVEX@WqMRB44paj{ From 3d1e9ea35703ffb0bbaeec44f0a58836c9a3b040 Mon Sep 17 00:00:00 2001 From: Brady Date: Fri, 19 Jun 2026 19:38:03 -0500 Subject: [PATCH 37/37] Avoid `bytes` copy with `memoryview` --- scripts/frequency.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/frequency.py b/scripts/frequency.py index 0bc9cef..258fa7e 100644 --- a/scripts/frequency.py +++ b/scripts/frequency.py @@ -15,7 +15,7 @@ def main(): for section in pe.sections: if not section.IMAGE_SCN_MEM_EXECUTE: continue - data = section.get_data() + data = memoryview(section.get_data()) total_pairs_count += len(data) - 1 a = np.frombuffer(data[:-1], dtype=np.uint8).astype(np.uint16)