diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml new file mode 100644 index 0000000..ed4624b --- /dev/null +++ b/.github/workflows/benchmark.yml @@ -0,0 +1,45 @@ +name: Benchmark + +on: + push: + branches: [ "master" ] + pull_request: + branches: [ "master" ] + +jobs: + windows: + strategy: + matrix: + toolset: [ v145, ClangCL ] + runs-on: windows-11-vs2026-arm + steps: + - uses: actions/checkout@v6 + + - name: CPM Cache + uses: actions/cache@v5 + with: + path: ${{github.workspace}}/.cpmcache + key: cpm-${{runner.os}}-ARM64-${{hashFiles('test/CMakeLists.txt')}} + restore-keys: | + cpm-${{runner.os}}-ARM64- + + - name: Check Hardware + run: (Get-CimInstance Win32_Processor).Name + + - name: Locate Visual Studio + run: | + $vsInstall = & "${env:ProgramFiles(x86)}\Microsoft Visual Studio\Installer\vswhere.exe" -latest -property installationPath + echo "VS_INSTALL_DIR=$vsInstall" >> $env:GITHUB_ENV + + - name: Configure + run: cmake -B ${{github.workspace}}/build -DCMAKE_CXX_STANDARD=23 -DLIBHAT_SHARED_C_LIB=ON -DLIBHAT_TESTING_SDE=OFF -A ARM64 -T ${{matrix.toolset}} + + - name: Build + run: cmake --build ${{github.workspace}}/build -j 4 --config Release --target libhat_benchmark_compare_impl libhat_benchmark_chromium + + - name: Test + working-directory: ${{github.workspace}}/build + shell: cmd + run: | + call "${{env.VS_INSTALL_DIR}}\VC\Auxiliary\Build\vcvarsarm64.bat" + ctest --verbose -C Release -R "(libhat_benchmark_compare_impl|libhat_benchmark_chromium)" diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index 0b74308..d3e46ce 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -18,20 +18,38 @@ jobs: steps: - uses: actions/checkout@v6 + - name: CPM Cache + uses: actions/cache@v5 + with: + path: ${{github.workspace}}/.cpmcache + key: cpm-${{runner.os}}-${{hashFiles('test/CMakeLists.txt')}} + restore-keys: | + cpm-${{runner.os}}- + - name: Configure - run: cmake -B ${{github.workspace}}/build -DCMAKE_CXX_STANDARD=${{matrix.cxx_standard}} -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DLIBHAT_TESTING=OFF + run: cmake -B ${{github.workspace}}/build -DCPM_SOURCE_CACHE=${{github.workspace}}/.cpmcache -DCMAKE_CXX_STANDARD=${{matrix.cxx_standard}} -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DLIBHAT_TESTING_SAMPLE_BIN=OFF -DLIBHAT_TESTING_SDE=OFF - name: Build run: cmake --build ${{github.workspace}}/build -j 4 + - name: Test + working-directory: ${{github.workspace}}/build + run: ctest --verbose -C ${{env.BUILD_TYPE}} -R libhat_test_.* + linux: strategy: matrix: + target: [ x64, ARM64 ] + cxx_standard: [ 20, 23 ] compiler: - { pkg: g++, exe: g++, version: 14 } - { pkg: clang, exe: clang++, version: 18 } - cxx_standard: [ 20, 23 ] - runs-on: ubuntu-24.04 + include: + - target: x64 + os: ubuntu-26.04 + - target: ARM64 + os: ubuntu-26.04-arm + runs-on: ${{matrix.os}} steps: - uses: actions/checkout@v6 @@ -51,7 +69,7 @@ jobs: - name: Configure env: CXX: ${{matrix.compiler.exe}}-${{matrix.compiler.version}} - run: cmake -B ${{github.workspace}}/build -DCPM_SOURCE_CACHE=${{github.workspace}}/.cpmcache -DCMAKE_CXX_STANDARD=${{matrix.cxx_standard}} -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DLIBHAT_TESTING=ON -DLIBHAT_TESTING_SAMPLE_BIN=OFF + run: cmake -B ${{github.workspace}}/build -DCPM_SOURCE_CACHE=${{github.workspace}}/.cpmcache -DCMAKE_CXX_STANDARD=${{matrix.cxx_standard}} -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DLIBHAT_TESTING_SDE=${{startsWith(matrix.target, 'ARM') && 'OFF' || 'ON'}} -DLIBHAT_TESTING_SAMPLE_BIN=OFF - name: Build run: cmake --build ${{github.workspace}}/build -j 4 diff --git a/CMakeLists.txt b/CMakeLists.txt index b667fe5..c232b2b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -53,6 +53,8 @@ set(LIBHAT_SRC src/Scanner.cpp src/System.cpp + src/os/mac/Process.cpp + src/os/linux/MemoryProtector.cpp src/os/linux/Process.cpp @@ -68,7 +70,9 @@ set(LIBHAT_SRC src/arch/x86/AVX512.cpp src/arch/x86/System.cpp - src/arch/arm/System.cpp) + src/arch/arm/Neon.cpp + src/arch/arm/System.cpp +) add_library(libhat STATIC ${LIBHAT_SRC}) add_library(libhat::libhat ALIAS libhat) diff --git a/include/libhat/scanner.hpp b/include/libhat/scanner.hpp index 1a53668..3d62343 100644 --- a/include/libhat/scanner.hpp +++ b/include/libhat/scanner.hpp @@ -149,6 +149,7 @@ namespace hat::detail { SSE, // x86/x64 SSE 4.1 AVX2, // x86/x64 AVX2 AVX512, // x64 AVX512 + Neon, // ARMv7+ Neon }; class scan_context { diff --git a/include/libhat/system.hpp b/include/libhat/system.hpp index 80c14e8..f13a6cc 100644 --- a/include/libhat/system.hpp +++ b/include/libhat/system.hpp @@ -52,8 +52,11 @@ LIBHAT_EXPORT namespace hat { LIBHAT_EXPORT namespace hat { struct system_info_arm : hat::system_info { + struct { + bool neon; + } extensions{}; private: - system_info_arm() = default; + system_info_arm(); friend const system_info_arm& get_system(); static const system_info_arm instance; }; diff --git a/scripts/frequency.py b/scripts/frequency.py index 9f75422..258fa7e 100644 --- a/scripts/frequency.py +++ b/scripts/frequency.py @@ -1,23 +1,28 @@ import itertools import sys +import numpy as np import pefile def main(): files = [pefile.PE(path, fast_load=True) for path in sys.argv[1:]] - pair_counts = [0 for _ in range(2 ** 16)] + pair_counts = np.zeros(1 << 16, dtype=np.int64) total_pairs_count = 0 for pe in files: for section in pe.sections: if not section.IMAGE_SCN_MEM_EXECUTE: continue - data = section.get_data() + data = memoryview(section.get_data()) total_pairs_count += len(data) - 1 - for a, b in zip(data[:], data[1:]): - pair_counts[a * 0x100 + b] += 1 + + a = np.frombuffer(data[:-1], dtype=np.uint8).astype(np.uint16) + b = np.frombuffer(data[1:], dtype=np.uint8) + pairs = (a << 8) | b + count = np.bincount(pairs, minlength=1 << 16) + pair_counts += count top_n_pairs = 512 sorted_pairs = sorted( diff --git a/scripts/requirements.txt b/scripts/requirements.txt index 04a2c9b..54ea379 100644 Binary files a/scripts/requirements.txt and b/scripts/requirements.txt differ diff --git a/src/Scanner.cpp b/src/Scanner.cpp index fb3e112..04917ba 100644 --- a/src/Scanner.cpp +++ b/src/Scanner.cpp @@ -66,8 +66,8 @@ namespace hat::detail { template<> scan_function_t resolve_scanner(scan_context& context) { -#if defined(LIBHAT_X86) || defined(LIBHAT_X86_64) const auto& ext = get_system().extensions; +#if defined(LIBHAT_X86) || defined(LIBHAT_X86_64) if (ext.bmi) { #if defined(LIBHAT_X86_64) && !defined(LIBHAT_DISABLE_AVX512) if (ext.avx512f && ext.avx512bw) { @@ -83,6 +83,11 @@ namespace hat::detail { return resolve_scanner(context); } #endif +#endif +#if defined(LIBHAT_ARM) || defined(LIBHAT_AARCH64) + if (ext.neon) { + return resolve_scanner(context); + } #endif // If none of the vectorized implementations are available/supported, then fallback to scanning per-byte return resolve_scanner(context); diff --git a/src/arch/arm/Neon.cpp b/src/arch/arm/Neon.cpp new file mode 100644 index 0000000..8d65bcf --- /dev/null +++ b/src/arch/arm/Neon.cpp @@ -0,0 +1,150 @@ +#include + +#if defined(LIBHAT_ARM) || defined(LIBHAT_AARCH64) + +#include + +#include + +#ifdef _MSC_VER +#include + + namespace hat::detail { + inline unsigned long bsf(unsigned __int64 num) noexcept { + unsigned long offset; + _BitScanForward64(&offset, num); + return offset; + } + } + +#define LIBHAT_BSF64(num) hat::detail::bsf(num) +#else +#define LIBHAT_BSF64(num) __builtin_ctzll(num) +#endif + +namespace hat::detail { + + inline void load_signature_128(const signature_view signature, uint8x16_t& bytes, uint8x16_t& mask) { + uint8_t byteBuffer[16]{}; // The remaining signature bytes + uint8_t maskBuffer[16]{}; // A bitmask for the signature bytes we care about + for (size_t i = 0; i < signature.size(); i++) { + byteBuffer[i] = std::to_integer(signature[i].value()); + maskBuffer[i] = std::to_integer(signature[i].mask()); + } + bytes = vld1q_u8(static_cast(byteBuffer)); + mask = vld1q_u8(static_cast(maskBuffer)); + } + + template + LIBHAT_FORCEINLINE consteval uint64_t create_alignment_mask_neon() { + uint64_t mask{}; + for (size_t i = 0; i < 16; i += alignment_stride) { + mask |= (static_cast(0xF) << (i * 4)); + } + return mask; + } + + template + const_scan_result find_pattern_neon(const std::byte* begin, const std::byte* end, const scan_context& context) { + const auto signature = context.signature; + const auto cmpIndex = cmpeq2 ? *context.pairIndex : context.cmpIndex; + + // 128 bit vector containing first signature byte repeated + const auto firstByte = vdupq_n_u8(static_cast(*signature[cmpIndex])); + + uint8x16_t secondByte; + if constexpr (cmpeq2) { + secondByte = vdupq_n_u8(static_cast(*signature[cmpIndex + 1])); + } + + uint8x16_t signatureBytes, signatureMask; + if constexpr (veccmp) { + load_signature_128(signature, signatureBytes, signatureMask); + } + + auto [pre, vec, post] = segment_scan(begin, end, signature.size(), cmpIndex); + + if (!pre.empty()) { + const auto result = find_pattern_single(pre.data(), pre.data() + pre.size(), context); + if (result.has_result()) { + return result; + } + } + + const auto vec_begin = std::to_address(vec.begin()); + const auto vec_end = std::to_address(vec.end()); + for (auto it = vec_begin; it != vec_end; it++) { + auto cmp = vceqq_u8(firstByte, vld1q_u8(reinterpret_cast(it))); + + if constexpr (cmpeq2) { + const auto cmp2 = vceqq_u8(secondByte, vld1q_u8(reinterpret_cast(it) + 1)); + cmp = vandq_u8(cmp, cmp2); + } + + auto mask = vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(cmp), 4)), 0); + if constexpr (alignment != scan_alignment::X1) { + mask &= std::rotl(create_alignment_mask_neon(), static_cast(cmpIndex) * 4); + } + + while (mask) { + const auto offset = LIBHAT_BSF64(mask); + const auto i = reinterpret_cast(it) + (offset >> 2) - cmpIndex; + if constexpr (veccmp) { + const auto data = vld1q_u8(reinterpret_cast(i)); + const auto neqBits = veorq_u8(data, signatureBytes); + const auto match = vreinterpretq_u64_u8(vandq_u8(neqBits, signatureMask)); + if (!(vgetq_lane_u64(match, 0) | vgetq_lane_u64(match, 1))) LIBHAT_UNLIKELY { + return i; + } + } else { + const auto match = std::equal(signature.begin(), signature.end(), i); + if (match) LIBHAT_UNLIKELY { + return i; + } + } + // thanks msvc? + // mask &= ~(0xF * (mask & (~mask + 1))); + mask ^= (uint64_t{0xF} << offset); + } + } + + if (!post.empty()) { + return find_pattern_single(post.data(), post.data() + post.size(), context); + } + return {}; + } + + template<> + scan_function_t resolve_scanner(scan_context& context) { + context.apply_hints({.vectorSize = 16}); + + const auto alignment = context.alignment; + const auto signature = context.signature; + const bool cmpeq2 = context.pairIndex.has_value(); + const bool veccmp = signature.size() <= 16; + + if (alignment == scan_alignment::X1) { + if (cmpeq2 && veccmp) { + return &find_pattern_neon; + } else if (cmpeq2) { + return &find_pattern_neon; + } else if (veccmp) { + return &find_pattern_neon; + } else { + return &find_pattern_neon; + } + } else if (alignment == scan_alignment::X16) { + if (cmpeq2 && veccmp) { + return &find_pattern_neon; + } else if (cmpeq2) { + return &find_pattern_neon; + } else if (veccmp) { + return &find_pattern_neon; + } else { + return &find_pattern_neon; + } + } + LIBHAT_UNREACHABLE(); + } +} +#endif diff --git a/src/arch/arm/System.cpp b/src/arch/arm/System.cpp index ad76b8f..f6c439a 100644 --- a/src/arch/arm/System.cpp +++ b/src/arch/arm/System.cpp @@ -1,9 +1,32 @@ #include -#ifdef LIBHAT_ARM +#if defined(LIBHAT_ARM) || defined(LIBHAT_AARCH64) #include +#if defined(LIBHAT_WINDOWS) || defined(LIBHAT_MAC) namespace hat { + system_info_arm::system_info_arm() { + this->extensions.neon = true; + } +} +#endif + +#if defined(LIBHAT_LINUX) + +#include +#include +namespace hat { + system_info_arm::system_info_arm() { +#if defined(LIBHAT_ARM) + unsigned long hwcap = getauxval(AT_HWCAP); + this->extensions.neon = (hwcap & HWCAP_NEON) != 0; +#else // AARCH64 + unsigned long hwcap = getauxval(AT_HWCAP); + this->extensions.neon = (hwcap & HWCAP_ASIMD) != 0; +#endif + } } #endif + +#endif diff --git a/src/os/mac/Process.cpp b/src/os/mac/Process.cpp new file mode 100644 index 0000000..cb931c4 --- /dev/null +++ b/src/os/mac/Process.cpp @@ -0,0 +1,102 @@ +#include +#ifdef LIBHAT_MAC + +#include + +#include +#include +#include + +#include + +#include + +namespace hat::process { + + hat::process::module get_process_module() { + const uint32_t count = _dyld_image_count(); + for (uint32_t i = 0; i != count; i++) { + const auto* header = reinterpret_cast(_dyld_get_image_header(i)); + if (header && header->filetype == MH_EXECUTE) { + return hat::process::module{std::bit_cast(header)}; + } + } + std::abort(); + } + + // no-op on 32 bit binaries + void module::for_each_segment(const std::function, hat::protection)>& callback) const { + const uint32_t imageCount = _dyld_image_count(); + for (uint32_t i = 0; i < imageCount; i++) { + const auto* header = reinterpret_cast(_dyld_get_image_header(i)); + if (header == nullptr) { + continue; + } + if (std::bit_cast(header) != this->address()) { + continue; + } + + const auto slide = static_cast(_dyld_get_image_vmaddr_slide(i)); + const auto* cmd = reinterpret_cast( + reinterpret_cast(header) + sizeof(mach_header_64)); + + for (uint32_t j = 0; j < header->ncmds; j++) { + if (cmd->cmd == LC_SEGMENT_64) { + const auto* seg = reinterpret_cast(cmd); + + // skip __PAGEZERO and any unmapped segment + if (seg->vmsize != 0 && seg->initprot != 0) { + const std::span data{ + reinterpret_cast(seg->vmaddr + slide), + seg->vmsize + }; + + hat::protection prot{}; + if (seg->initprot & VM_PROT_READ) prot |= hat::protection::Read; + if (seg->initprot & VM_PROT_WRITE) prot |= hat::protection::Write; + if (seg->initprot & VM_PROT_EXECUTE) prot |= hat::protection::Execute; + + if (!callback(data, prot)) { + return; + } + } + } + cmd = reinterpret_cast( + reinterpret_cast(cmd) + cmd->cmdsize); + } + return; + } + } + + std::optional get_module(const std::string_view name) { + if (name.empty()) { + return get_process_module(); + } + + using Handle = std::unique_ptr; + + const std::string buffer{name}; + const Handle handle{dlopen(buffer.c_str(), RTLD_LAZY | RTLD_NOLOAD)}; + if (!handle) { + return {}; + } + + const uint32_t count = _dyld_image_count(); + for (uint32_t i = 0; i < count; i++) { + const auto* header = reinterpret_cast(_dyld_get_image_header(i)); + if (header == nullptr) + continue; + + const Handle h{dlopen(_dyld_get_image_name(i), RTLD_LAZY | RTLD_NOLOAD)}; + if (h == handle) { + return hat::process::module{std::bit_cast(header)}; + } + } + + return {}; + } +} + +#endif diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index fda2f3a..9ef35d7 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -59,6 +59,7 @@ function(register_test NAME SOURCE) endfunction() register_test(libhat_benchmark_compare benchmark/Compare.cpp) +register_test(libhat_benchmark_compare_impl benchmark/CompareImpl.cpp) register_test(libhat_test_scanner tests/Scanner.cpp) register_test(libhat_test_process tests/Process.cpp) diff --git a/test/benchmark/CompareImpl.cpp b/test/benchmark/CompareImpl.cpp new file mode 100644 index 0000000..a7d1250 --- /dev/null +++ b/test/benchmark/CompareImpl.cpp @@ -0,0 +1,49 @@ +#include + +#include +#include + +static constexpr std::string_view test_pattern = "01 02 03 04 05 06 07 08 09"; + +static auto gen_random_buffer(const size_t size) { + std::vector buffer(size); + std::default_random_engine generator(123); + std::uniform_int_distribution distribution(0, 0xFFFFFFFFFFFFFFFF); + for (size_t i = 0; i < buffer.size(); i += 8) { + uint64_t value = distribution(generator); + std::memcpy(&buffer[i], &value, sizeof(value)); + } + return buffer; +} + +template +static void BM_Throughput(benchmark::State& state) { + const size_t size = state.range(0); + const auto buf = gen_random_buffer(size); + const auto begin = std::to_address(buf.begin()); + const auto end = std::to_address(buf.end()); + + const auto sig = hat::parse_signature(test_pattern).value(); + const auto ctx = hat::detail::scan_context::create(sig, hat::scan_alignment::X1, hat::scan_hint::none); + for (auto _ : state) { + benchmark::DoNotOptimize(ctx.scan(begin, end)); + } + state.SetBytesProcessed(static_cast(state.iterations() * size)); +} + +static constexpr int64_t rangeStart = 1 << 22; // 4 MiB +static constexpr int64_t rangeLimit = 1 << 28; // 256 MiB + +#define LIBHAT_BENCHMARK(...) BENCHMARK(__VA_ARGS__) \ + ->Threads(1) \ + ->MinWarmUpTime(1) \ + ->MinTime(2) \ + ->Range(rangeStart, rangeLimit) \ + ->UseRealTime(); + +LIBHAT_BENCHMARK(BM_Throughput); +#if defined(LIBHAT_AARCH64) || defined(LIBHAT_ARM) +LIBHAT_BENCHMARK(BM_Throughput); +#endif + +BENCHMARK_MAIN(); diff --git a/test/tests/Scanner.cpp b/test/tests/Scanner.cpp index 80f26e1..483b2d6 100644 --- a/test/tests/Scanner.cpp +++ b/test/tests/Scanner.cpp @@ -83,6 +83,14 @@ using FindPatternTestTypes = ::testing::Types< FindPatternParameters, FindPatternParameters, FindPatternParameters, +#endif +#if defined(LIBHAT_ARM) || defined(LIBHAT_AARCH64) + FindPatternParameters, + FindPatternParameters, + FindPatternParameters, + FindPatternParameters, + FindPatternParameters, + FindPatternParameters, #endif FindPatternParameters, FindPatternParameters, @@ -106,6 +114,7 @@ class FindPatternTestNameGenerator { else if constexpr (Mode == hat::detail::scan_mode::SSE) return "SSE"; else if constexpr (Mode == hat::detail::scan_mode::AVX2) return "AVX2"; else if constexpr (Mode == hat::detail::scan_mode::AVX512) return "AVX512"; + else if constexpr (Mode == hat::detail::scan_mode::Neon) return "Neon"; else static_assert(sizeof(Mode) == 0); } };