Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
c1605f9
Initial Neon implementation using sse2neon
ZeroMemes Jun 14, 2026
39f9201
Ignore sse2neon warnings
ZeroMemes Jun 14, 2026
8df8dfc
Missing Mode name
ZeroMemes Jun 14, 2026
17ae035
Implementation throughput comparison benchmark
ZeroMemes Jun 15, 2026
ee66f43
Log processor name
ZeroMemes Jun 15, 2026
77fbf3e
Only build `libhat_benchmark_compare_impl`
ZeroMemes Jun 15, 2026
71fd67a
Neon substitutions
ZeroMemes Jun 15, 2026
d23f6da
Architecture guard + use time defaults
ZeroMemes Jun 15, 2026
c002976
Avoid double movemask
ZeroMemes Jun 15, 2026
aa6d5cc
Use 4-bit mask
ZeroMemes Jun 15, 2026
c7a5b5e
Fix warning
ZeroMemes Jun 15, 2026
4ff162a
Increase benchmark time
ZeroMemes Jun 15, 2026
2013e92
Remove sse2neon
ZeroMemes Jun 15, 2026
2fa461d
Missing neon header include
ZeroMemes Jun 15, 2026
1e42ef4
Improve neon intrinsic type conformance
ZeroMemes Jun 15, 2026
826b640
Hopefully resolve remaining macOS compile errors
ZeroMemes Jun 15, 2026
82a1517
Compile Windows on ARM benchmark using Clang
ZeroMemes Jun 15, 2026
ce4768e
Test manual bitwise optimization
ZeroMemes Jun 15, 2026
02dd07d
Merge branch 'master' into feat/neon
ZeroMemes Jun 15, 2026
16f2eed
Benchmark ARM Neon against Chromium
ZeroMemes Jun 15, 2026
af7e14f
Use cache for benchmark workflow
ZeroMemes Jun 15, 2026
45a85a2
Perhaps
ZeroMemes Jun 15, 2026
af24489
Extract masks using `vget_lane_u64`
ZeroMemes Jun 17, 2026
bab92dc
Merge branch 'master' into feat/neon
ZeroMemes Jun 17, 2026
3699104
Debug mode optimizations for NEON
ZeroMemes Jun 17, 2026
2169688
Revert unrolling
ZeroMemes Jun 17, 2026
df8bb55
Add macOS to ARM64 testing
ZeroMemes Jun 18, 2026
36b05c7
Partial mach-o Process
Imrglop May 31, 2026
ac4712e
Missing source file
ZeroMemes Jun 18, 2026
7e7c29d
Missing include for `std::abort`
ZeroMemes Jun 18, 2026
afc7c1d
Possibly fix macOS process APIs
ZeroMemes Jun 18, 2026
3cec491
Add Linux to ARM64 testing
ZeroMemes Jun 18, 2026
7663ec0
Fix Linux compile
ZeroMemes Jun 18, 2026
bf19f53
Actually fix compile + adjust workflow
ZeroMemes Jun 18, 2026
3671c7f
Implement `system_info_arm` on Linux
ZeroMemes Jun 18, 2026
b747018
Adjust mask iteration
ZeroMemes Jun 19, 2026
9ae6cfc
ALLEGEDLY I can't use `_CountTrailingZeros64`
ZeroMemes Jun 19, 2026
5ddba36
Optimize `frequency.py` using NumPy
ZeroMemes Jun 20, 2026
3d1e9ea
Avoid `bytes` copy with `memoryview`
ZeroMemes Jun 20, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 45 additions & 0 deletions .github/workflows/benchmark.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
name: Benchmark

on:
push:
branches: [ "master" ]
pull_request:
branches: [ "master" ]

jobs:
windows:
strategy:
matrix:
toolset: [ v145, ClangCL ]
runs-on: windows-11-vs2026-arm
steps:
- uses: actions/checkout@v6

- name: CPM Cache
uses: actions/cache@v5
with:
path: ${{github.workspace}}/.cpmcache
key: cpm-${{runner.os}}-ARM64-${{hashFiles('test/CMakeLists.txt')}}
restore-keys: |
cpm-${{runner.os}}-ARM64-

- name: Check Hardware
run: (Get-CimInstance Win32_Processor).Name

- name: Locate Visual Studio
run: |
$vsInstall = & "${env:ProgramFiles(x86)}\Microsoft Visual Studio\Installer\vswhere.exe" -latest -property installationPath
echo "VS_INSTALL_DIR=$vsInstall" >> $env:GITHUB_ENV

- name: Configure
run: cmake -B ${{github.workspace}}/build -DCMAKE_CXX_STANDARD=23 -DLIBHAT_SHARED_C_LIB=ON -DLIBHAT_TESTING_SDE=OFF -A ARM64 -T ${{matrix.toolset}}

- name: Build
run: cmake --build ${{github.workspace}}/build -j 4 --config Release --target libhat_benchmark_compare_impl libhat_benchmark_chromium

- name: Test
working-directory: ${{github.workspace}}/build
shell: cmd
run: |
call "${{env.VS_INSTALL_DIR}}\VC\Auxiliary\Build\vcvarsarm64.bat"
ctest --verbose -C Release -R "(libhat_benchmark_compare_impl|libhat_benchmark_chromium)"
26 changes: 22 additions & 4 deletions .github/workflows/cmake.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,20 +18,38 @@ jobs:
steps:
- uses: actions/checkout@v6

- name: CPM Cache
uses: actions/cache@v5
with:
path: ${{github.workspace}}/.cpmcache
key: cpm-${{runner.os}}-${{hashFiles('test/CMakeLists.txt')}}
restore-keys: |
cpm-${{runner.os}}-

- name: Configure
run: cmake -B ${{github.workspace}}/build -DCMAKE_CXX_STANDARD=${{matrix.cxx_standard}} -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DLIBHAT_TESTING=OFF
run: cmake -B ${{github.workspace}}/build -DCPM_SOURCE_CACHE=${{github.workspace}}/.cpmcache -DCMAKE_CXX_STANDARD=${{matrix.cxx_standard}} -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DLIBHAT_TESTING_SAMPLE_BIN=OFF -DLIBHAT_TESTING_SDE=OFF

- name: Build
run: cmake --build ${{github.workspace}}/build -j 4

- name: Test
working-directory: ${{github.workspace}}/build
run: ctest --verbose -C ${{env.BUILD_TYPE}} -R libhat_test_.*

linux:
strategy:
matrix:
target: [ x64, ARM64 ]
cxx_standard: [ 20, 23 ]
compiler:
- { pkg: g++, exe: g++, version: 14 }
- { pkg: clang, exe: clang++, version: 18 }
cxx_standard: [ 20, 23 ]
runs-on: ubuntu-24.04
include:
- target: x64
os: ubuntu-26.04
- target: ARM64
os: ubuntu-26.04-arm
runs-on: ${{matrix.os}}
steps:
- uses: actions/checkout@v6

Expand All @@ -51,7 +69,7 @@ jobs:
- name: Configure
env:
CXX: ${{matrix.compiler.exe}}-${{matrix.compiler.version}}
run: cmake -B ${{github.workspace}}/build -DCPM_SOURCE_CACHE=${{github.workspace}}/.cpmcache -DCMAKE_CXX_STANDARD=${{matrix.cxx_standard}} -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DLIBHAT_TESTING=ON -DLIBHAT_TESTING_SAMPLE_BIN=OFF
run: cmake -B ${{github.workspace}}/build -DCPM_SOURCE_CACHE=${{github.workspace}}/.cpmcache -DCMAKE_CXX_STANDARD=${{matrix.cxx_standard}} -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DLIBHAT_TESTING_SDE=${{startsWith(matrix.target, 'ARM') && 'OFF' || 'ON'}} -DLIBHAT_TESTING_SAMPLE_BIN=OFF

- name: Build
run: cmake --build ${{github.workspace}}/build -j 4
Expand Down
6 changes: 5 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ set(LIBHAT_SRC
src/Scanner.cpp
src/System.cpp

src/os/mac/Process.cpp

src/os/linux/MemoryProtector.cpp
src/os/linux/Process.cpp

Expand All @@ -68,7 +70,9 @@ set(LIBHAT_SRC
src/arch/x86/AVX512.cpp
src/arch/x86/System.cpp

src/arch/arm/System.cpp)
src/arch/arm/Neon.cpp
src/arch/arm/System.cpp
)

add_library(libhat STATIC ${LIBHAT_SRC})
add_library(libhat::libhat ALIAS libhat)
Expand Down
1 change: 1 addition & 0 deletions include/libhat/scanner.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,7 @@ namespace hat::detail {
SSE, // x86/x64 SSE 4.1
AVX2, // x86/x64 AVX2
AVX512, // x64 AVX512
Neon, // ARMv7+ Neon
};

class scan_context {
Expand Down
5 changes: 4 additions & 1 deletion include/libhat/system.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,11 @@ LIBHAT_EXPORT namespace hat {
LIBHAT_EXPORT namespace hat {

struct system_info_arm : hat::system_info {
struct {
bool neon;
} extensions{};
private:
system_info_arm() = default;
system_info_arm();
friend const system_info_arm& get_system();
static const system_info_arm instance;
};
Expand Down
13 changes: 9 additions & 4 deletions scripts/frequency.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,28 @@
import itertools
import sys

import numpy as np
import pefile


def main():
files = [pefile.PE(path, fast_load=True) for path in sys.argv[1:]]

pair_counts = [0 for _ in range(2 ** 16)]
pair_counts = np.zeros(1 << 16, dtype=np.int64)
total_pairs_count = 0

for pe in files:
for section in pe.sections:
if not section.IMAGE_SCN_MEM_EXECUTE:
continue
data = section.get_data()
data = memoryview(section.get_data())
total_pairs_count += len(data) - 1
for a, b in zip(data[:], data[1:]):
pair_counts[a * 0x100 + b] += 1

a = np.frombuffer(data[:-1], dtype=np.uint8).astype(np.uint16)
b = np.frombuffer(data[1:], dtype=np.uint8)
pairs = (a << 8) | b
count = np.bincount(pairs, minlength=1 << 16)
pair_counts += count

top_n_pairs = 512
sorted_pairs = sorted(
Expand Down
Binary file modified scripts/requirements.txt
Binary file not shown.
7 changes: 6 additions & 1 deletion src/Scanner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,8 @@ namespace hat::detail {

template<>
scan_function_t resolve_scanner<scan_mode::Auto>(scan_context& context) {
#if defined(LIBHAT_X86) || defined(LIBHAT_X86_64)
const auto& ext = get_system().extensions;
#if defined(LIBHAT_X86) || defined(LIBHAT_X86_64)
if (ext.bmi) {
#if defined(LIBHAT_X86_64) && !defined(LIBHAT_DISABLE_AVX512)
if (ext.avx512f && ext.avx512bw) {
Expand All @@ -83,6 +83,11 @@ namespace hat::detail {
return resolve_scanner<scan_mode::SSE>(context);
}
#endif
#endif
#if defined(LIBHAT_ARM) || defined(LIBHAT_AARCH64)
if (ext.neon) {
return resolve_scanner<scan_mode::Neon>(context);
}
#endif
// If none of the vectorized implementations are available/supported, then fallback to scanning per-byte
return resolve_scanner<scan_mode::Single>(context);
Expand Down
150 changes: 150 additions & 0 deletions src/arch/arm/Neon.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
#include <libhat/defines.hpp>

#if defined(LIBHAT_ARM) || defined(LIBHAT_AARCH64)

#include <libhat/scanner.hpp>

#include <arm_neon.h>

#ifdef _MSC_VER
#include <intrin.h>

namespace hat::detail {
inline unsigned long bsf(unsigned __int64 num) noexcept {
unsigned long offset;
_BitScanForward64(&offset, num);
return offset;
}
}

#define LIBHAT_BSF64(num) hat::detail::bsf(num)
#else
#define LIBHAT_BSF64(num) __builtin_ctzll(num)
#endif

namespace hat::detail {

inline void load_signature_128(const signature_view signature, uint8x16_t& bytes, uint8x16_t& mask) {
uint8_t byteBuffer[16]{}; // The remaining signature bytes
uint8_t maskBuffer[16]{}; // A bitmask for the signature bytes we care about
for (size_t i = 0; i < signature.size(); i++) {
byteBuffer[i] = std::to_integer<uint8_t>(signature[i].value());
maskBuffer[i] = std::to_integer<uint8_t>(signature[i].mask());
}
bytes = vld1q_u8(static_cast<const uint8_t*>(byteBuffer));
mask = vld1q_u8(static_cast<const uint8_t*>(maskBuffer));
}

template<scan_alignment alignment>
LIBHAT_FORCEINLINE consteval uint64_t create_alignment_mask_neon() {
uint64_t mask{};
for (size_t i = 0; i < 16; i += alignment_stride<alignment>) {
mask |= (static_cast<uint64_t>(0xF) << (i * 4));
}
return mask;
}

template<scan_alignment alignment, bool cmpeq2, bool veccmp>
const_scan_result find_pattern_neon(const std::byte* begin, const std::byte* end, const scan_context& context) {
const auto signature = context.signature;
const auto cmpIndex = cmpeq2 ? *context.pairIndex : context.cmpIndex;

// 128 bit vector containing first signature byte repeated
const auto firstByte = vdupq_n_u8(static_cast<uint8_t>(*signature[cmpIndex]));

uint8x16_t secondByte;
if constexpr (cmpeq2) {
secondByte = vdupq_n_u8(static_cast<uint8_t>(*signature[cmpIndex + 1]));
}

uint8x16_t signatureBytes, signatureMask;
if constexpr (veccmp) {
load_signature_128(signature, signatureBytes, signatureMask);
}

auto [pre, vec, post] = segment_scan<uint8x16_t, veccmp>(begin, end, signature.size(), cmpIndex);

if (!pre.empty()) {
const auto result = find_pattern_single<alignment>(pre.data(), pre.data() + pre.size(), context);
if (result.has_result()) {
return result;
}
}

const auto vec_begin = std::to_address(vec.begin());
const auto vec_end = std::to_address(vec.end());
for (auto it = vec_begin; it != vec_end; it++) {
auto cmp = vceqq_u8(firstByte, vld1q_u8(reinterpret_cast<const uint8_t*>(it)));

if constexpr (cmpeq2) {
const auto cmp2 = vceqq_u8(secondByte, vld1q_u8(reinterpret_cast<const uint8_t*>(it) + 1));
cmp = vandq_u8(cmp, cmp2);
}

auto mask = vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(cmp), 4)), 0);
if constexpr (alignment != scan_alignment::X1) {
mask &= std::rotl(create_alignment_mask_neon<alignment>(), static_cast<int>(cmpIndex) * 4);
}

while (mask) {
const auto offset = LIBHAT_BSF64(mask);
const auto i = reinterpret_cast<const std::byte*>(it) + (offset >> 2) - cmpIndex;
if constexpr (veccmp) {
const auto data = vld1q_u8(reinterpret_cast<const uint8_t*>(i));
const auto neqBits = veorq_u8(data, signatureBytes);
const auto match = vreinterpretq_u64_u8(vandq_u8(neqBits, signatureMask));
if (!(vgetq_lane_u64(match, 0) | vgetq_lane_u64(match, 1))) LIBHAT_UNLIKELY {
return i;
}
} else {
const auto match = std::equal(signature.begin(), signature.end(), i);
if (match) LIBHAT_UNLIKELY {
return i;
}
}
// thanks msvc?
// mask &= ~(0xF * (mask & (~mask + 1)));
mask ^= (uint64_t{0xF} << offset);
}
}

if (!post.empty()) {
return find_pattern_single<alignment>(post.data(), post.data() + post.size(), context);
}
return {};
}

template<>
scan_function_t resolve_scanner<scan_mode::Neon>(scan_context& context) {
context.apply_hints({.vectorSize = 16});

const auto alignment = context.alignment;
const auto signature = context.signature;
const bool cmpeq2 = context.pairIndex.has_value();
const bool veccmp = signature.size() <= 16;

if (alignment == scan_alignment::X1) {
if (cmpeq2 && veccmp) {
return &find_pattern_neon<scan_alignment::X1, true, true>;
} else if (cmpeq2) {
return &find_pattern_neon<scan_alignment::X1, true, false>;
} else if (veccmp) {
return &find_pattern_neon<scan_alignment::X1, false, true>;
} else {
return &find_pattern_neon<scan_alignment::X1, false, false>;
}
} else if (alignment == scan_alignment::X16) {
if (cmpeq2 && veccmp) {
return &find_pattern_neon<scan_alignment::X16, true, true>;
} else if (cmpeq2) {
return &find_pattern_neon<scan_alignment::X16, true, false>;
} else if (veccmp) {
return &find_pattern_neon<scan_alignment::X16, false, true>;
} else {
return &find_pattern_neon<scan_alignment::X16, false, false>;
}
}
LIBHAT_UNREACHABLE();
}
}
#endif
25 changes: 24 additions & 1 deletion src/arch/arm/System.cpp
Original file line number Diff line number Diff line change
@@ -1,9 +1,32 @@
#include <libhat/defines.hpp>
#ifdef LIBHAT_ARM
#if defined(LIBHAT_ARM) || defined(LIBHAT_AARCH64)

#include <libhat/system.hpp>

#if defined(LIBHAT_WINDOWS) || defined(LIBHAT_MAC)
namespace hat {
system_info_arm::system_info_arm() {
this->extensions.neon = true;
}
}
#endif

#if defined(LIBHAT_LINUX)

#include <asm/hwcap.h>
#include <sys/auxv.h>

namespace hat {
system_info_arm::system_info_arm() {
#if defined(LIBHAT_ARM)
unsigned long hwcap = getauxval(AT_HWCAP);
this->extensions.neon = (hwcap & HWCAP_NEON) != 0;
#else // AARCH64
unsigned long hwcap = getauxval(AT_HWCAP);
this->extensions.neon = (hwcap & HWCAP_ASIMD) != 0;
#endif
}
}
#endif

#endif
Loading