From 9c3806e280c6656e71a740ac187ac9fb439a39b7 Mon Sep 17 00:00:00 2001
From: lkdvos <ldevos98@gmail.com>
Date: Wed, 17 Jun 2026 11:11:06 -0400
Subject: [PATCH 1/7] Don't force-inline StridedView setup/view-construction
 helpers

`permutedims`, `sreshape`, the SliceIndex `getindex`/`sview` view constructors,
and the `_computeviewsize`/`_computeviewstrides`/`_computeviewoffset` helpers are
all "once-per-operation" setup steps, not hot inner-loop code. Forcing `@inline`
on them duplicated their per-N size/stride/offset/permute computation into every
downstream specialization and re-inferred it per shape, bloating compile times.

Dropping `@inline` lets each of them compile once per signature, so the compiled
code is shared across callers instead of being duplicated into each one.
The hot indexing path is deliberately left inlined: scalar `getindex`/`setindex!`
and `_computeind` keep `@inline`, as does the trivial `_normalizeparent` accessor.

Measured (Julia 1.12.6):
- Downstream TensorOperations dynamic-ncon grid: TTFX 42.1s -> 31.6s (-25%) from
  de-inlining permutedims/sreshape, with no runtime regression (StridedBLAS vs
  BaseCopy results agree to 3e-16).
- StridedViews-local A/B vs origin/main: view construction 4.35ns -> 4.35ns and
  the scalar getindex hot loop 20.22us -> 20.24us, i.e. steady-state runtime
  unchanged for the additionally de-inlined view helpers.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/auxiliary.jl   |  6 +++---
 src/stridedview.jl | 15 +++++++++------
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/src/auxiliary.jl b/src/auxiliary.jl
index 0b19941..b36af18 100644
--- a/src/auxiliary.jl
+++ b/src/auxiliary.jl
@@ -58,7 +58,7 @@ end
 #------------------------------
 # Compute the new dimensions of a strided view given the original size and the view slicing
 # indices
-@inline function _computeviewsize(oldsize::NTuple{N, Int}, I::NTuple{N, SliceIndex}) where {N}
+function _computeviewsize(oldsize::NTuple{N, Int}, I::NTuple{N, SliceIndex}) where {N}
     if isa(I[1], Int)
         return _computeviewsize(tail(oldsize), tail(I))
     elseif isa(I[1], Colon)
@@ -71,7 +71,7 @@ _computeviewsize(::Tuple{}, ::Tuple{}) = ()
 
 # Compute the new strides of a (strided) view given the original strides and the view
 # slicing indices
-@inline function _computeviewstrides(
+function _computeviewstrides(
         oldstrides::NTuple{N, Int},
         I::NTuple{N, SliceIndex}
     ) where {N}
@@ -90,7 +90,7 @@ _computeviewstrides(::Tuple{}, ::Tuple{}) = ()
 
 # Compute the additional offset of a (strided) view given the original strides and the view
 # slicing indices
-@inline function _computeviewoffset(
+function _computeviewoffset(
         strides::NTuple{N, Int},
         I::NTuple{N, SliceIndex}
     ) where {N}
diff --git a/src/stridedview.jl b/src/stridedview.jl
index b745574..4d8a73a 100644
--- a/src/stridedview.jl
+++ b/src/stridedview.jl
@@ -138,8 +138,11 @@ end
     return a
 end
 
-# Indexing with slice indices to create a new view
-@inline function Base.getindex(a::StridedView{T, N}, I::Vararg{SliceIndex, N}) where {T, N}
+# Indexing with slice indices to create a new view.
+# This builds a *new* view (a once-per-operation setup step, not a hot inner loop), so we
+# deliberately do not force-inline it: `@inline` here duplicated the per-N size/stride/offset
+# computation into every downstream caller instead of compiling it once per signature.
+function Base.getindex(a::StridedView{T, N}, I::Vararg{SliceIndex, N}) where {T, N}
     return StridedView{T}(
         a.parent,
         _computeviewsize(a.size, I),
@@ -179,7 +182,7 @@ function Base.conj(a::StridedView)
     return StridedView{T}(a.parent, a.size, a.strides, a.offset, newop)
 end
 
-@inline function Base.permutedims(a::StridedView{T, N}, p) where {T, N}
+function Base.permutedims(a::StridedView{T, N}, p) where {T, N}
     _isperm(N, p) || throw(ArgumentError("Invalid permutation of length $N: $p"))
     newsize = ntuple(n -> size(a, p[n]), Val(N))
     newstrides = ntuple(n -> stride(a, p[n]), Val(N))
@@ -228,10 +231,10 @@ sview(a::StridedView, I::SliceIndex) = getindex(sreshape(a, (length(a),)), I)
 Base.view(a::StridedView{<:Any, N}, I::Vararg{SliceIndex, N}) where {N} = getindex(a, I...)
 
 # `sview` can be used as a constructor when acting on `AbstractArray` objects
-@inline function sview(a::AbstractArray{<:Any, N}, I::Vararg{SliceIndex, N}) where {N}
+function sview(a::AbstractArray{<:Any, N}, I::Vararg{SliceIndex, N}) where {N}
     return getindex(StridedView(a), I...)
 end
-@inline function sview(a::AbstractArray, I::SliceIndex)
+function sview(a::AbstractArray, I::SliceIndex)
     return getindex(sreshape(StridedView(a), (length(a),)), I)
 end
 
@@ -251,7 +254,7 @@ end
 # we cannot use Base.reshape, as this also accepts indices that might not preserve
 # stridedness
 sreshape(a, args::Vararg{Int}) = sreshape(a, args)
-@inline function sreshape(a::StridedView{T}, newsize::Dims) where {T}
+function sreshape(a::StridedView{T}, newsize::Dims) where {T}
     if any(isequal(0), newsize)
         any(isequal(0), size(a)) || throw(DimensionMismatch())
         newstrides = one.(newsize)

From f83ec7b6f46a06d34f7e382bfa092b82ad2dade6 Mon Sep 17 00:00:00 2001
From: lkdvos <ldevos98@gmail.com>
Date: Wed, 17 Jun 2026 11:11:16 -0400
Subject: [PATCH 2/7] Add a small PrecompileTools workload for core StridedView
 ops

Warm the core `StridedView` specializations for the BLAS element types
(`Float32`, `Float64`, `ComplexF32`, `ComplexF64`) over ndims 1:4 plus the 2D
transpose/adjoint cases: construction, `permutedims`, `sreshape`, `sview`/slice
`getindex`, `conj`, `transpose`/`adjoint`, and `size`/`strides`/`offset`. These
are exactly the specializations downstream packages hit on their first call, so
caching them removes that first-call latency.

The workload is intentionally kept small (BLAS floats, ndims 1:4, identity/conj
plus the 2D wrappers) to keep StridedViews' own precompile bounded.

Measured (Julia 1.12.6, cold compiled-cache depot):
- StridedViews cold precompile: ~0.53s -> ~2.29s (Pkg build line), i.e. ~+1.76s
  one-time, bounded.
- First-call latency of the exercised core ops in a fresh process: ~1.78s ->
  ~0.027s (~66x), the inference cost being moved into the cached precompile.

Bumps version to 0.5.2 and adds PrecompileTools to [deps]/[compat].

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 Project.toml        |  4 +++-
 src/StridedViews.jl |  1 +
 src/precompile.jl   | 42 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 46 insertions(+), 1 deletion(-)
 create mode 100644 src/precompile.jl

diff --git a/Project.toml b/Project.toml
index b2aeba2..f303670 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,10 +1,11 @@
 name = "StridedViews"
 uuid = "4db3bf67-4bd7-4b4e-b153-31dc3fb37143"
 authors = ["Lukas Devos <lukas.devos@ugent.be>", "Jutho Haegeman <jutho.haegeman@ugent.be>"]
-version = "0.5.1"
+version = "0.5.2"
 
 [deps]
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
 
 [weakdeps]
 AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
@@ -28,6 +29,7 @@ CUDACore = "6"
 JLArrays = "0.3.1"
 LinearAlgebra = "1"
 Metal = "1"
+PrecompileTools = "1"
 PtrArrays = "1.2.0"
 julia = "1.10"
 
diff --git a/src/StridedViews.jl b/src/StridedViews.jl
index cfa01a9..dc2e1c2 100644
--- a/src/StridedViews.jl
+++ b/src/StridedViews.jl
@@ -9,5 +9,6 @@ export StridedView, sreshape, sview, isstrided
 
 include("auxiliary.jl")
 include("stridedview.jl")
+include("precompile.jl")
 
 end
diff --git a/src/precompile.jl b/src/precompile.jl
new file mode 100644
index 0000000..797d349
--- /dev/null
+++ b/src/precompile.jl
@@ -0,0 +1,42 @@
+# Precompilation workload
+# ------------------------
+# Cache the core `StridedView` specializations for the BLAS element types over a small
+# range of dimensionalities and the op-wrappers a `StridedView` can carry. These are the
+# specializations that downstream packages (e.g. TensorOperations / Strided) hit on their
+# first call, so warming them here removes that first-call latency.
+#
+# The workload is deliberately kept small (BLAS floats, ndims 1:4, identity/conj plus the
+# 2D transpose/adjoint cases) so that it adds only a bounded amount to StridedViews' own
+# precompile time.
+using PrecompileTools: @setup_workload, @compile_workload
+
+@setup_workload begin
+    @compile_workload begin
+        for T in (Float32, Float64, ComplexF32, ComplexF64)
+            # construction + property queries + core ops for ndims 1:4
+            for N in 1:4
+                A = Array{T, N}(undef, ntuple(_ -> 2, N))
+                sv = StridedView(A)
+                size(sv)
+                strides(sv)
+                offset(sv)
+                conj(sv)
+                # permute through the identity permutation (exercises the per-N path)
+                permutedims(sv, ntuple(identity, N))
+                # reshape to a flat vector and back (also exercises sview on the flat view)
+                flat = sreshape(sv, (length(sv),))
+                sview(flat, 1:length(sv))
+                getindex(sv, ntuple(_ -> 1, N)...)
+            end
+            # 2D matrix wrappers: transpose / adjoint
+            M = Array{T, 2}(undef, 2, 2)
+            svM = StridedView(M)
+            transpose(svM)
+            adjoint(svM)
+            # a representative 4D slice view (the SliceIndex `getindex` construction path)
+            A4 = Array{T, 4}(undef, 2, 2, 2, 2)
+            sv4 = StridedView(A4)
+            getindex(sv4, :, 1:2, 1, 1:2)
+        end
+    end
+end

From 565c833a7deff4856c34ee881878f8e7040ccf9f Mon Sep 17 00:00:00 2001
From: lkdvos <ldevos98@gmail.com>
Date: Wed, 17 Jun 2026 11:27:29 -0400
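Subject: [PATCH 3/7] restore inline for recursive functions

Re-add `@inline` to `_computeviewsize`, `_computeviewstrides` and
`_computeviewoffset`, undoing the src/auxiliary.jl part of patch 1/7. These are
the recursive helpers: e.g. `_computeviewsize` calls itself on `tail(oldsize)`
and `tail(I)`. The other functions de-inlined in patch 1/7 (the SliceIndex
`getindex`, `permutedims`, `sview` and `sreshape`) are left without `@inline`.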

---
 src/auxiliary.jl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/auxiliary.jl b/src/auxiliary.jl
index b36af18..0b19941 100644
--- a/src/auxiliary.jl
+++ b/src/auxiliary.jl
@@ -58,7 +58,7 @@ end
 #------------------------------
 # Compute the new dimensions of a strided view given the original size and the view slicing
 # indices
-function _computeviewsize(oldsize::NTuple{N, Int}, I::NTuple{N, SliceIndex}) where {N}
+@inline function _computeviewsize(oldsize::NTuple{N, Int}, I::NTuple{N, SliceIndex}) where {N}
     if isa(I[1], Int)
         return _computeviewsize(tail(oldsize), tail(I))
     elseif isa(I[1], Colon)
@@ -71,7 +71,7 @@ _computeviewsize(::Tuple{}, ::Tuple{}) = ()
 
 # Compute the new strides of a (strided) view given the original strides and the view
 # slicing indices
-function _computeviewstrides(
+@inline function _computeviewstrides(
         oldstrides::NTuple{N, Int},
         I::NTuple{N, SliceIndex}
     ) where {N}
@@ -90,7 +90,7 @@ _computeviewstrides(::Tuple{}, ::Tuple{}) = ()
 
 # Compute the additional offset of a (strided) view given the original strides and the view
 # slicing indices
-function _computeviewoffset(
+@inline function _computeviewoffset(
         strides::NTuple{N, Int},
         I::NTuple{N, SliceIndex}
     ) where {N}

From 6937561d331345240c0810ce9765b1cea30b775a Mon Sep 17 00:00:00 2001
From: lkdvos <ldevos98@gmail.com>
Date: Wed, 17 Jun 2026 11:27:41 -0400
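Subject: [PATCH 4/7] increase precompile workload

Extend the workload from ndims 1:4 to ndims 1:6 and additionally run
`permutedims`, `sreshape`, `sview` and scalar `getindex` on the `conj` view, not
only on the plain view. The header note about keeping the workload deliberately
small is removed accordingly.

Note: the in-loop comment in src/precompile.jl still says "ndims 1:4" after this
change and should be updated to "ndims 1:6".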

---
 src/precompile.jl | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/precompile.jl b/src/precompile.jl
index 797d349..2aae20c 100644
--- a/src/precompile.jl
+++ b/src/precompile.jl
@@ -4,29 +4,29 @@
 # range of dimensionalities and the op-wrappers a `StridedView` can carry. These are the
 # specializations that downstream packages (e.g. TensorOperations / Strided) hit on their
 # first call, so warming them here removes that first-call latency.
-#
-# The workload is deliberately kept small (BLAS floats, ndims 1:4, identity/conj plus the
-# 2D transpose/adjoint cases) so that it adds only a bounded amount to StridedViews' own
-# precompile time.
 using PrecompileTools: @setup_workload, @compile_workload
 
 @setup_workload begin
     @compile_workload begin
         for T in (Float32, Float64, ComplexF32, ComplexF64)
             # construction + property queries + core ops for ndims 1:4
-            for N in 1:4
+            for N in 1:6
                 A = Array{T, N}(undef, ntuple(_ -> 2, N))
                 sv = StridedView(A)
                 size(sv)
                 strides(sv)
                 offset(sv)
-                conj(sv)
+                csv = conj(sv)
                 # permute through the identity permutation (exercises the per-N path)
                 permutedims(sv, ntuple(identity, N))
+                permutedims(csv, ntuple(identity, N))
                 # reshape to a flat vector and back (also exercises sview on the flat view)
                 flat = sreshape(sv, (length(sv),))
                 sview(flat, 1:length(sv))
                 getindex(sv, ntuple(_ -> 1, N)...)
+                flat = sreshape(csv, (length(sv),))
+                sview(flat, 1:length(csv))
+                getindex(csv, ntuple(_ -> 1, N)...)
             end
             # 2D matrix wrappers: transpose / adjoint
             M = Array{T, 2}(undef, 2, 2)

From c798328c48b39b572410363c568a1c412e8013fe Mon Sep 17 00:00:00 2001
From: lkdvos <ldevos98@gmail.com>
Date: Wed, 17 Jun 2026 11:27:49 -0400
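Subject: [PATCH 5/7] remove slop

Drop the three-line rationale comment that patch 1/7 added above the SliceIndex
`getindex` method, and join the two-line comment above `sreshape` into a single
line. Comment-only change; no code is modified.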

---
 src/stridedview.jl | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/src/stridedview.jl b/src/stridedview.jl
index 4d8a73a..b7e93c0 100644
--- a/src/stridedview.jl
+++ b/src/stridedview.jl
@@ -139,9 +139,6 @@ end
 end
 
 # Indexing with slice indices to create a new view.
-# This builds a *new* view (a once-per-operation setup step, not a hot inner loop), so we
-# deliberately do not force-inline it: `@inline` here duplicated the per-N size/stride/offset
-# computation into every downstream caller instead of compiling it once per signature.
 function Base.getindex(a::StridedView{T, N}, I::Vararg{SliceIndex, N}) where {T, N}
     return StridedView{T}(
         a.parent,
@@ -251,8 +248,7 @@ function Base.show(io::IO, e::ReshapeException)
     return print(io, msg)
 end
 
-# we cannot use Base.reshape, as this also accepts indices that might not preserve
-# stridedness
+# we cannot use Base.reshape, as this also accepts indices that might not preserve stridedness
 sreshape(a, args::Vararg{Int}) = sreshape(a, args)
 function sreshape(a::StridedView{T}, newsize::Dims) where {T}
     if any(isequal(0), newsize)

From 4c04cf9634bba534bf3405bb918b1f497bfd32db Mon Sep 17 00:00:00 2001
From: lkdvos <ldevos98@gmail.com>
Date: Wed, 17 Jun 2026 11:40:45 -0400
Subject: [PATCH 6/7] bump precompiletools version

Raise the PrecompileTools [compat] entry in Project.toml from "1" to "1.1".

---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index f303670..68fd3cf 100644
--- a/Project.toml
+++ b/Project.toml
@@ -29,7 +29,7 @@ CUDACore = "6"
 JLArrays = "0.3.1"
 LinearAlgebra = "1"
 Metal = "1"
-PrecompileTools = "1"
+PrecompileTools = "1.1"
 PtrArrays = "1.2.0"
 julia = "1.10"
 

From 879f83c447b2e1307bf907f9ce875e7f278deff4 Mon Sep 17 00:00:00 2001
From: lkdvos <ldevos98@gmail.com>
Date: Wed, 17 Jun 2026 12:46:40 -0400
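Subject: [PATCH 7/7] attempt to fix JET

Remove the `[sources]` path entry for StridedViews from test/jet/Project.toml
and instead `Pkg.develop` the repository root from test/jet/jet.jl, after
activating the JET test environment and before `Pkg.instantiate`.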

---
 test/jet/Project.toml | 3 ---
 test/jet/jet.jl       | 1 +
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/test/jet/Project.toml b/test/jet/Project.toml
index 0c19e5a..68090c6 100644
--- a/test/jet/Project.toml
+++ b/test/jet/Project.toml
@@ -4,8 +4,5 @@ name = "StridedViewsJETTest"
 JET = "c3a54625-cd67-489e-a8e7-0a5a0ff4e31b"
 StridedViews = "4db3bf67-4bd7-4b4e-b153-31dc3fb37143"
 
-[sources]
-StridedViews = {path = "../.."}
-
 [compat]
 JET = "0.9, 0.10, 0.11"
diff --git a/test/jet/jet.jl b/test/jet/jet.jl
index 2477de8..4955531 100644
--- a/test/jet/jet.jl
+++ b/test/jet/jet.jl
@@ -2,6 +2,7 @@
     import Pkg
     try
         Pkg.activate(joinpath(@__DIR__); io = devnull)
+        Pkg.develop(Pkg.PackageSpec(path = joinpath(@__DIR__, "..", "..")); io = devnull)
         Pkg.instantiate(; io = devnull)
         @eval import JET
         JET.test_package(StridedViews; target_modules = (StridedViews,))