From 9c3806e280c6656e71a740ac187ac9fb439a39b7 Mon Sep 17 00:00:00 2001
From: lkdvos
Date: Wed, 17 Jun 2026 11:11:06 -0400
Subject: [PATCH 1/7] Don't force-inline StridedView setup/view-construction helpers

`permutedims`, `sreshape`, the SliceIndex `getindex`/`sview` view
constructors, and the
`_computeviewsize`/`_computeviewstrides`/`_computeviewoffset` helpers are
all "once-per-operation" setup steps, not hot inner-loop code. Forcing
`@inline` on them duplicated their per-N size/stride/offset/permute
computation into every downstream specialization and re-inferred it per
shape, bloating compile times. Dropping `@inline` lets each compile once
per signature and dedup across callers.

The hot indexing path is deliberately left inlined: scalar
`getindex`/`setindex!` and `_computeind` keep `@inline`, as does the
trivial `_normalizeparent` accessor.

Measured (Julia 1.12.6):
- Downstream TensorOperations dynamic-ncon grid: TTFX 42.1s -> 31.6s
  (-25%) from de-inlining permutedims/sreshape, with no runtime
  regression (StridedBLAS vs BaseCopy results agree to 3e-16).
- StridedViews-local A/B vs origin/main: view construction 4.35ns ->
  4.35ns and the scalar getindex hot loop 20.22us -> 20.24us, i.e.
  steady-state runtime unchanged for the additionally de-inlined view
  helpers.

Co-Authored-By: Claude Opus 4.8 (1M context) --- src/auxiliary.jl | 6 +++--- src/stridedview.jl | 15 +++++++++------ 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/src/auxiliary.jl b/src/auxiliary.jl index 0b19941..b36af18 100644 --- a/src/auxiliary.jl +++ b/src/auxiliary.jl @@ -58,7 +58,7 @@ end #------------------------------ # Compute the new dimensions of a strided view given the original size and the view slicing # indices -@inline function _computeviewsize(oldsize::NTuple{N, Int}, I::NTuple{N, SliceIndex}) where {N} +function _computeviewsize(oldsize::NTuple{N, Int}, I::NTuple{N, SliceIndex}) where {N} if isa(I[1], Int) return _computeviewsize(tail(oldsize), tail(I)) elseif isa(I[1], Colon) @@ -71,7 +71,7 @@ _computeviewsize(::Tuple{}, ::Tuple{}) = () # Compute the new strides of a (strided) view given the original strides and the view # slicing indices -@inline function _computeviewstrides( +function _computeviewstrides( oldstrides::NTuple{N, Int}, I::NTuple{N, SliceIndex} ) where {N} @@ -90,7 +90,7 @@ _computeviewstrides(::Tuple{}, ::Tuple{}) = () # Compute the additional offset of a (strided) view given the original strides and the view # slicing indices -@inline function _computeviewoffset( +function _computeviewoffset( strides::NTuple{N, Int}, I::NTuple{N, SliceIndex} ) where {N} diff --git a/src/stridedview.jl b/src/stridedview.jl index b745574..4d8a73a 100644 --- a/src/stridedview.jl +++ b/src/stridedview.jl @@ -138,8 +138,11 @@ end return a end -# Indexing with slice indices to create a new view -@inline function Base.getindex(a::StridedView{T, N}, I::Vararg{SliceIndex, N}) where {T, N} +# Indexing with slice indices to create a new view. +# This builds a *new* view (a once-per-operation setup step, not a hot inner loop), so we +# deliberately do not force-inline it: `@inline` here duplicated the per-N size/stride/offset +# computation into every downstream caller instead of compiling it once per signature. 
+function Base.getindex(a::StridedView{T, N}, I::Vararg{SliceIndex, N}) where {T, N} return StridedView{T}( a.parent, _computeviewsize(a.size, I), @@ -179,7 +182,7 @@ function Base.conj(a::StridedView) return StridedView{T}(a.parent, a.size, a.strides, a.offset, newop) end -@inline function Base.permutedims(a::StridedView{T, N}, p) where {T, N} +function Base.permutedims(a::StridedView{T, N}, p) where {T, N} _isperm(N, p) || throw(ArgumentError("Invalid permutation of length $N: $p")) newsize = ntuple(n -> size(a, p[n]), Val(N)) newstrides = ntuple(n -> stride(a, p[n]), Val(N)) @@ -228,10 +231,10 @@ sview(a::StridedView, I::SliceIndex) = getindex(sreshape(a, (length(a),)), I) Base.view(a::StridedView{<:Any, N}, I::Vararg{SliceIndex, N}) where {N} = getindex(a, I...) # `sview` can be used as a constructor when acting on `AbstractArray` objects -@inline function sview(a::AbstractArray{<:Any, N}, I::Vararg{SliceIndex, N}) where {N} +function sview(a::AbstractArray{<:Any, N}, I::Vararg{SliceIndex, N}) where {N} return getindex(StridedView(a), I...) 
end -@inline function sview(a::AbstractArray, I::SliceIndex) +function sview(a::AbstractArray, I::SliceIndex) return getindex(sreshape(StridedView(a), (length(a),)), I) end @@ -251,7 +254,7 @@ end # we cannot use Base.reshape, as this also accepts indices that might not preserve # stridedness sreshape(a, args::Vararg{Int}) = sreshape(a, args) -@inline function sreshape(a::StridedView{T}, newsize::Dims) where {T} +function sreshape(a::StridedView{T}, newsize::Dims) where {T} if any(isequal(0), newsize) any(isequal(0), size(a)) || throw(DimensionMismatch()) newstrides = one.(newsize) From f83ec7b6f46a06d34f7e382bfa092b82ad2dade6 Mon Sep 17 00:00:00 2001 From: lkdvos Date: Wed, 17 Jun 2026 11:11:16 -0400 Subject: [PATCH 2/7] Add a small PrecompileTools workload for core StridedView ops Warm the core `StridedView` specializations for the BLAS element types (`Float32`, `Float64`, `ComplexF32`, `ComplexF64`) over ndims 1:4 plus the 2D transpose/adjoint cases: construction, `permutedims`, `sreshape`, `sview`/slice `getindex`, `conj`, `transpose`/`adjoint`, and `size`/`strides`/`offset`. These are exactly the specializations downstream packages hit on their first call, so caching them removes that first-call latency. The workload is intentionally kept small (BLAS floats, ndims 1:4, identity/conj plus the 2D wrappers) to keep StridedViews' own precompile bounded. Measured (Julia 1.12.6, cold compiled-cache depot): - StridedViews cold precompile: ~0.53s -> ~2.29s (Pkg build line), i.e. ~+1.76s one-time, bounded. - First-call latency of the exercised core ops in a fresh process: ~1.78s -> ~0.027s (~66x), the inference cost being moved into the cached precompile. Bumps version to 0.5.2 and adds PrecompileTools to [deps]/[compat]. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- Project.toml | 4 +++- src/StridedViews.jl | 1 + src/precompile.jl | 42 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 46 insertions(+), 1 deletion(-) create mode 100644 src/precompile.jl diff --git a/Project.toml b/Project.toml index b2aeba2..f303670 100644 --- a/Project.toml +++ b/Project.toml @@ -1,10 +1,11 @@ name = "StridedViews" uuid = "4db3bf67-4bd7-4b4e-b153-31dc3fb37143" authors = ["Lukas Devos ", "Jutho Haegeman "] -version = "0.5.1" +version = "0.5.2" [deps] LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a" [weakdeps] AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" @@ -28,6 +29,7 @@ CUDACore = "6" JLArrays = "0.3.1" LinearAlgebra = "1" Metal = "1" +PrecompileTools = "1" PtrArrays = "1.2.0" julia = "1.10" diff --git a/src/StridedViews.jl b/src/StridedViews.jl index cfa01a9..dc2e1c2 100644 --- a/src/StridedViews.jl +++ b/src/StridedViews.jl @@ -9,5 +9,6 @@ export StridedView, sreshape, sview, isstrided include("auxiliary.jl") include("stridedview.jl") +include("precompile.jl") end diff --git a/src/precompile.jl b/src/precompile.jl new file mode 100644 index 0000000..797d349 --- /dev/null +++ b/src/precompile.jl @@ -0,0 +1,42 @@ +# Precompilation workload +# ------------------------ +# Cache the core `StridedView` specializations for the BLAS element types over a small +# range of dimensionalities and the op-wrappers a `StridedView` can carry. These are the +# specializations that downstream packages (e.g. TensorOperations / Strided) hit on their +# first call, so warming them here removes that first-call latency. +# +# The workload is deliberately kept small (BLAS floats, ndims 1:4, identity/conj plus the +# 2D transpose/adjoint cases) so that it adds only a bounded amount to StridedViews' own +# precompile time. 
+using PrecompileTools: @setup_workload, @compile_workload + +@setup_workload begin + @compile_workload begin + for T in (Float32, Float64, ComplexF32, ComplexF64) + # construction + property queries + core ops for ndims 1:4 + for N in 1:4 + A = Array{T, N}(undef, ntuple(_ -> 2, N)) + sv = StridedView(A) + size(sv) + strides(sv) + offset(sv) + conj(sv) + # permute through the identity permutation (exercises the per-N path) + permutedims(sv, ntuple(identity, N)) + # reshape to a flat vector and back (also exercises sview on the flat view) + flat = sreshape(sv, (length(sv),)) + sview(flat, 1:length(sv)) + getindex(sv, ntuple(_ -> 1, N)...) + end + # 2D matrix wrappers: transpose / adjoint + M = Array{T, 2}(undef, 2, 2) + svM = StridedView(M) + transpose(svM) + adjoint(svM) + # a representative 4D slice view (the SliceIndex `getindex` construction path) + A4 = Array{T, 4}(undef, 2, 2, 2, 2) + sv4 = StridedView(A4) + getindex(sv4, :, 1:2, 1, 1:2) + end + end +end From 565c833a7deff4856c34ee881878f8e7040ccf9f Mon Sep 17 00:00:00 2001 From: lkdvos Date: Wed, 17 Jun 2026 11:27:29 -0400 Subject: [PATCH 3/7] restore inline for recursive functions --- src/auxiliary.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/auxiliary.jl b/src/auxiliary.jl index b36af18..0b19941 100644 --- a/src/auxiliary.jl +++ b/src/auxiliary.jl @@ -58,7 +58,7 @@ end #------------------------------ # Compute the new dimensions of a strided view given the original size and the view slicing # indices -function _computeviewsize(oldsize::NTuple{N, Int}, I::NTuple{N, SliceIndex}) where {N} +@inline function _computeviewsize(oldsize::NTuple{N, Int}, I::NTuple{N, SliceIndex}) where {N} if isa(I[1], Int) return _computeviewsize(tail(oldsize), tail(I)) elseif isa(I[1], Colon) @@ -71,7 +71,7 @@ _computeviewsize(::Tuple{}, ::Tuple{}) = () # Compute the new strides of a (strided) view given the original strides and the view # slicing indices -function _computeviewstrides( +@inline 
function _computeviewstrides( oldstrides::NTuple{N, Int}, I::NTuple{N, SliceIndex} ) where {N} @@ -90,7 +90,7 @@ _computeviewstrides(::Tuple{}, ::Tuple{}) = () # Compute the additional offset of a (strided) view given the original strides and the view # slicing indices -function _computeviewoffset( +@inline function _computeviewoffset( strides::NTuple{N, Int}, I::NTuple{N, SliceIndex} ) where {N} From 6937561d331345240c0810ce9765b1cea30b775a Mon Sep 17 00:00:00 2001 From: lkdvos Date: Wed, 17 Jun 2026 11:27:41 -0400 Subject: [PATCH 4/7] increase precompile workload --- src/precompile.jl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/precompile.jl b/src/precompile.jl index 797d349..2aae20c 100644 --- a/src/precompile.jl +++ b/src/precompile.jl @@ -4,29 +4,29 @@ # range of dimensionalities and the op-wrappers a `StridedView` can carry. These are the # specializations that downstream packages (e.g. TensorOperations / Strided) hit on their # first call, so warming them here removes that first-call latency. -# -# The workload is deliberately kept small (BLAS floats, ndims 1:4, identity/conj plus the -# 2D transpose/adjoint cases) so that it adds only a bounded amount to StridedViews' own -# precompile time. using PrecompileTools: @setup_workload, @compile_workload @setup_workload begin @compile_workload begin for T in (Float32, Float64, ComplexF32, ComplexF64) # construction + property queries + core ops for ndims 1:4 - for N in 1:4 + for N in 1:6 A = Array{T, N}(undef, ntuple(_ -> 2, N)) sv = StridedView(A) size(sv) strides(sv) offset(sv) - conj(sv) + csv = conj(sv) # permute through the identity permutation (exercises the per-N path) permutedims(sv, ntuple(identity, N)) + permutedims(csv, ntuple(identity, N)) # reshape to a flat vector and back (also exercises sview on the flat view) flat = sreshape(sv, (length(sv),)) sview(flat, 1:length(sv)) getindex(sv, ntuple(_ -> 1, N)...) 
+ flat = sreshape(csv, (length(sv),)) + sview(flat, 1:length(csv)) + getindex(csv, ntuple(_ -> 1, N)...) end # 2D matrix wrappers: transpose / adjoint M = Array{T, 2}(undef, 2, 2) From c798328c48b39b572410363c568a1c412e8013fe Mon Sep 17 00:00:00 2001 From: lkdvos Date: Wed, 17 Jun 2026 11:27:49 -0400 Subject: [PATCH 5/7] remove slop --- src/stridedview.jl | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/stridedview.jl b/src/stridedview.jl index 4d8a73a..b7e93c0 100644 --- a/src/stridedview.jl +++ b/src/stridedview.jl @@ -139,9 +139,6 @@ end end # Indexing with slice indices to create a new view. -# This builds a *new* view (a once-per-operation setup step, not a hot inner loop), so we -# deliberately do not force-inline it: `@inline` here duplicated the per-N size/stride/offset -# computation into every downstream caller instead of compiling it once per signature. function Base.getindex(a::StridedView{T, N}, I::Vararg{SliceIndex, N}) where {T, N} return StridedView{T}( a.parent, @@ -251,8 +248,7 @@ function Base.show(io::IO, e::ReshapeException) return print(io, msg) end -# we cannot use Base.reshape, as this also accepts indices that might not preserve -# stridedness +# we cannot use Base.reshape, as this also accepts indices that might not preserve stridedness sreshape(a, args::Vararg{Int}) = sreshape(a, args) function sreshape(a::StridedView{T}, newsize::Dims) where {T} if any(isequal(0), newsize) From 4c04cf9634bba534bf3405bb918b1f497bfd32db Mon Sep 17 00:00:00 2001 From: lkdvos Date: Wed, 17 Jun 2026 11:40:45 -0400 Subject: [PATCH 6/7] bump precompiletools version --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index f303670..68fd3cf 100644 --- a/Project.toml +++ b/Project.toml @@ -29,7 +29,7 @@ CUDACore = "6" JLArrays = "0.3.1" LinearAlgebra = "1" Metal = "1" -PrecompileTools = "1" +PrecompileTools = "1.1" PtrArrays = "1.2.0" julia = "1.10" From 
879f83c447b2e1307bf907f9ce875e7f278deff4 Mon Sep 17 00:00:00 2001 From: lkdvos Date: Wed, 17 Jun 2026 12:46:40 -0400 Subject: [PATCH 7/7] attempt to fix JET --- test/jet/Project.toml | 3 --- test/jet/jet.jl | 1 + 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/test/jet/Project.toml b/test/jet/Project.toml index 0c19e5a..68090c6 100644 --- a/test/jet/Project.toml +++ b/test/jet/Project.toml @@ -4,8 +4,5 @@ name = "StridedViewsJETTest" JET = "c3a54625-cd67-489e-a8e7-0a5a0ff4e31b" StridedViews = "4db3bf67-4bd7-4b4e-b153-31dc3fb37143" -[sources] -StridedViews = {path = "../.."} - [compat] JET = "0.9, 0.10, 0.11" diff --git a/test/jet/jet.jl b/test/jet/jet.jl index 2477de8..4955531 100644 --- a/test/jet/jet.jl +++ b/test/jet/jet.jl @@ -2,6 +2,7 @@ import Pkg try Pkg.activate(joinpath(@__DIR__); io = devnull) + Pkg.develop(Pkg.PackageSpec(path = joinpath(@__DIR__, "..", "..")); io = devnull) Pkg.instantiate(; io = devnull) @eval import JET JET.test_package(StridedViews; target_modules = (StridedViews,))