Skip to content

Commit

Permalink
Scaling loop instead of broadcasting in strided matrix exp (#56463)
Browse files Browse the repository at this point in the history
Firstly, this is easier to read. Secondly, this merges the two loops
into one. Thirdly, this avoids the broadcasting latency.
```julia
julia> using LinearAlgebra

julia> A = rand(2,2);

julia> @time LinearAlgebra.exp!(A);
  0.952597 seconds (2.35 M allocations: 116.574 MiB, 2.67% gc time, 99.01% compilation time) # master
  0.877404 seconds (2.17 M allocations: 106.293 MiB, 2.65% gc time, 99.99% compilation time) # this PR
```
The performance also improves as there are fewer allocations in the
first branch (`opnorm(A, 1) <= 2.1`):
```julia
julia> B = diagm(0=>im.*(float.(1:200))./200, 1=>(1:199)./400, -1=>(1:199)./400);

julia> opnorm(B,1)
1.9875

julia> @Btime exp($B);
  5.066 ms (30 allocations: 4.89 MiB) # nightly v"1.12.0-DEV.1581"
  4.926 ms (27 allocations: 4.28 MiB) # this PR
```
  • Loading branch information
jishnub authored Nov 10, 2024
1 parent cd748a5 commit 0cc5518
Showing 1 changed file with 22 additions and 11 deletions.
33 changes: 22 additions & 11 deletions stdlib/LinearAlgebra/src/dense.jl
Original file line number Diff line number Diff line change
Expand Up @@ -707,25 +707,32 @@ function exp!(A::StridedMatrix{T}) where T<:BlasFloat
# Compute U and V: Even/odd terms in Padé numerator & denom
# Expansion of k=1 in for loop
P = A2
U = mul!(C[4]*P, true, C[2]*I, true, true) #U = C[2]*I + C[4]*P
V = mul!(C[3]*P, true, C[1]*I, true, true) #V = C[1]*I + C[3]*P
U = similar(P)
V = similar(P)
for ind in CartesianIndices(P)
U[ind] = C[4]*P[ind] + C[2]*I[ind]
V[ind] = C[3]*P[ind] + C[1]*I[ind]
end
for k in 2:(div(length(C), 2) - 1)
P *= A2
for ind in eachindex(P)
for ind in eachindex(P, U, V)
U[ind] += C[2k + 2] * P[ind]
V[ind] += C[2k + 1] * P[ind]
end
end

U = A * U
# U = A * U, but we overwrite P to avoid an allocation
mul!(P, A, U)
# P may be seen as an alias for U in the following code

# Padé approximant: (V-U)\(V+U)
tmp1, tmp2 = A, A2 # Reuse already allocated arrays
for ind in eachindex(tmp1)
tmp1[ind] = V[ind] - U[ind]
tmp2[ind] = V[ind] + U[ind]
VminU, VplusU = V, U # Reuse already allocated arrays
for ind in eachindex(V, U)
vi, ui = V[ind], P[ind]
VminU[ind] = vi - ui
VplusU[ind] = vi + ui
end
X = LAPACK.gesv!(tmp1, tmp2)[1]
X = LAPACK.gesv!(VminU, VplusU)[1]
else
s = log2(nA/5.4) # power of 2 later reversed by squaring
if s > 0
Expand Down Expand Up @@ -793,10 +800,14 @@ function exp!(A::StridedMatrix{T}) where T<:BlasFloat
end

if ilo > 1 # apply lower permutations in reverse order
for j in (ilo-1):-1:1; rcswap!(j, Int(scale[j]), X) end
for j in (ilo-1):-1:1
rcswap!(j, Int(scale[j]), X)
end
end
if ihi < n # apply upper permutations in forward order
for j in (ihi+1):n; rcswap!(j, Int(scale[j]), X) end
for j in (ihi+1):n
rcswap!(j, Int(scale[j]), X)
end
end
X
end
Expand Down

0 comments on commit 0cc5518

Please sign in to comment.