Scaling loop instead of broadcasting in strided matrix exp (#56463)

Firstly, this is easier to read. Secondly, this merges the two loops into one. Thirdly, this avoids the broadcasting latency. ```julia julia> using LinearAlgebra julia> A = rand(2,2); julia> @time LinearAlgebra.exp!(A); 0.952597 seconds (2.35 M allocations: 116.574 MiB, 2.67% gc time, 99.01% compilation time) # master 0.877404 seconds (2.17 M allocations: 106.293 MiB, 2.65% gc time, 99.99% compilation time) # this PR ``` The performance also improves as there are fewer allocations in the first branch (`opnorm(A, 1) <= 2.1`): ```julia julia> B = diagm(0=>im.*(float.(1:200))./200, 1=>(1:199)./400, -1=>(1:199)./400); julia> opnorm(B,1) 1.9875 julia> @Btime exp($B); 5.066 ms (30 allocations: 4.89 MiB) # nightly v"1.12.0-DEV.1581" 4.926 ms (27 allocations: 4.28 MiB) # this PR ```
JuliaLang · Nov 10, 2024 · 0cc5518 · 0cc5518
1 parent cd748a5
commit 0cc5518
Showing 1 changed file with 22 additions and 11 deletions.
diff --git a/stdlib/LinearAlgebra/src/dense.jl b/stdlib/LinearAlgebra/src/dense.jl
@@ -707,25 +707,32 @@ function exp!(A::StridedMatrix{T}) where T<:BlasFloat
         # Compute U and V: Even/odd terms in Padé numerator & denom
         # Expansion of k=1 in for loop
         P = A2
-        U = mul!(C[4]*P, true, C[2]*I, true, true) #U = C[2]*I + C[4]*P
-        V = mul!(C[3]*P, true, C[1]*I, true, true) #V = C[1]*I + C[3]*P
+        U = similar(P)
+        V = similar(P)
+        for ind in CartesianIndices(P)
+            U[ind] = C[4]*P[ind] + C[2]*I[ind]
+            V[ind] = C[3]*P[ind] + C[1]*I[ind]
+        end
         for k in 2:(div(length(C), 2) - 1)
             P *= A2
-            for ind in eachindex(P)
+            for ind in eachindex(P, U, V)
                 U[ind] += C[2k + 2] * P[ind]
                 V[ind] += C[2k + 1] * P[ind]
             end
         end
 
-        U = A * U
+        # U = A * U, but we overwrite P to avoid an allocation
+        mul!(P, A, U)
+        # P may be seen as an alias for U in the following code
 
         # Padé approximant:  (V-U)\(V+U)
-        tmp1, tmp2 = A, A2 # Reuse already allocated arrays
-        for ind in eachindex(tmp1)
-            tmp1[ind] = V[ind] - U[ind]
-            tmp2[ind] = V[ind] + U[ind]
+        VminU, VplusU = V, U # Reuse already allocated arrays
+        for ind in eachindex(V, U)
+            vi, ui = V[ind], P[ind]
+            VminU[ind] = vi - ui
+            VplusU[ind] = vi + ui
         end
-        X = LAPACK.gesv!(tmp1, tmp2)[1]
+        X = LAPACK.gesv!(VminU, VplusU)[1]
     else
         s  = log2(nA/5.4)               # power of 2 later reversed by squaring
         if s > 0
@@ -793,10 +800,14 @@ function exp!(A::StridedMatrix{T}) where T<:BlasFloat
     end
 
     if ilo > 1       # apply lower permutations in reverse order
-        for j in (ilo-1):-1:1; rcswap!(j, Int(scale[j]), X) end
+        for j in (ilo-1):-1:1
+            rcswap!(j, Int(scale[j]), X)
+        end
     end
     if ihi < n       # apply upper permutations in forward order
-        for j in (ihi+1):n;    rcswap!(j, Int(scale[j]), X) end
+        for j in (ihi+1):n
+            rcswap!(j, Int(scale[j]), X)
+        end
     end
     X
 end