From ec979688f3d3c13a97f28dae2751634ac9ba80ea Mon Sep 17 00:00:00 2001
From: Matt Bauman
Date: Wed, 6 Feb 2019 02:21:41 -0500
Subject: [PATCH] Workaround #28126, support SIMDing broadcast in more cases

This is an ugly performance hack around issue #28126 in some limited
(but common) cases. The problem in short: when given many arrays of the
same size, LLVM has difficulty hoisting the decision of whether a given
dimension should be "extruded" out of the loop. This extra indirection
in the index computation seems to foil the array bounds aliasing checks,
which stymies SIMDification.

The solution: check to see if _Julia_ can statically decide whether or
not to extrude any dimensions in a given broadcast expression -- and if
so, use a special array wrapper that flags that none of the dimensions
in that array need to be extruded out in order to perform the broadcast.
---
 base/broadcast.jl | 51 ++++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 42 insertions(+), 9 deletions(-)

diff --git a/base/broadcast.jl b/base/broadcast.jl
index 24d0424a89727..ac4da37733c5a 100644
--- a/base/broadcast.jl
+++ b/base/broadcast.jl
@@ -864,13 +864,14 @@ broadcast_unalias(::Nothing, src) = src
 
 # Preprocessing a `Broadcasted` does two things:
 # * unaliases any arguments from `dest`
-# * "extrudes" the arguments where it is advantageous to pre-compute the broadcasted indices
-@inline preprocess(dest, bc::Broadcasted{Style}) where {Style} = Broadcasted{Style}(bc.f, preprocess_args(dest, bc.args), bc.axes)
-preprocess(dest, x) = extrude(broadcast_unalias(dest, x))
+# * calls `f` on the arguments (typically `extrude`, which pre-computes the broadcasted indices where advantageous)
+@inline preprocess(dest, bc) = preprocess(extrude, dest, bc)
+@inline preprocess(f, dest, bc::Broadcasted{Style}) where {Style} = Broadcasted{Style}(bc.f, preprocess_args(f, dest, bc.args), bc.axes)
+preprocess(f, dest, x) = f(broadcast_unalias(dest, x))
 
-@inline preprocess_args(dest, args::Tuple) = (preprocess(dest, args[1]), preprocess_args(dest, tail(args))...)
-preprocess_args(dest, args::Tuple{Any}) = (preprocess(dest, args[1]),)
-preprocess_args(dest, args::Tuple{}) = ()
+@inline preprocess_args(f, dest, args::Tuple) = (preprocess(f, dest, args[1]), preprocess_args(f, dest, tail(args))...)
+preprocess_args(f, dest, args::Tuple{Any}) = (preprocess(f, dest, args[1]),)
+preprocess_args(f, dest, args::Tuple{}) = ()
 
 # Specialize this method if all you want to do is specialize on typeof(dest)
 @inline function copyto!(dest::AbstractArray, bc::Broadcasted{Nothing})
@@ -882,13 +883,45 @@ preprocess_args(dest, args::Tuple{}) = ()
             return copyto!(dest, A)
         end
     end
-    bc′ = preprocess(dest, bc)
-    @simd for I in eachindex(bc′)
-        @inbounds dest[I] = bc′[I]
+    # Ugly performance hack around issue #28126: determine if all arguments to the
+    # broadcast are sized such that the broadcasting core can statically determine
+    # whether a given dimension is "extruded" or not. If so, we don't need to check
+    # any array sizes within the inner loop. Ideally this really should be something
+    # that Julia and/or LLVM could figure out and eliminate... and indeed they can
+    # for limited numbers of arguments.
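+    # For example, `dest .= a .+ b .* c` takes the fast path below when `a`,
+    # `b`, and `c` all share `dest`'s axes (or are scalars, `Ref`s, tuples,
+    # or 0-dimensional arrays); an argument like a `1×n` matrix that must be
+    # virtually "extruded" to `dest`'s shape takes the general loop instead.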
+    if _is_static_broadcast_28126(dest, bc)
+        bcs′ = preprocess(_nonextrude_28126, dest, bc)
+        @simd for I in eachindex(bcs′)
+            @inbounds dest[I] = bcs′[I]
+        end
+    else
+        bc′ = preprocess(extrude, dest, bc)
+        @simd for I in eachindex(bc′)
+            @inbounds dest[I] = bc′[I]
+        end
     end
     return dest
 end
 
+@inline _is_static_broadcast_28126(dest, bc::Broadcasted{Style}) where {Style} = _is_static_broadcast_28126_args(dest, bc.args)
+_is_static_broadcast_28126(dest, x) = false
+_is_static_broadcast_28126(dest, x::Union{Ref, Tuple, Type, Number, AbstractArray{<:Any,0}}) = true
+_is_static_broadcast_28126(dest::AbstractArray, x::AbstractArray{<:Any,0}) = true
+_is_static_broadcast_28126(dest::AbstractArray, x::AbstractArray{<:Any,1}) = axes(dest, 1) == axes(x, 1)
+_is_static_broadcast_28126(dest::AbstractArray, x::AbstractArray) = axes(dest) == axes(x) # This can be better with other missing dimensions
+
+@inline _is_static_broadcast_28126_args(dest, args::Tuple) = _is_static_broadcast_28126(dest, args[1]) && _is_static_broadcast_28126_args(dest, tail(args))
+_is_static_broadcast_28126_args(dest, args::Tuple{Any}) = _is_static_broadcast_28126(dest, args[1])
+_is_static_broadcast_28126_args(dest, args::Tuple{}) = true
+
+struct _NonExtruded28126{T}
+    x::T
+end
+@inline axes(b::_NonExtruded28126) = axes(b.x)
+Base.@propagate_inbounds _broadcast_getindex(b::_NonExtruded28126, i) = b.x[i]
+_nonextrude_28126(x::AbstractArray) = _NonExtruded28126(x)
+_nonextrude_28126(x) = x
+
 # Performance optimization: for BitArray outputs, we cache the result
 # in a "small" Vector{Bool}, and then copy in chunks into the output
 @inline function copyto!(dest::BitArray, bc::Broadcasted{Nothing})
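
A quick way to check which path a given broadcast would take under this
patch (a minimal sketch; the array sizes and the names `a`, `b`, `bc`,
`dest`, and `dest2` are arbitrary, and the `_28126` helpers are the
internal functions added by the diff above):

    julia> a, b = rand(1000), rand(1000); dest = similar(a);

    julia> bc = Base.Broadcast.instantiate(Base.Broadcast.broadcasted(+, a, b, 2));

    julia> Base.Broadcast._is_static_broadcast_28126(dest, bc)  # axes match; scalars are always "static"
    true

    julia> dest2 = Matrix{Float64}(undef, 1000, 1000);

    julia> bc2 = Base.Broadcast.instantiate(Base.Broadcast.broadcasted(+, a, rand(1, 1000)));

    julia> Base.Broadcast._is_static_broadcast_28126(dest2, bc2)  # the 1×1000 argument must be extruded
    false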