@avx messes with type inference #114

baggepinnen · 2020-05-12T11:15:40Z

The following function infers fine for Float64 but not for Float32. If I remove @avx, it infers fine for both.

# Compute the running (sliding-window) mean and standard deviation of `x` over
# windows of `m` consecutive elements.
#
# Returns the tuple `(μ, σ)`: two vectors of length `length(x) - m + 1` with
# element type `float(T)`. Entry `k` holds the statistics of the window that
# starts at `x[k]`. The standard deviation is computed as `sqrt(ss/m - μ^2)`,
# i.e. it is normalized by `m`.
#
# The `@assert` fails if `length(x) < m`.
# NOTE(review): indexing starts at 1, so this presumably expects a 1-based `x`.
function running_mean_std(x::AbstractArray{T}, m) where T
    @assert length(x) >= m
    # Number of windows of length m that fit in x.
    n = length(x)-m+1
    # s: running sum, ss: running sum of squares; both start at zero(float(T)).
    s = ss = zero(float(T))
    μ = Vector{float(T)}(undef, n)
    σ = Vector{float(T)}(undef, n)
    # Accumulate the sum and sum of squares over the first window x[1:m].
    # This is the loop decorated with LoopVectorization's @avx, which the issue
    # reports as breaking inference of `s` and `ss` for Float32 input.
    @avx for i = 1:m
        s  += x[i]
        ss += x[i]^2
    end
    # Statistics of the first window.
    μ[1] = s/m
    σ[1] = sqrt(ss/m - μ[1]^2)
    # Slide the window one element at a time: drop x[i], add x[i+m], and store
    # the statistics of the window starting at i+1.
    # The largest indices touched are i+m = length(x) for x and i+1 = n for μ/σ.
    @fastmath @inbounds for i = 1:n-1
        s -= x[i]
        ss -= x[i]^2
        s += x[i+m]
        ss += x[i+m]^2
        μ[i+1] = s/m
        σ[i+1] = sqrt(ss/m - μ[i+1]^2)
    end
    μ,σ
end

@code_warntype running_mean_std(randn(Float64, 10), 3) # Fine: infers without any `Any`
@code_warntype running_mean_std(randn(Float32, 10), 3) # Not fine: `s` and `ss` are inferred as `Any`

The text was updated successfully, but these errors were encountered:

chriselrod · 2020-05-12T12:59:26Z

On the latest release:

julia> @code_warntype running_mean_std(randn(Float32, 10), 3) # Not fine
Variables
  #self#::Core.Compiler.Const(running_mean_std, false)
  x::Array{Float32,1}
  m::Int64
  ss_0::NTuple{16,VecElement{Float32}}
  s_0::NTuple{16,VecElement{Float32}}
  val@_6::Union{}
  val@_7::Nothing
  n::Int64
  μ::Array{Float32,1}
  σ::Array{Float32,1}
  vptr##_x::VectorizationBase.PackedStridedPointer{Float32,0}
  @_12::Int64
  s::Float32
  ss::Float32
  @_15::Union{}
  @_16::Union{Nothing, Tuple{Int64,Int64}}
  i@_17::Union{}
  i@_18::Int64

Body::Tuple{Array{Float32,1},Array{Float32,1}}
1 ─        Core.NewvarNode(:(ss_0))
│          Core.NewvarNode(:(s_0))
│          Core.NewvarNode(:(val@_6))
│          Core.NewvarNode(:(val@_7))
│          Core.NewvarNode(:(n))
│          Core.NewvarNode(:(μ))
│          Core.NewvarNode(:(σ))
│          Core.NewvarNode(:(vptr##_x))
│          Core.NewvarNode(:(@_12))
│          Core.NewvarNode(:(s))
│          Core.NewvarNode(:(ss))
│          Core.NewvarNode(:(@_15))
│          Core.NewvarNode(:(@_16))
│   %14  = Main.length(x)::Int64
│   %15  = (%14 >= m)::Bool
└──        goto #3 if not %15
2 ─        goto #4
3 ─ %18  = Base.AssertionError("length(x) >= m")::AssertionError
└──        Base.throw(%18)
4 ┄ %20  = Main.length(x)::Int64
│   %21  = (%20 - m)::Int64
│          (n = %21 + 1)
│   %23  = Main.float($(Expr(:static_parameter, 1)))::Core.Compiler.Const(Float32, false)
│   %24  = Main.zero(%23)::Core.Compiler.Const(0.0f0, false)
│          (ss = %24)
│          (s = %24)
│   %27  = Main.float($(Expr(:static_parameter, 1)))::Core.Compiler.Const(Float32, false)
│   %28  = Core.apply_type(Main.Vector, %27)::Core.Compiler.Const(Array{Float32,1}, false)
│          (μ = (%28)(Main.undef, n))
│   %30  = Main.float($(Expr(:static_parameter, 1)))::Core.Compiler.Const(Float32, false)
│   %31  = Core.apply_type(Main.Vector, %30)::Core.Compiler.Const(Array{Float32,1}, false)
│          (σ = (%31)(Main.undef, n))
│   %33  = LoopVectorization.check_args(x)::Core.Compiler.Const(true, false)
│          %33
│          (vptr##_x = LoopVectorization.stridedpointer(x))
│   %36  = $(Expr(:gc_preserve_begin, :(x)))
│   %37  = Core.apply_type(Main.Val, (0, 0, 0))::Core.Compiler.Const(Val{(0, 0, 0)}, false)
│   %38  = (%37)()::Core.Compiler.Const(Val{(0, 0, 0)}(), false)
│   %39  = Core.apply_type(Main.Tuple, :LoopVectorization, :LOOPCONSTANTINSTRUCTION, LoopVectorization.OperationStruct(0x0000000000000000, 0x0000000000000000, 0x0000000000000001, 0x0000000000000000, LoopVectorization.constant, 0x00, 0x01), :LoopVectorization, :getindex, LoopVectorization.OperationStruct(0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, LoopVectorization.memload, 0x01, 0x02), :LoopVectorization, :vadd, LoopVectorization.OperationStruct(0x0000000000000001, 0x0000000000000001, 0x0000000000000000, 0x0000000000000102, LoopVectorization.compute, 0x00, 0x01), :LoopVectorization, :LOOPCONSTANTINSTRUCTION, LoopVectorization.OperationStruct(0x0000000000000000, 0x0000000000000000, 0x0000000000000001, 0x0000000000000000, LoopVectorization.constant, 0x00, 0x03), :LoopVectorization, :vabs2, LoopVectorization.OperationStruct(0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000002, LoopVectorization.compute, 0x00, 0x04), :LoopVectorization, :vadd, LoopVectorization.OperationStruct(0x0000000000000001, 0x0000000000000001, 0x0000000000000000, 0x0000000000000405, LoopVectorization.compute, 0x00, 0x03))::Core.Compiler.Const(Tuple{:LoopVectorization,:LOOPCONSTANTINSTRUCTION,LoopVectorization.OperationStruct(0x0000000000000000, 0x0000000000000000, 0x0000000000000001, 0x0000000000000000, LoopVectorization.constant, 0x00, 0x01),:LoopVectorization,:getindex,LoopVectorization.OperationStruct(0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, LoopVectorization.memload, 0x01, 0x02),:LoopVectorization,:vadd,LoopVectorization.OperationStruct(0x0000000000000001, 0x0000000000000001, 0x0000000000000000, 0x0000000000000102, LoopVectorization.compute, 0x00, 0x01),:LoopVectorization,:LOOPCONSTANTINSTRUCTION,LoopVectorization.OperationStruct(0x0000000000000000, 0x0000000000000000, 0x0000000000000001, 0x0000000000000000, LoopVectorization.constant, 0x00, 
0x03),:LoopVectorization,:vabs2,LoopVectorization.OperationStruct(0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000002, LoopVectorization.compute, 0x00, 0x04),:LoopVectorization,:vadd,LoopVectorization.OperationStruct(0x0000000000000001, 0x0000000000000001, 0x0000000000000000, 0x0000000000000405, LoopVectorization.compute, 0x00, 0x03)}, false)
│   %40  = Core.apply_type(Main.Tuple, LoopVectorization.ArrayRefStruct{:x,Symbol("##vptr##_x")}(0x0000000000000001, 0x0000000000000001, 0x0000000000000000))::Core.Compiler.Const(Tuple{LoopVectorization.ArrayRefStruct{:x,Symbol("##vptr##_x")}(0x0000000000000001, 0x0000000000000001, 0x0000000000000000)}, false)
│   %41  = Core.apply_type(Main.Tuple, 3, 6)::Core.Compiler.Const(Tuple{3,6}, false)
│   %42  = Core.apply_type(Main.Tuple, 1, 4)::Core.Compiler.Const(Tuple{1,4}, false)
│   %43  = Core.apply_type(Main.Tuple)::Core.Compiler.Const(Tuple{}, false)
│   %44  = Core.apply_type(Main.Tuple)::Core.Compiler.Const(Tuple{}, false)
│   %45  = Core.apply_type(Main.Tuple)::Core.Compiler.Const(Tuple{}, false)
│   %46  = Core.apply_type(Main.Tuple)::Core.Compiler.Const(Tuple{}, false)
│   %47  = Core.apply_type(Main.Tuple, 0, %41, %42, %43, %44, %45, %46)::Core.Compiler.Const(Tuple{0,Tuple{3,6},Tuple{1,4},Tuple{},Tuple{},Tuple{},Tuple{}}, false)
│   %48  = Core.apply_type(Main.Tuple, :i)::Core.Compiler.Const(Tuple{:i}, false)
│   %49  = Core.apply_type(LoopVectorization.StaticLowerUnitRange, 1)::Core.Compiler.Const(VectorizationBase.StaticLowerUnitRange{1}, false)
│   %50  = (%49)(m)::VectorizationBase.StaticLowerUnitRange{1}
│   %51  = Core.tuple(%50)::Tuple{VectorizationBase.StaticLowerUnitRange{1}}
│   %52  = vptr##_x::VectorizationBase.PackedStridedPointer{Float32,0}
│   %53  = s::Core.Compiler.Const(0.0f0, false)::Core.Compiler.Const(0.0f0, false)
│   %54  = LoopVectorization._avx_!(%38, %39, %40, %47, %48, %51, %52, %53, ss::Core.Compiler.Const(0.0f0, false))::Tuple{NTuple{16,VecElement{Float32}},NTuple{16,VecElement{Float32}}}
│   %55  = Base.indexed_iterate(%54, 1)::Core.Compiler.PartialStruct(Tuple{NTuple{16,VecElement{Float32}},Int64}, Any[NTuple{16,VecElement{Float32}}, Core.Compiler.Const(2, false)])
│          (s_0 = Core.getfield(%55, 1))
│          (@_12 = Core.getfield(%55, 2))
│   %58  = Base.indexed_iterate(%54, 2, @_12::Core.Compiler.Const(2, false))::Core.Compiler.PartialStruct(Tuple{NTuple{16,VecElement{Float32}},Int64}, Any[NTuple{16,VecElement{Float32}}, Core.Compiler.Const(3, false)])
│          (ss_0 = Core.getfield(%58, 1))
│          $(Expr(:gc_preserve_end, :(%36)))
│          (s = LoopVectorization.reduced_add(s_0, s::Core.Compiler.Const(0.0f0, false)))
│          (ss = LoopVectorization.reduced_add(ss_0, ss::Core.Compiler.Const(0.0f0, false)))
└──        goto #6
5 ─        $(Expr(:inbounds, true))
│          Core.Compiler.Const(:(1:m), false)
│          Core.Compiler.Const(:(@_15 = Base.iterate(%65)), false)
│          Core.Compiler.Const(:(@_15 === nothing), false)
│          Core.Compiler.Const(:(Base.not_int(%67)), false)
│          Core.Compiler.Const(:(%68), false)
│          Core.Compiler.Const(:(@_15), false)
│          Core.Compiler.Const(:(i@_17 = Core.getfield(%70, 1)), false)
│          Core.Compiler.Const(:(Core.getfield(%70, 2)), false)
│          Core.Compiler.Const(:(Base.FastMath), false)
│          Core.Compiler.Const(:(Base.getproperty(%73, :add_fast)), false)
│          Core.Compiler.Const(:(s), false)
│          Core.Compiler.Const(:(Base.getindex(x, i@_17)), false)
│          Core.Compiler.Const(:(s = (%74)(%75, %76)), false)
│          Core.Compiler.Const(:(Base.FastMath), false)
│          Core.Compiler.Const(:(Base.getproperty(%78, :add_fast)), false)
│          Core.Compiler.Const(:(ss), false)
│          Core.Compiler.Const(:(Base.FastMath), false)
│          Core.Compiler.Const(:(Base.getproperty(%81, :pow_fast)), false)
│          Core.Compiler.Const(:(Base.getindex(x, i@_17)), false)
│          Core.Compiler.Const(:((%82)(%83, Val{2}())), false)
│          Core.Compiler.Const(:(ss = (%79)(%80, %84)), false)
│          Core.Compiler.Const(:(@_15 = Base.iterate(%65, %72)), false)
│          Core.Compiler.Const(:(@_15 === nothing), false)
│          Core.Compiler.Const(:(Base.not_int(%87)), false)
│          Core.Compiler.Const(:(%88), false)
│          Core.Compiler.Const(:(goto %70), false)
│          Core.Compiler.Const(:(val@_6 = nothing), false)
│          $(Expr(:inbounds, :pop))
└──        Core.Compiler.Const(:(val@_6), false)
6 ┄ %94  = (s / m)::Float32
│          Base.setindex!(μ, %94, 1)
│   %96  = (ss / m)::Float32
│   %97  = Base.getindex(μ, 1)::Float32
│   %98  = Core.apply_type(Base.Val, 2)::Core.Compiler.Const(Val{2}, false)
│   %99  = (%98)()::Core.Compiler.Const(Val{2}(), false)
│   %100 = Base.literal_pow(Main.:^, %97, %99)::Float32
│   %101 = (%96 - %100)::Float32
│   %102 = Main.sqrt(%101)::Float32
│          Base.setindex!(σ, %102, 1)
│          $(Expr(:inbounds, true))
│   %105 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│   %106 = Base.getproperty(%105, :sub_fast)::Core.Compiler.Const(Base.FastMath.sub_fast, false)
│   %107 = n::Int64
│   %108 = (%106)(%107, 1)::Int64
│   %109 = (1:%108)::Core.Compiler.PartialStruct(UnitRange{Int64}, Any[Core.Compiler.Const(1, false), Int64])
│          (@_16 = Base.iterate(%109))
│   %111 = (@_16 === nothing)::Bool
│   %112 = Base.not_int(%111)::Bool
└──        goto #9 if not %112
7 ┄ %114 = @_16::Tuple{Int64,Int64}::Tuple{Int64,Int64}
│          (i@_18 = Core.getfield(%114, 1))
│   %116 = Core.getfield(%114, 2)::Int64
│   %117 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│   %118 = Base.getproperty(%117, :sub_fast)::Core.Compiler.Const(Base.FastMath.sub_fast, false)
│   %119 = s::Float32
│   %120 = Base.getindex(x, i@_18)::Float32
│          (s = (%118)(%119, %120))
│   %122 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│   %123 = Base.getproperty(%122, :sub_fast)::Core.Compiler.Const(Base.FastMath.sub_fast, false)
│   %124 = ss::Float32
│   %125 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│   %126 = Base.getproperty(%125, :pow_fast)::Core.Compiler.Const(Base.FastMath.pow_fast, false)
│   %127 = Base.getindex(x, i@_18)::Float32
│   %128 = (%126)(%127, Val{2}())::Float32
│          (ss = (%123)(%124, %128))
│   %130 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│   %131 = Base.getproperty(%130, :add_fast)::Core.Compiler.Const(Base.FastMath.add_fast, false)
│   %132 = s::Float32
│   %133 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│   %134 = Base.getproperty(%133, :add_fast)::Core.Compiler.Const(Base.FastMath.add_fast, false)
│   %135 = i@_18::Int64
│   %136 = (%134)(%135, m)::Int64
│   %137 = Base.getindex(x, %136)::Float32
│          (s = (%131)(%132, %137))
│   %139 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│   %140 = Base.getproperty(%139, :add_fast)::Core.Compiler.Const(Base.FastMath.add_fast, false)
│   %141 = ss::Float32
│   %142 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│   %143 = Base.getproperty(%142, :pow_fast)::Core.Compiler.Const(Base.FastMath.pow_fast, false)
│   %144 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│   %145 = Base.getproperty(%144, :add_fast)::Core.Compiler.Const(Base.FastMath.add_fast, false)
│   %146 = i@_18::Int64
│   %147 = (%145)(%146, m)::Int64
│   %148 = Base.getindex(x, %147)::Float32
│   %149 = (%143)(%148, Val{2}())::Float32
│          (ss = (%140)(%141, %149))
│   %151 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│   %152 = Base.getproperty(%151, :div_fast)::Core.Compiler.Const(Base.FastMath.div_fast, false)
│   %153 = s::Float32
│   %154 = (%152)(%153, m)::Float32
│   %155 = μ::Array{Float32,1}
│   %156 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│   %157 = Base.getproperty(%156, :add_fast)::Core.Compiler.Const(Base.FastMath.add_fast, false)
│   %158 = i@_18::Int64
│   %159 = (%157)(%158, 1)::Int64
│          Base.setindex!(%155, %154, %159)
│   %161 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│   %162 = Base.getproperty(%161, :sqrt_fast)::Core.Compiler.Const(Base.FastMath.sqrt_fast, false)
│   %163 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│   %164 = Base.getproperty(%163, :sub_fast)::Core.Compiler.Const(Base.FastMath.sub_fast, false)
│   %165 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│   %166 = Base.getproperty(%165, :div_fast)::Core.Compiler.Const(Base.FastMath.div_fast, false)
│   %167 = ss::Float32
│   %168 = (%166)(%167, m)::Float32
│   %169 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│   %170 = Base.getproperty(%169, :pow_fast)::Core.Compiler.Const(Base.FastMath.pow_fast, false)
│   %171 = μ::Array{Float32,1}
│   %172 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│   %173 = Base.getproperty(%172, :add_fast)::Core.Compiler.Const(Base.FastMath.add_fast, false)
│   %174 = i@_18::Int64
│   %175 = (%173)(%174, 1)::Int64
│   %176 = Base.getindex(%171, %175)::Float32
│   %177 = (%170)(%176, Val{2}())::Float32
│   %178 = (%164)(%168, %177)::Float32
│   %179 = (%162)(%178)::Float32
│   %180 = σ::Array{Float32,1}
│   %181 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│   %182 = Base.getproperty(%181, :add_fast)::Core.Compiler.Const(Base.FastMath.add_fast, false)
│   %183 = i@_18::Int64
│   %184 = (%182)(%183, 1)::Int64
│          Base.setindex!(%180, %179, %184)
│          (@_16 = Base.iterate(%109, %116))
│   %187 = (@_16 === nothing)::Bool
│   %188 = Base.not_int(%187)::Bool
└──        goto #9 if not %188
8 ─        goto #7
9 ┄        (val@_7 = nothing)
│          $(Expr(:inbounds, :pop))
│          val@_7
│   %194 = Core.tuple(μ, σ)::Tuple{Array{Float32,1},Array{Float32,1}}
└──        return %194

I get more or less the same thing on master.

Which version are you on?

EDIT:
Oops, looks like you meant the second loop that currently has @fastmath @inbounds instead of @avx. Except on Julia 1.1, it also isn't type stable with Float64 in that case.

baggepinnen · 2020-05-12T13:45:28Z

I meant the first loop, the one decorated with @avx. I was on LV 0.7.7, but the problem is the same on 0.7.8. Here is my @code_warntype output for Float32; with Float64, all the `Any`s go away.

julia> @code_warntype running_mean_std(randn(Float32, 10), 3)
Variables
  #self#::Core.Compiler.Const(SlidingDistancesBase.running_mean_std, false)
  x::Array{Float32,1}
  m::Int64
  ss_0::Any
  s_0::Any
  val@_6::Union{}
  val@_7::Nothing
  n::Int64
  μ::Array{Float32,1}
  σ::Array{Float32,1}
  vptr##_x::VectorizationBase.PackedStridedPointer{Float32,0}
  @_12::Tuple{Any,Any}
  @_13::Int64
  s::Any
  ss::Any
  @_16::Union{}
  @_17::Union{Nothing, Tuple{Int64,Int64}}
  i@_18::Union{}
  i@_19::Int64

Body::Tuple{Array{Float32,1},Array{Float32,1}}
1 ─        Core.NewvarNode(:(ss_0))
│          Core.NewvarNode(:(s_0))
│          Core.NewvarNode(:(val@_6))
│          Core.NewvarNode(:(val@_7))
│          Core.NewvarNode(:(n))
│          Core.NewvarNode(:(μ))
│          Core.NewvarNode(:(σ))
│          Core.NewvarNode(:(vptr##_x))
│          Core.NewvarNode(:(@_12))
│          Core.NewvarNode(:(@_13))
│          Core.NewvarNode(:(s))
│          Core.NewvarNode(:(ss))
│          Core.NewvarNode(:(@_16))
│          Core.NewvarNode(:(@_17))
│   %15  = SlidingDistancesBase.length(x)::Int64
│   %16  = (%15 >= m)::Bool
└──        goto #3 if not %16
2 ─        goto #4
3 ─ %19  = Base.AssertionError("length(x) >= m")::AssertionError
└──        Base.throw(%19)
4 ┄ %21  = SlidingDistancesBase.length(x)::Int64
│   %22  = (%21 - m)::Int64
│          (n = %22 + 1)
│   %24  = SlidingDistancesBase.float($(Expr(:static_parameter, 1)))::Core.Compiler.Const(Float32, false)
│   %25  = SlidingDistancesBase.zero(%24)::Core.Compiler.Const(0.0f0, false)
│          (ss = %25)
│          (s = %25)
│   %28  = SlidingDistancesBase.float($(Expr(:static_parameter, 1)))::Core.Compiler.Const(Float32, false)
│   %29  = Core.apply_type(SlidingDistancesBase.Vector, %28)::Core.Compiler.Const(Array{Float32,1}, false)
│          (μ = (%29)(SlidingDistancesBase.undef, n))
│   %31  = SlidingDistancesBase.float($(Expr(:static_parameter, 1)))::Core.Compiler.Const(Float32, false)
│   %32  = Core.apply_type(SlidingDistancesBase.Vector, %31)::Core.Compiler.Const(Array{Float32,1}, false)
│          (σ = (%32)(SlidingDistancesBase.undef, n))
│   %34  = LoopVectorization.check_args(x)::Core.Compiler.Const(true, false)
│          %34
│          (vptr##_x = LoopVectorization.stridedpointer(x))
│   %37  = $(Expr(:gc_preserve_begin, :(x)))
│   %38  = Core.apply_type(SlidingDistancesBase.Val, (0, 0, 0))::Core.Compiler.Const(Val{(0, 0, 0)}, false)
│   %39  = (%38)()::Core.Compiler.Const(Val{(0, 0, 0)}(), false)
│   %40  = Core.apply_type(SlidingDistancesBase.Tuple, :LoopVectorization, :LOOPCONSTANTINSTRUCTION, LoopVectorization.OperationStruct(0x0000000000000000, 0x0000000000000000, 0x0000000000000001, 0x0000000000000000, LoopVectorization.constant, 0x00, 0x01), :LoopVectorization, :getindex, LoopVectorization.OperationStruct(0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, LoopVectorization.memload, 0x01, 0x02), :LoopVectorization, :vadd, LoopVectorization.OperationStruct(0x0000000000000001, 0x0000000000000001, 0x0000000000000000, 0x0000000000000102, LoopVectorization.compute, 0x00, 0x01), :LoopVectorization, :LOOPCONSTANTINSTRUCTION, LoopVectorization.OperationStruct(0x0000000000000000, 0x0000000000000000, 0x0000000000000001, 0x0000000000000000, LoopVectorization.constant, 0x00, 0x03), :LoopVectorization, :vabs2, LoopVectorization.OperationStruct(0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000002, LoopVectorization.compute, 0x00, 0x04), :LoopVectorization, :vadd, LoopVectorization.OperationStruct(0x0000000000000001, 0x0000000000000001, 0x0000000000000000, 0x0000000000000405, LoopVectorization.compute, 0x00, 0x03))::Core.Compiler.Const(Tuple{:LoopVectorization,:LOOPCONSTANTINSTRUCTION,LoopVectorization.OperationStruct(0x0000000000000000, 0x0000000000000000, 0x0000000000000001, 0x0000000000000000, LoopVectorization.constant, 0x00, 0x01),:LoopVectorization,:getindex,LoopVectorization.OperationStruct(0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, LoopVectorization.memload, 0x01, 0x02),:LoopVectorization,:vadd,LoopVectorization.OperationStruct(0x0000000000000001, 0x0000000000000001, 0x0000000000000000, 0x0000000000000102, LoopVectorization.compute, 0x00, 0x01),:LoopVectorization,:LOOPCONSTANTINSTRUCTION,LoopVectorization.OperationStruct(0x0000000000000000, 0x0000000000000000, 0x0000000000000001, 0x0000000000000000, LoopVectorization.constant, 0x00, 
0x03),:LoopVectorization,:vabs2,LoopVectorization.OperationStruct(0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000002, LoopVectorization.compute, 0x00, 0x04),:LoopVectorization,:vadd,LoopVectorization.OperationStruct(0x0000000000000001, 0x0000000000000001, 0x0000000000000000, 0x0000000000000405, LoopVectorization.compute, 0x00, 0x03)}, false)
│   %41  = Core.apply_type(SlidingDistancesBase.Tuple, LoopVectorization.ArrayRefStruct{:x,Symbol("##vptr##_x")}(0x0000000000000001, 0x0000000000000001, 0x0000000000000000))::Core.Compiler.Const(Tuple{LoopVectorization.ArrayRefStruct{:x,Symbol("##vptr##_x")}(0x0000000000000001, 0x0000000000000001, 0x0000000000000000)}, false)
│   %42  = Core.apply_type(SlidingDistancesBase.Tuple, 3, 6)::Core.Compiler.Const(Tuple{3,6}, false)
│   %43  = Core.apply_type(SlidingDistancesBase.Tuple, 1, 4)::Core.Compiler.Const(Tuple{1,4}, false)
│   %44  = Core.apply_type(SlidingDistancesBase.Tuple)::Core.Compiler.Const(Tuple{}, false)
│   %45  = Core.apply_type(SlidingDistancesBase.Tuple)::Core.Compiler.Const(Tuple{}, false)
│   %46  = Core.apply_type(SlidingDistancesBase.Tuple)::Core.Compiler.Const(Tuple{}, false)
│   %47  = Core.apply_type(SlidingDistancesBase.Tuple)::Core.Compiler.Const(Tuple{}, false)
│   %48  = Core.apply_type(SlidingDistancesBase.Tuple, 0, %42, %43, %44, %45, %46, %47)::Core.Compiler.Const(Tuple{0,Tuple{3,6},Tuple{1,4},Tuple{},Tuple{},Tuple{},Tuple{}}, false)
│   %49  = Core.apply_type(SlidingDistancesBase.Tuple, :i)::Core.Compiler.Const(Tuple{:i}, false)
│   %50  = Core.apply_type(LoopVectorization.StaticLowerUnitRange, 1)::Core.Compiler.Const(VectorizationBase.StaticLowerUnitRange{1}, false)
│   %51  = (%50)(m)::VectorizationBase.StaticLowerUnitRange{1}
│   %52  = Core.tuple(%51)::Tuple{VectorizationBase.StaticLowerUnitRange{1}}
│   %53  = vptr##_x::VectorizationBase.PackedStridedPointer{Float32,0}
│   %54  = s::Core.Compiler.Const(0.0f0, false)::Core.Compiler.Const(0.0f0, false)
│   %55  = LoopVectorization._avx_!(%39, %40, %41, %48, %49, %52, %53, %54, ss::Core.Compiler.Const(0.0f0, false))::Tuple{Any,Any}
│   %56  = Base.indexed_iterate(%55, 1)::Core.Compiler.PartialStruct(Tuple{Any,Int64}, Any[Any, Core.Compiler.Const(2, false)])
│          (s_0 = Core.getfield(%56, 1))
│          (@_13 = Core.getfield(%56, 2))
│   %59  = Base.indexed_iterate(%55, 2, @_13::Core.Compiler.Const(2, false))::Core.Compiler.PartialStruct(Tuple{Any,Int64}, Any[Any, Core.Compiler.Const(3, false)])
│          (ss_0 = Core.getfield(%59, 1))
│          (@_12 = %55)
│          $(Expr(:gc_preserve_end, :(%37)))
│          @_12
│          (s = LoopVectorization.reduced_add(s_0, s::Core.Compiler.Const(0.0f0, false)))
│          (ss = LoopVectorization.reduced_add(ss_0, ss::Core.Compiler.Const(0.0f0, false)))
└──        goto #6
5 ─        $(Expr(:inbounds, true))
│          Core.Compiler.Const(:(1:m), false)
│          Core.Compiler.Const(:(@_16 = Base.iterate(%68)), false)
│          Core.Compiler.Const(:(@_16 === nothing), false)
│          Core.Compiler.Const(:(Base.not_int(%70)), false)
│          Core.Compiler.Const(:(%71), false)
│          Core.Compiler.Const(:(@_16), false)
│          Core.Compiler.Const(:(i@_18 = Core.getfield(%73, 1)), false)
│          Core.Compiler.Const(:(Core.getfield(%73, 2)), false)
│          Core.Compiler.Const(:(Base.FastMath), false)
│          Core.Compiler.Const(:(Base.getproperty(%76, :add_fast)), false)
│          Core.Compiler.Const(:(s), false)
│          Core.Compiler.Const(:(Base.getindex(x, i@_18)), false)
│          Core.Compiler.Const(:(s = (%77)(%78, %79)), false)
│          Core.Compiler.Const(:(Base.FastMath), false)
│          Core.Compiler.Const(:(Base.getproperty(%81, :add_fast)), false)
│          Core.Compiler.Const(:(ss), false)
│          Core.Compiler.Const(:(Base.FastMath), false)
│          Core.Compiler.Const(:(Base.getproperty(%84, :pow_fast)), false)
│          Core.Compiler.Const(:(Base.getindex(x, i@_18)), false)
│          Core.Compiler.Const(:((%85)(%86, Val{2}())), false)
│          Core.Compiler.Const(:(ss = (%82)(%83, %87)), false)
│          Core.Compiler.Const(:(@_16 = Base.iterate(%68, %75)), false)
│          Core.Compiler.Const(:(@_16 === nothing), false)
│          Core.Compiler.Const(:(Base.not_int(%90)), false)
│          Core.Compiler.Const(:(%91), false)
│          Core.Compiler.Const(:(goto %73), false)
│          Core.Compiler.Const(:(val@_6 = nothing), false)
│          $(Expr(:inbounds, :pop))
└──        Core.Compiler.Const(:(val@_6), false)
6 ┄ %97  = (s / m)::Any
│          Base.setindex!(μ, %97, 1)
│   %99  = (ss / m)::Any
│   %100 = Base.getindex(μ, 1)::Float32
│   %101 = Core.apply_type(Base.Val, 2)::Core.Compiler.Const(Val{2}, false)
│   %102 = (%101)()::Core.Compiler.Const(Val{2}(), false)
│   %103 = Base.literal_pow(SlidingDistancesBase.:^, %100, %102)::Float32
│   %104 = (%99 - %103)::Any
│   %105 = SlidingDistancesBase.sqrt(%104)::Any
│          Base.setindex!(σ, %105, 1)
│          $(Expr(:inbounds, true))
│   %108 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│   %109 = Base.getproperty(%108, :sub_fast)::Core.Compiler.Const(Base.FastMath.sub_fast, false)
│   %110 = n::Int64
│   %111 = (%109)(%110, 1)::Int64
│   %112 = (1:%111)::Core.Compiler.PartialStruct(UnitRange{Int64}, Any[Core.Compiler.Const(1, false), Int64])
│          (@_17 = Base.iterate(%112))
│   %114 = (@_17 === nothing)::Bool
│   %115 = Base.not_int(%114)::Bool
└──        goto #9 if not %115
7 ┄ %117 = @_17::Tuple{Int64,Int64}::Tuple{Int64,Int64}
│          (i@_19 = Core.getfield(%117, 1))
│   %119 = Core.getfield(%117, 2)::Int64
│   %120 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│   %121 = Base.getproperty(%120, :sub_fast)::Core.Compiler.Const(Base.FastMath.sub_fast, false)
│   %122 = s::Any
│   %123 = Base.getindex(x, i@_19)::Float32
│          (s = (%121)(%122, %123))
│   %125 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│   %126 = Base.getproperty(%125, :sub_fast)::Core.Compiler.Const(Base.FastMath.sub_fast, false)
│   %127 = ss::Any
│   %128 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│   %129 = Base.getproperty(%128, :pow_fast)::Core.Compiler.Const(Base.FastMath.pow_fast, false)
│   %130 = Base.getindex(x, i@_19)::Float32
│   %131 = (%129)(%130, Val{2}())::Float32
│          (ss = (%126)(%127, %131))
│   %133 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│   %134 = Base.getproperty(%133, :add_fast)::Core.Compiler.Const(Base.FastMath.add_fast, false)
│   %135 = s::Any
│   %136 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│   %137 = Base.getproperty(%136, :add_fast)::Core.Compiler.Const(Base.FastMath.add_fast, false)
│   %138 = i@_19::Int64
│   %139 = (%137)(%138, m)::Int64
│   %140 = Base.getindex(x, %139)::Float32
│          (s = (%134)(%135, %140))
│   %142 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│   %143 = Base.getproperty(%142, :add_fast)::Core.Compiler.Const(Base.FastMath.add_fast, false)
│   %144 = ss::Any
│   %145 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│   %146 = Base.getproperty(%145, :pow_fast)::Core.Compiler.Const(Base.FastMath.pow_fast, false)
│   %147 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│   %148 = Base.getproperty(%147, :add_fast)::Core.Compiler.Const(Base.FastMath.add_fast, false)
│   %149 = i@_19::Int64
│   %150 = (%148)(%149, m)::Int64
│   %151 = Base.getindex(x, %150)::Float32
│   %152 = (%146)(%151, Val{2}())::Float32
│          (ss = (%143)(%144, %152))
│   %154 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│   %155 = Base.getproperty(%154, :div_fast)::Core.Compiler.Const(Base.FastMath.div_fast, false)
│   %156 = s::Any
│   %157 = (%155)(%156, m)::Any
│   %158 = μ::Array{Float32,1}
│   %159 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│   %160 = Base.getproperty(%159, :add_fast)::Core.Compiler.Const(Base.FastMath.add_fast, false)
│   %161 = i@_19::Int64
│   %162 = (%160)(%161, 1)::Int64
│          Base.setindex!(%158, %157, %162)
│   %164 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│   %165 = Base.getproperty(%164, :sqrt_fast)::Core.Compiler.Const(Base.FastMath.sqrt_fast, false)
│   %166 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│   %167 = Base.getproperty(%166, :sub_fast)::Core.Compiler.Const(Base.FastMath.sub_fast, false)
│   %168 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│   %169 = Base.getproperty(%168, :div_fast)::Core.Compiler.Const(Base.FastMath.div_fast, false)
│   %170 = ss::Any
│   %171 = (%169)(%170, m)::Any
│   %172 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│   %173 = Base.getproperty(%172, :pow_fast)::Core.Compiler.Const(Base.FastMath.pow_fast, false)
│   %174 = μ::Array{Float32,1}
│   %175 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│   %176 = Base.getproperty(%175, :add_fast)::Core.Compiler.Const(Base.FastMath.add_fast, false)
│   %177 = i@_19::Int64
│   %178 = (%176)(%177, 1)::Int64
│   %179 = Base.getindex(%174, %178)::Float32
│   %180 = (%173)(%179, Val{2}())::Float32
│   %181 = (%167)(%171, %180)::Any
│   %182 = (%165)(%181)::Any
│   %183 = σ::Array{Float32,1}
│   %184 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│   %185 = Base.getproperty(%184, :add_fast)::Core.Compiler.Const(Base.FastMath.add_fast, false)
│   %186 = i@_19::Int64
│   %187 = (%185)(%186, 1)::Int64
│          Base.setindex!(%183, %182, %187)
│          (@_17 = Base.iterate(%112, %119))
│   %190 = (@_17 === nothing)::Bool
│   %191 = Base.not_int(%190)::Bool
└──        goto #9 if not %191
8 ─        goto #7
9 ┄        (val@_7 = nothing)
│          $(Expr(:inbounds, :pop))
│          val@_7
│   %197 = Core.tuple(μ, σ)::Tuple{Array{Float32,1},Array{Float32,1}}
└──        return %197

julia>

And versioninfo

(@v1.4) pkg> st LoopVectorization
Status `~/.julia/environments/v1.4/Project.toml`
  [bdcacae8] LoopVectorization v0.7.8

julia> versioninfo()
Julia Version 1.4.1
Commit 381693d3df (2020-04-14 17:20 UTC)
Platform Info:
  OS: Linux (x86_64-linux-gnu)
  CPU: Intel(R) Core(TM) i5-2500K CPU @ 3.30GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-8.0.1 (ORCJIT, sandybridge)
Environment:
  JULIA_EDITOR = atom  -a
  JULIA_NUM_THREADS = 4
  JULIA_CUDA_VERBOSE = true

baggepinnen · 2020-05-12T13:46:38Z

Here's the output without @avx on the first loop:

julia> @code_warntype running_mean_std(randn(Float32, 10), 3)
Variables
  #self#::Core.Compiler.Const(SlidingDistancesBase.running_mean_std, false)
  x::Array{Float32,1}
  m::Int64
  val::Nothing
  n::Int64
  ss::Float32
  s::Float32
  μ::Array{Float32,1}
  σ::Array{Float32,1}
  @_10::Union{Nothing, Tuple{Int64,Int64}}
  @_11::Union{Nothing, Tuple{Int64,Int64}}
  i@_12::Int64
  i@_13::Int64

Body::Tuple{Array{Float32,1},Array{Float32,1}}
1 ──        Core.NewvarNode(:(val))
│           Core.NewvarNode(:(n))
│           Core.NewvarNode(:(ss))
│           Core.NewvarNode(:(s))
│           Core.NewvarNode(:(μ))
│           Core.NewvarNode(:(σ))
│           Core.NewvarNode(:(@_10))
│           Core.NewvarNode(:(@_11))
│    %9   = SlidingDistancesBase.length(x)::Int64
│    %10  = (%9 >= m)::Bool
└───        goto #3 if not %10
2 ──        goto #4
3 ── %13  = Base.AssertionError("length(x) >= m")::AssertionError
└───        Base.throw(%13)
4 ┄─ %15  = SlidingDistancesBase.length(x)::Int64
│    %16  = (%15 - m)::Int64
│           (n = %16 + 1)
│    %18  = SlidingDistancesBase.float($(Expr(:static_parameter, 1)))::Core.Compiler.Const(Float32, false)
│    %19  = SlidingDistancesBase.zero(%18)::Core.Compiler.Const(0.0f0, false)
│           (ss = %19)
│           (s = %19)
│    %22  = SlidingDistancesBase.float($(Expr(:static_parameter, 1)))::Core.Compiler.Const(Float32, false)
│    %23  = Core.apply_type(SlidingDistancesBase.Vector, %22)::Core.Compiler.Const(Array{Float32,1}, false)
│           (μ = (%23)(SlidingDistancesBase.undef, n))
│    %25  = SlidingDistancesBase.float($(Expr(:static_parameter, 1)))::Core.Compiler.Const(Float32, false)
│    %26  = Core.apply_type(SlidingDistancesBase.Vector, %25)::Core.Compiler.Const(Array{Float32,1}, false)
│           (σ = (%26)(SlidingDistancesBase.undef, n))
│    %28  = (1:m)::Core.Compiler.PartialStruct(UnitRange{Int64}, Any[Core.Compiler.Const(1, false), Int64])
│           (@_10 = Base.iterate(%28))
│    %30  = (@_10 === nothing)::Bool
│    %31  = Base.not_int(%30)::Bool
└───        goto #7 if not %31
5 ┄─ %33  = @_10::Tuple{Int64,Int64}::Tuple{Int64,Int64}
│           (i@_12 = Core.getfield(%33, 1))
│    %35  = Core.getfield(%33, 2)::Int64
│    %36  = s::Float32
│    %37  = Base.getindex(x, i@_12)::Float32
│           (s = %36 + %37)
│    %39  = ss::Float32
│    %40  = Base.getindex(x, i@_12)::Float32
│    %41  = Core.apply_type(Base.Val, 2)::Core.Compiler.Const(Val{2}, false)
│    %42  = (%41)()::Core.Compiler.Const(Val{2}(), false)
│    %43  = Base.literal_pow(SlidingDistancesBase.:^, %40, %42)::Float32
│           (ss = %39 + %43)
│           (@_10 = Base.iterate(%28, %35))
│    %46  = (@_10 === nothing)::Bool
│    %47  = Base.not_int(%46)::Bool
└───        goto #7 if not %47
6 ──        goto #5
7 ┄─ %50  = (s / m)::Float32
│           Base.setindex!(μ, %50, 1)
│    %52  = (ss / m)::Float32
│    %53  = Base.getindex(μ, 1)::Float32
│    %54  = Core.apply_type(Base.Val, 2)::Core.Compiler.Const(Val{2}, false)
│    %55  = (%54)()::Core.Compiler.Const(Val{2}(), false)
│    %56  = Base.literal_pow(SlidingDistancesBase.:^, %53, %55)::Float32
│    %57  = (%52 - %56)::Float32
│    %58  = SlidingDistancesBase.sqrt(%57)::Float32
│           Base.setindex!(σ, %58, 1)
│           $(Expr(:inbounds, true))
│    %61  = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│    %62  = Base.getproperty(%61, :sub_fast)::Core.Compiler.Const(Base.FastMath.sub_fast, false)
│    %63  = n::Int64
│    %64  = (%62)(%63, 1)::Int64
│    %65  = (1:%64)::Core.Compiler.PartialStruct(UnitRange{Int64}, Any[Core.Compiler.Const(1, false), Int64])
│           (@_11 = Base.iterate(%65))
│    %67  = (@_11 === nothing)::Bool
│    %68  = Base.not_int(%67)::Bool
└───        goto #10 if not %68
8 ┄─ %70  = @_11::Tuple{Int64,Int64}::Tuple{Int64,Int64}
│           (i@_13 = Core.getfield(%70, 1))
│    %72  = Core.getfield(%70, 2)::Int64
│    %73  = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│    %74  = Base.getproperty(%73, :sub_fast)::Core.Compiler.Const(Base.FastMath.sub_fast, false)
│    %75  = s::Float32
│    %76  = Base.getindex(x, i@_13)::Float32
│           (s = (%74)(%75, %76))
│    %78  = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│    %79  = Base.getproperty(%78, :sub_fast)::Core.Compiler.Const(Base.FastMath.sub_fast, false)
│    %80  = ss::Float32
│    %81  = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│    %82  = Base.getproperty(%81, :pow_fast)::Core.Compiler.Const(Base.FastMath.pow_fast, false)
│    %83  = Base.getindex(x, i@_13)::Float32
│    %84  = (%82)(%83, Val{2}())::Float32
│           (ss = (%79)(%80, %84))
│    %86  = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│    %87  = Base.getproperty(%86, :add_fast)::Core.Compiler.Const(Base.FastMath.add_fast, false)
│    %88  = s::Float32
│    %89  = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│    %90  = Base.getproperty(%89, :add_fast)::Core.Compiler.Const(Base.FastMath.add_fast, false)
│    %91  = i@_13::Int64
│    %92  = (%90)(%91, m)::Int64
│    %93  = Base.getindex(x, %92)::Float32
│           (s = (%87)(%88, %93))
│    %95  = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│    %96  = Base.getproperty(%95, :add_fast)::Core.Compiler.Const(Base.FastMath.add_fast, false)
│    %97  = ss::Float32
│    %98  = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│    %99  = Base.getproperty(%98, :pow_fast)::Core.Compiler.Const(Base.FastMath.pow_fast, false)
│    %100 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│    %101 = Base.getproperty(%100, :add_fast)::Core.Compiler.Const(Base.FastMath.add_fast, false)
│    %102 = i@_13::Int64
│    %103 = (%101)(%102, m)::Int64
│    %104 = Base.getindex(x, %103)::Float32
│    %105 = (%99)(%104, Val{2}())::Float32
│           (ss = (%96)(%97, %105))
│    %107 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│    %108 = Base.getproperty(%107, :div_fast)::Core.Compiler.Const(Base.FastMath.div_fast, false)
│    %109 = s::Float32
│    %110 = (%108)(%109, m)::Float32
│    %111 = μ::Array{Float32,1}
│    %112 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│    %113 = Base.getproperty(%112, :add_fast)::Core.Compiler.Const(Base.FastMath.add_fast, false)
│    %114 = i@_13::Int64
│    %115 = (%113)(%114, 1)::Int64
│           Base.setindex!(%111, %110, %115)
│    %117 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│    %118 = Base.getproperty(%117, :sqrt_fast)::Core.Compiler.Const(Base.FastMath.sqrt_fast, false)
│    %119 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│    %120 = Base.getproperty(%119, :sub_fast)::Core.Compiler.Const(Base.FastMath.sub_fast, false)
│    %121 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│    %122 = Base.getproperty(%121, :div_fast)::Core.Compiler.Const(Base.FastMath.div_fast, false)
│    %123 = ss::Float32
│    %124 = (%122)(%123, m)::Float32
│    %125 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│    %126 = Base.getproperty(%125, :pow_fast)::Core.Compiler.Const(Base.FastMath.pow_fast, false)
│    %127 = μ::Array{Float32,1}
│    %128 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│    %129 = Base.getproperty(%128, :add_fast)::Core.Compiler.Const(Base.FastMath.add_fast, false)
│    %130 = i@_13::Int64
│    %131 = (%129)(%130, 1)::Int64
│    %132 = Base.getindex(%127, %131)::Float32
│    %133 = (%126)(%132, Val{2}())::Float32
│    %134 = (%120)(%124, %133)::Float32
│    %135 = (%118)(%134)::Float32
│    %136 = σ::Array{Float32,1}
│    %137 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│    %138 = Base.getproperty(%137, :add_fast)::Core.Compiler.Const(Base.FastMath.add_fast, false)
│    %139 = i@_13::Int64
│    %140 = (%138)(%139, 1)::Int64
│           Base.setindex!(%136, %135, %140)
│           (@_11 = Base.iterate(%65, %72))
│    %143 = (@_11 === nothing)::Bool
│    %144 = Base.not_int(%143)::Bool
└───        goto #10 if not %144
9 ──        goto #8
10 ┄        (val = nothing)
│           $(Expr(:inbounds, :pop))
│           val
│    %150 = Core.tuple(μ, σ)::Tuple{Array{Float32,1},Array{Float32,1}}
└───        return %150

chriselrod · 2020-05-12T13:50:43Z

The second loop throws errors, which is why I got inference failures there.

I did not get inference failures with Julia 1.1 or Julia master, but I'll try 1.4.
Still not seeing the inference issues:

julia> @code_warntype running_mean_std(randn(Float32, 10), 3) # Not fine
Variables
  #self#::Core.Compiler.Const(running_mean_std, false)
  x::Array{Float32,1}
  m::Int32
  ss_0::NTuple{16,VecElement{Float32}}
  s_0::NTuple{16,VecElement{Float32}}
  val@_6::Union{}
  val@_7::Nothing
  n::Int32
  μ::Array{Float32,1}
  σ::Array{Float32,1}
  vptr##_x::VectorizationBase.PackedStridedPointer{Float32,0}
  @_12::Tuple{NTuple{16,VecElement{Float32}},NTuple{16,VecElement{Float32}}}
  @_13::Int32
  s::Float32
  ss::Float32
  @_16::Union{}
  @_17::Union{Nothing, Tuple{Int32,Int32}}
  i@_18::Union{}
  i@_19::Int32

Body::Tuple{Array{Float32,1},Array{Float32,1}}
1 ─        Core.NewvarNode(:(ss_0))
│          Core.NewvarNode(:(s_0))
│          Core.NewvarNode(:(val@_6))
│          Core.NewvarNode(:(val@_7))
│          Core.NewvarNode(:(n))
│          Core.NewvarNode(:(μ))
│          Core.NewvarNode(:(σ))
│          Core.NewvarNode(:(vptr##_x))
│          Core.NewvarNode(:(@_12))
│          Core.NewvarNode(:(@_13))
│          Core.NewvarNode(:(s))
│          Core.NewvarNode(:(ss))
│          Core.NewvarNode(:(@_16))
│          Core.NewvarNode(:(@_17))
│   %15  = Main.length(x)::Int32
│   %16  = (%15 >= m)::Bool
└──        goto #3 if not %16
2 ─        goto #4
3 ─ %19  = Base.AssertionError("length(x) >= m")::AssertionError
└──        Base.throw(%19)
4 ┄ %21  = Main.length(x)::Int32
│   %22  = (%21 - m)::Int32
│          (n = %22 + 1)
│   %24  = Main.float($(Expr(:static_parameter, 1)))::Core.Compiler.Const(Float32, false)
│   %25  = Main.zero(%24)::Core.Compiler.Const(0.0f0, false)
│          (ss = %25)
│          (s = %25)
│   %28  = Main.float($(Expr(:static_parameter, 1)))::Core.Compiler.Const(Float32, false)
│   %29  = Core.apply_type(Main.Vector, %28)::Core.Compiler.Const(Array{Float32,1}, false)
│          (μ = (%29)(Main.undef, n))
│   %31  = Main.float($(Expr(:static_parameter, 1)))::Core.Compiler.Const(Float32, false)
│   %32  = Core.apply_type(Main.Vector, %31)::Core.Compiler.Const(Array{Float32,1}, false)
│          (σ = (%32)(Main.undef, n))
│   %34  = LoopVectorization.check_args(x)::Core.Compiler.Const(true, false)
│          %34
│          (vptr##_x = LoopVectorization.stridedpointer(x))
│   %37  = $(Expr(:gc_preserve_begin, :(x)))
│   %38  = Core.apply_type(Main.Val, (0, 0, 0))::Core.Compiler.Const(Val{(0, 0, 0)}, false)
│   %39  = (%38)()::Core.Compiler.Const(Val{(0, 0, 0)}(), false)
│   %40  = Core.apply_type(Main.Tuple, :LoopVectorization, :LOOPCONSTANTINSTRUCTION, LoopVectorization.OperationStruct(0x0000000000000000, 0x0000000000000000, 0x0000000000000001, 0x0000000000000000, LoopVectorization.constant, 0x00, 0x01), :LoopVectorization, :getindex, LoopVectorization.OperationStruct(0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, LoopVectorization.memload, 0x01, 0x02), :LoopVectorization, :vadd, LoopVectorization.OperationStruct(0x0000000000000001, 0x0000000000000001, 0x0000000000000000, 0x0000000000000102, LoopVectorization.compute, 0x00, 0x01), :LoopVectorization, :LOOPCONSTANTINSTRUCTION, LoopVectorization.OperationStruct(0x0000000000000000, 0x0000000000000000, 0x0000000000000001, 0x0000000000000000, LoopVectorization.constant, 0x00, 0x03), :LoopVectorization, :vabs2, LoopVectorization.OperationStruct(0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000002, LoopVectorization.compute, 0x00, 0x04), :LoopVectorization, :vadd, LoopVectorization.OperationStruct(0x0000000000000001, 0x0000000000000001, 0x0000000000000000, 0x0000000000000405, LoopVectorization.compute, 0x00, 0x03))::Core.Compiler.Const(Tuple{:LoopVectorization,:LOOPCONSTANTINSTRUCTION,LoopVectorization.OperationStruct(0x0000000000000000, 0x0000000000000000, 0x0000000000000001, 0x0000000000000000, LoopVectorization.constant, 0x00, 0x01),:LoopVectorization,:getindex,LoopVectorization.OperationStruct(0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, LoopVectorization.memload, 0x01, 0x02),:LoopVectorization,:vadd,LoopVectorization.OperationStruct(0x0000000000000001, 0x0000000000000001, 0x0000000000000000, 0x0000000000000102, LoopVectorization.compute, 0x00, 0x01),:LoopVectorization,:LOOPCONSTANTINSTRUCTION,LoopVectorization.OperationStruct(0x0000000000000000, 0x0000000000000000, 0x0000000000000001, 0x0000000000000000, LoopVectorization.constant, 0x00, 
0x03),:LoopVectorization,:vabs2,LoopVectorization.OperationStruct(0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000002, LoopVectorization.compute, 0x00, 0x04),:LoopVectorization,:vadd,LoopVectorization.OperationStruct(0x0000000000000001, 0x0000000000000001, 0x0000000000000000, 0x0000000000000405, LoopVectorization.compute, 0x00, 0x03)}, false)
│   %41  = Core.apply_type(Main.Tuple, LoopVectorization.ArrayRefStruct{:x,Symbol("##vptr##_x")}(0x0000000000000001, 0x0000000000000001, 0x0000000000000000))::Core.Compiler.Const(Tuple{LoopVectorization.ArrayRefStruct{:x,Symbol("##vptr##_x")}(0x0000000000000001, 0x0000000000000001, 0x0000000000000000)}, false)
│   %42  = Core.apply_type(Main.Tuple, 3, 6)::Core.Compiler.Const(Tuple{3,6}, false)
│   %43  = Core.apply_type(Main.Tuple, 1, 4)::Core.Compiler.Const(Tuple{1,4}, false)
│   %44  = Core.apply_type(Main.Tuple)::Core.Compiler.Const(Tuple{}, false)
│   %45  = Core.apply_type(Main.Tuple)::Core.Compiler.Const(Tuple{}, false)
│   %46  = Core.apply_type(Main.Tuple)::Core.Compiler.Const(Tuple{}, false)
│   %47  = Core.apply_type(Main.Tuple)::Core.Compiler.Const(Tuple{}, false)
│   %48  = Core.apply_type(Main.Tuple, 0, %42, %43, %44, %45, %46, %47)::Core.Compiler.Const(Tuple{0,Tuple{3,6},Tuple{1,4},Tuple{},Tuple{},Tuple{},Tuple{}}, false)
│   %49  = Core.apply_type(Main.Tuple, :i)::Core.Compiler.Const(Tuple{:i}, false)
│   %50  = Core.apply_type(LoopVectorization.StaticLowerUnitRange, 1)::Core.Compiler.Const(VectorizationBase.StaticLowerUnitRange{1}, false)
│   %51  = (%50)(m)::VectorizationBase.StaticLowerUnitRange{1}
│   %52  = Core.tuple(%51)::Tuple{VectorizationBase.StaticLowerUnitRange{1}}
│   %53  = vptr##_x::VectorizationBase.PackedStridedPointer{Float32,0}
│   %54  = s::Core.Compiler.Const(0.0f0, false)::Core.Compiler.Const(0.0f0, false)
│   %55  = LoopVectorization._avx_!(%39, %40, %41, %48, %49, %52, %53, %54, ss::Core.Compiler.Const(0.0f0, false))::Tuple{NTuple{16,VecElement{Float32}},NTuple{16,VecElement{Float32}}}
│   %56  = Base.indexed_iterate(%55, 1)::Core.Compiler.PartialStruct(Tuple{NTuple{16,VecElement{Float32}},Int32}, Any[NTuple{16,VecElement{Float32}}, Core.Compiler.Const(2, false)])
│          (s_0 = Core.getfield(%56, 1))
│          (@_13 = Core.getfield(%56, 2))
│   %59  = Base.indexed_iterate(%55, 2, @_13::Core.Compiler.Const(2, false))::Core.Compiler.PartialStruct(Tuple{NTuple{16,VecElement{Float32}},Int32}, Any[NTuple{16,VecElement{Float32}}, Core.Compiler.Const(3, false)])
│          (ss_0 = Core.getfield(%59, 1))
│          (@_12 = %55)
│          $(Expr(:gc_preserve_end, :(%37)))
│          @_12
│          (s = LoopVectorization.reduced_add(s_0, s::Core.Compiler.Const(0.0f0, false)))
│          (ss = LoopVectorization.reduced_add(ss_0, ss::Core.Compiler.Const(0.0f0, false)))
└──        goto #6
5 ─        $(Expr(:inbounds, true))
│          Core.Compiler.Const(:(1:m), false)
│          Core.Compiler.Const(:(@_16 = Base.iterate(%68)), false)
│          Core.Compiler.Const(:(@_16 === nothing), false)
│          Core.Compiler.Const(:(Base.not_int(%70)), false)
│          Core.Compiler.Const(:(%71), false)
│          Core.Compiler.Const(:(@_16), false)
│          Core.Compiler.Const(:(i@_18 = Core.getfield(%73, 1)), false)
│          Core.Compiler.Const(:(Core.getfield(%73, 2)), false)
│          Core.Compiler.Const(:(Base.FastMath), false)
│          Core.Compiler.Const(:(Base.getproperty(%76, :add_fast)), false)
│          Core.Compiler.Const(:(s), false)
│          Core.Compiler.Const(:(Base.getindex(x, i@_18)), false)
│          Core.Compiler.Const(:(s = (%77)(%78, %79)), false)
│          Core.Compiler.Const(:(Base.FastMath), false)
│          Core.Compiler.Const(:(Base.getproperty(%81, :add_fast)), false)
│          Core.Compiler.Const(:(ss), false)
│          Core.Compiler.Const(:(Base.FastMath), false)
│          Core.Compiler.Const(:(Base.getproperty(%84, :pow_fast)), false)
│          Core.Compiler.Const(:(Base.getindex(x, i@_18)), false)
│          Core.Compiler.Const(:((%85)(%86, Val{2}())), false)
│          Core.Compiler.Const(:(ss = (%82)(%83, %87)), false)
│          Core.Compiler.Const(:(@_16 = Base.iterate(%68, %75)), false)
│          Core.Compiler.Const(:(@_16 === nothing), false)
│          Core.Compiler.Const(:(Base.not_int(%90)), false)
│          Core.Compiler.Const(:(%91), false)
│          Core.Compiler.Const(:(goto %73), false)
│          Core.Compiler.Const(:(val@_6 = nothing), false)
│          $(Expr(:inbounds, :pop))
└──        Core.Compiler.Const(:(val@_6), false)
6 ┄ %97  = (s / m)::Float32
│          Base.setindex!(μ, %97, 1)
│   %99  = (ss / m)::Float32
│   %100 = Base.getindex(μ, 1)::Float32
│   %101 = Core.apply_type(Base.Val, 2)::Core.Compiler.Const(Val{2}, false)
│   %102 = (%101)()::Core.Compiler.Const(Val{2}(), false)
│   %103 = Base.literal_pow(Main.:^, %100, %102)::Float32
│   %104 = (%99 - %103)::Float32
│   %105 = Main.sqrt(%104)::Float32
│          Base.setindex!(σ, %105, 1)
│          $(Expr(:inbounds, true))
│   %108 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│   %109 = Base.getproperty(%108, :sub_fast)::Core.Compiler.Const(Base.FastMath.sub_fast, false)
│   %110 = n::Int32
│   %111 = (%109)(%110, 1)::Int32
│   %112 = (1:%111)::Core.Compiler.PartialStruct(UnitRange{Int32}, Any[Core.Compiler.Const(1, false), Int32])
│          (@_17 = Base.iterate(%112))
│   %114 = (@_17 === nothing)::Bool
│   %115 = Base.not_int(%114)::Bool
└──        goto #9 if not %115
7 ┄ %117 = @_17::Tuple{Int32,Int32}::Tuple{Int32,Int32}
│          (i@_19 = Core.getfield(%117, 1))
│   %119 = Core.getfield(%117, 2)::Int32
│   %120 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│   %121 = Base.getproperty(%120, :sub_fast)::Core.Compiler.Const(Base.FastMath.sub_fast, false)
│   %122 = s::Float32
│   %123 = Base.getindex(x, i@_19)::Float32
│          (s = (%121)(%122, %123))
│   %125 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│   %126 = Base.getproperty(%125, :sub_fast)::Core.Compiler.Const(Base.FastMath.sub_fast, false)
│   %127 = ss::Float32
│   %128 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│   %129 = Base.getproperty(%128, :pow_fast)::Core.Compiler.Const(Base.FastMath.pow_fast, false)
│   %130 = Base.getindex(x, i@_19)::Float32
│   %131 = (%129)(%130, Val{2}())::Float32
│          (ss = (%126)(%127, %131))
│   %133 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│   %134 = Base.getproperty(%133, :add_fast)::Core.Compiler.Const(Base.FastMath.add_fast, false)
│   %135 = s::Float32
│   %136 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│   %137 = Base.getproperty(%136, :add_fast)::Core.Compiler.Const(Base.FastMath.add_fast, false)
│   %138 = i@_19::Int32
│   %139 = (%137)(%138, m)::Int32
│   %140 = Base.getindex(x, %139)::Float32
│          (s = (%134)(%135, %140))
│   %142 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│   %143 = Base.getproperty(%142, :add_fast)::Core.Compiler.Const(Base.FastMath.add_fast, false)
│   %144 = ss::Float32
│   %145 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│   %146 = Base.getproperty(%145, :pow_fast)::Core.Compiler.Const(Base.FastMath.pow_fast, false)
│   %147 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│   %148 = Base.getproperty(%147, :add_fast)::Core.Compiler.Const(Base.FastMath.add_fast, false)
│   %149 = i@_19::Int32
│   %150 = (%148)(%149, m)::Int32
│   %151 = Base.getindex(x, %150)::Float32
│   %152 = (%146)(%151, Val{2}())::Float32
│          (ss = (%143)(%144, %152))
│   %154 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│   %155 = Base.getproperty(%154, :div_fast)::Core.Compiler.Const(Base.FastMath.div_fast, false)
│   %156 = s::Float32
│   %157 = (%155)(%156, m)::Float32
│   %158 = μ::Array{Float32,1}
│   %159 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│   %160 = Base.getproperty(%159, :add_fast)::Core.Compiler.Const(Base.FastMath.add_fast, false)
│   %161 = i@_19::Int32
│   %162 = (%160)(%161, 1)::Int32
│          Base.setindex!(%158, %157, %162)
│   %164 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│   %165 = Base.getproperty(%164, :sqrt_fast)::Core.Compiler.Const(Base.FastMath.sqrt_fast, false)
│   %166 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│   %167 = Base.getproperty(%166, :sub_fast)::Core.Compiler.Const(Base.FastMath.sub_fast, false)
│   %168 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│   %169 = Base.getproperty(%168, :div_fast)::Core.Compiler.Const(Base.FastMath.div_fast, false)
│   %170 = ss::Float32
│   %171 = (%169)(%170, m)::Float32
│   %172 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│   %173 = Base.getproperty(%172, :pow_fast)::Core.Compiler.Const(Base.FastMath.pow_fast, false)
│   %174 = μ::Array{Float32,1}
│   %175 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│   %176 = Base.getproperty(%175, :add_fast)::Core.Compiler.Const(Base.FastMath.add_fast, false)
│   %177 = i@_19::Int32
│   %178 = (%176)(%177, 1)::Int32
│   %179 = Base.getindex(%174, %178)::Float32
│   %180 = (%173)(%179, Val{2}())::Float32
│   %181 = (%167)(%171, %180)::Float32
│   %182 = (%165)(%181)::Float32
│   %183 = σ::Array{Float32,1}
│   %184 = Base.FastMath::Core.Compiler.Const(Base.FastMath, false)
│   %185 = Base.getproperty(%184, :add_fast)::Core.Compiler.Const(Base.FastMath.add_fast, false)
│   %186 = i@_19::Int32
│   %187 = (%185)(%186, 1)::Int32
│          Base.setindex!(%183, %182, %187)
│          (@_17 = Base.iterate(%112, %119))
│   %190 = (@_17 === nothing)::Bool
│   %191 = Base.not_int(%190)::Bool
└──        goto #9 if not %191
8 ─        goto #7
9 ┄        (val@_7 = nothing)
│          $(Expr(:inbounds, :pop))
│          val@_7
│   %197 = Core.tuple(μ, σ)::Tuple{Array{Float32,1},Array{Float32,1}}
└──        return %197

julia> versioninfo()
Julia Version 1.4.1
Commit 381693d3df* (2020-04-14 17:20 UTC)
Platform Info:
  OS: Linux (i686-pc-linux-gnu)
  CPU: Intel(R) Core(TM) i9-7900X CPU @ 3.30GHz
  WORD_SIZE: 32
  LIBM: libopenlibm
  LLVM: libLLVM-8.0.1 (ORCJIT, skylake)

baggepinnen · 2020-05-12T13:55:57Z

I get the problem with this shorter version as well

# Reduced reproduction of the inference problem: only the `@avx` accumulation loop is
# kept. It sums the first `m` elements of `x` into `s` and their squares into `ss`.
# The code that would fill `μ` and `σ` is commented out below, so this version returns
# both vectors exactly as allocated with `undef` (their contents are never written).
function running_mean_std(x::AbstractArray{T}, m) where T
    @assert length(x) >= m
    n = length(x)-m+1 # length of the two output vectors
    s = ss = zero(float(T)) # both accumulators start as a zero of the floating-point type of T
    μ = Vector{float(T)}(undef, n)
    σ = Vector{float(T)}(undef, n)
    # The `@avx` on this loop is the subject of
    # https://github.com/chriselrod/LoopVectorization.jl/issues/114
    # (the accumulators `s` and `ss` fail to infer for Float32 input).
    @avx for i = 1:m
        s  += x[i]
        ss += x[i]^2
    end
    # Remainder of the original function, disabled to shrink the reproduction:
    # μ[1] = s/m
    # σ[1] = sqrt(max(ss/m - μ[1]^2, 0))
    # @fastmath @inbounds for i = 1:n-1 # fastmath making it more accurate here as well, but not faster
    #     s -= x[i]
    #     ss -= x[i]^2
    #     s += x[i+m]
    #     ss += x[i+m]^2
    #     μ[i+1] = s/m
    #     σ[i+1] = sqrt(max(ss/m - μ[i+1]^2, 0))
    # end
    μ,σ
end

julia> @code_warntype running_mean_std(randn(Float32, 10), 3)
Variables
  #self#::Core.Compiler.Const(SlidingDistancesBase.running_mean_std, false)
  x::Array{Float32,1}
  m::Int64
  ss_0::Any
  s_0::Any
  val::Union{}
  n::Int64
  μ::Array{Float32,1}
  σ::Array{Float32,1}
  vptr##_x::VectorizationBase.PackedStridedPointer{Float32,0}
  @_11::Tuple{Any,Any}
  @_12::Int64
  s::Any
  ss::Any
  @_15::Union{}
  i::Union{}

The accumulators s and ss are inferred as Any.

chriselrod · 2020-05-12T13:57:59Z

Perhaps the problem is specific to code gen on your architecture.

Could you use Cthulhu and @descend_code_warntype to descend into the

LoopVectorization._avx_!(%39, %40, %41, %48, %49, %52, %53, %54, ss::Core.Compiler.Const(0.0f0, false))::Tuple{Any,Any}

call, and post the warntype info from that?

If I can see where the type instability is being introduced, I can hopefully figure out what is causing it even though I still haven't been able to reproduce it locally.

baggepinnen · 2020-05-12T14:01:16Z

I don't seem to get a call to _avx_! — here are the options:

  %28  = call Type(::Any)::Union{Unsigned, VectorizationBase.SVec{_A,_B} where _B where _A}

│ ─ %-1  = invoke running_mean_std(::Array{Float32,1},::Int64)::Tuple{Array{Float32,1},Array{Float32,1}}
Body::Tuple{Array{Float32,1},Array{Float32,1}}
1 ── %1   = Base.arraylen(x)::Int64
│    %2   = Base.sle_int(m, %1)::Bool
└───        goto #23 if not %2
2 ── %4   = Base.arraylen(x)::Int64
│    %5   = Base.sub_int(%4, m)::Int64
│    %6   = Base.add_int(%5, 1)::Int64
│    %7   = $(Expr(:foreigncall, :(:jl_alloc_array_1d), Array{Float32,1}, svec(Any, Int64), 0, :(:ccall), Array{Float32,1}, :(%6), :(%6)))::Array{Float32,1}
│    %8   = $(Expr(:foreigncall, :(:jl_alloc_array_1d), Array{Float32,1}, svec(Any, Int64), 0, :(:ccall), Array{Float32,1}, :(%6), :(%6)))::Array{Float32,1}
│    %9   = $(Expr(:foreigncall, :(:jl_array_ptr), Ptr{Float32}, svec(Any), 0, :(:ccall), :(x)))::Ptr{Float32}
│           Base.arraysize(x, 1)
│    %11  = $(Expr(:gc_preserve_begin, :(x)))
│    %12  = Base.trunc_int(UInt8, m)::UInt8
│    %13  = Base.sub_int(%12, 0x01)::UInt8
│    %14  = Base.and_int(%13, 0x07)::UInt8
│    %15  = Base.add_int(%14, 0x01)::UInt8
│    %16  = Base.sub_int(0x08, %15)::UInt8
│    %17  = Base.lshr_int(0xff, %16)::UInt8
│    %18  = %new(VectorizationBase.Mask{8,UInt8}, %17)::VectorizationBase.Mask{8,UInt8}
│    %19  = invoke VectorizationBase.vzero(NTuple{8,VecElement{Float32}}::Type{NTuple{8,VecElement{Float32}}})::Any
│    %20  = VectorizationBase.SVec(%19)::Union{Unsigned, VectorizationBase.SVec{_A,_B} where _B where _A}
│    %21  = invoke VectorizationBase.vzero(NTuple{8,VecElement{Float32}}::Type{NTuple{8,VecElement{Float32}}})::Any
│    %22  = VectorizationBase.SVec(%21)::Union{Unsigned, VectorizationBase.SVec{_A,_B} where _B where _A}
│    %23  = invoke VectorizationBase.vzero(NTuple{8,VecElement{Float32}}::Type{NTuple{8,VecElement{Float32}}})::Any
│    %24  = VectorizationBase.SVec(%23)::Union{Unsigned, VectorizationBase.SVec{_A,_B} where _B where _A}
│    %25  = invoke VectorizationBase.vzero(NTuple{8,VecElement{Float32}}::Type{NTuple{8,VecElement{Float32}}})::Any
│    %26  = VectorizationBase.SVec(%25)::Union{Unsigned, VectorizationBase.SVec{_A,_B} where _B where _A}
│    %27  = invoke VectorizationBase.vzero(NTuple{8,VecElement{Float32}}::Type{NTuple{8,VecElement{Float32}}})::Any
│    %28  = VectorizationBase.SVec(%27)::Union{Unsigned, VectorizationBase.SVec{_A,_B} where _B where _A}
│    %29  = invoke VectorizationBase.vzero(NTuple{8,VecElement{Float32}}::Type{NTuple{8,VecElement{Float32}}})::Any
│    %30  = VectorizationBase.SVec(%29)::Union{Unsigned, VectorizationBase.SVec{_A,_B} where _B where _A}
│    %31  = invoke VectorizationBase.vzero(NTuple{8,VecElement{Float32}}::Type{NTuple{8,VecElement{Float32}}})::Any
│    %32  = VectorizationBase.SVec(%31)::Union{Unsigned, VectorizationBase.SVec{_A,_B} where _B where _A}
│    %33  = invoke VectorizationBase.vzero(NTuple{8,VecElement{Float32}}::Type{NTuple{8,VecElement{Float32}}})::Any
└─── %34  = VectorizationBase.SVec(%33)::Union{Unsigned, VectorizationBase.SVec{_A,_B} where _B where _A}
3 ┄─ %35  = φ (#2 => %20, #4 => %86)::Any
│    %36  = φ (#2 => %22, #4 => %87)::Any
│    %37  = φ (#2 => %24, #4 => %88)::Any
│    %38  = φ (#2 => %26, #4 => %89)::Any
│    %39  = φ (#2 => %28, #4 => %102)::Any
│    %40  = φ (#2 => %30, #4 => %103)::Any
│    %41  = φ (#2 => %32, #4 => %104)::Any
│    %42  = φ (#2 => %34, #4 => %105)::Any
│    %43  = φ (#2 => 1, #4 => %106)::Int64
│    %44  = φ (#2 => 1, #4 => %106)::Int64
│    %45  = φ (#2 => 1, #4 => %106)::Int64
│    %46  = φ (#2 => 1, #4 => %106)::Int64
│    %47  = φ (#2 => 1, #4 => %106)::Int64
│    %48  = φ (#2 => 1, #4 => %106)::Int64
│    %49  = φ (#2 => 1, #4 => %106)::Int64
│    %50  = φ (#2 => 1, #4 => %106)::Int64
│    %51  = φ (#2 => 1, #4 => %106)::Int64
│    %52  = φ (#2 => 1, #4 => %106)::Int64
│    %53  = φ (#2 => 1, #4 => %106)::Int64
│    %54  = φ (#2 => 1, #4 => %106)::Int64
│    %55  = φ (#2 => 1, #4 => %106)::Int64
│    %56  = φ (#2 => 1, #4 => %106)::Int64
│    %57  = φ (#2 => 1, #4 => %106)::Int64
│    %58  = φ (#2 => 1, #4 => %106)::Int64
│    %59  = φ (#2 => 1, #4 => %106)::Int64
│    %60  = φ (#2 => 1, #4 => %106)::Int64
│    %61  = φ (#2 => 1, #4 => %106)::Int64
│    %62  = φ (#2 => 1, #4 => %106)::Int64
│    %63  = φ (#2 => 1, #4 => %106)::Int64
│    %64  = Base.sub_int(m, 30)::Int64
│    %65  = Base.slt_int(%43, %64)::Bool
└───        goto #5 if not %65
4 ── %67  = Base.sub_int(%44, 1)::Int64
│    %68  = Base.llvmcall::Core.IntrinsicFunction
│    %69  = (%68)(("!1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n", "%typptr = inttoptr i64 %0 to float*\n%offsetptr = getelementptr inbounds float, float* %typptr, i64 %1\n%ptr = bitcast float* %offsetptr to <8 x float>*\n%res = load <8 x float>, <8 x float>* %ptr, align 4, !alias.scope !3\nret <8 x float> %res"), NTuple{8,VecElement{Float32}}, Tuple{Ptr{Float32},Int64}, %9, %67)::NTuple{8,VecElement{Float32}}
│    %70  = %new(VectorizationBase.SVec{8,Float32}, %69)::VectorizationBase.SVec{8,Float32}
│    %71  = Base.add_int(8, %45)::Int64
│    %72  = Base.sub_int(%71, 1)::Int64
│    %73  = Base.llvmcall::Core.IntrinsicFunction
│    %74  = (%73)(("!1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n", "%typptr = inttoptr i64 %0 to float*\n%offsetptr = getelementptr inbounds float, float* %typptr, i64 %1\n%ptr = bitcast float* %offsetptr to <8 x float>*\n%res = load <8 x float>, <8 x float>* %ptr, align 4, !alias.scope !3\nret <8 x float> %res"), NTuple{8,VecElement{Float32}}, Tuple{Ptr{Float32},Int64}, %9, %72)::NTuple{8,VecElement{Float32}}
│    %75  = %new(VectorizationBase.SVec{8,Float32}, %74)::VectorizationBase.SVec{8,Float32}
│    %76  = Base.add_int(16, %46)::Int64
│    %77  = Base.sub_int(%76, 1)::Int64
│    %78  = Base.llvmcall::Core.IntrinsicFunction
│    %79  = (%78)(("!1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n", "%typptr = inttoptr i64 %0 to float*\n%offsetptr = getelementptr inbounds float, float* %typptr, i64 %1\n%ptr = bitcast float* %offsetptr to <8 x float>*\n%res = load <8 x float>, <8 x float>* %ptr, align 4, !alias.scope !3\nret <8 x float> %res"), NTuple{8,VecElement{Float32}}, Tuple{Ptr{Float32},Int64}, %9, %77)::NTuple{8,VecElement{Float32}}
│    %80  = %new(VectorizationBase.SVec{8,Float32}, %79)::VectorizationBase.SVec{8,Float32}
│    %81  = Base.add_int(24, %47)::Int64
│    %82  = Base.sub_int(%81, 1)::Int64
│    %83  = Base.llvmcall::Core.IntrinsicFunction
│    %84  = (%83)(("!1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n", "%typptr = inttoptr i64 %0 to float*\n%offsetptr = getelementptr inbounds float, float* %typptr, i64 %1\n%ptr = bitcast float* %offsetptr to <8 x float>*\n%res = load <8 x float>, <8 x float>* %ptr, align 4, !alias.scope !3\nret <8 x float> %res"), NTuple{8,VecElement{Float32}}, Tuple{Ptr{Float32},Int64}, %9, %82)::NTuple{8,VecElement{Float32}}
│    %85  = %new(VectorizationBase.SVec{8,Float32}, %84)::VectorizationBase.SVec{8,Float32}
│    %86  = LoopVectorization.vadd(%35, %70)::Any
│    %87  = LoopVectorization.vadd(%36, %75)::Any
│    %88  = LoopVectorization.vadd(%37, %80)::Any
│    %89  = LoopVectorization.vadd(%38, %85)::Any
│    %90  = Base.llvmcall::Core.IntrinsicFunction
│    %91  = (%90)(("", "%res = fmul fast <8 x float> %0, %1\nret <8 x float> %res"), NTuple{8,VecElement{Float32}}, Tuple{NTuple{8,VecElement{Float32}},NTuple{8,VecElement{Float32}}}, %69, %69)::NTuple{8,VecElement{Float32}}
│    %92  = %new(VectorizationBase.SVec{8,Float32}, %91)::VectorizationBase.SVec{8,Float32}
│    %93  = Base.llvmcall::Core.IntrinsicFunction
│    %94  = (%93)(("", "%res = fmul fast <8 x float> %0, %1\nret <8 x float> %res"), NTuple{8,VecElement{Float32}}, Tuple{NTuple{8,VecElement{Float32}},NTuple{8,VecElement{Float32}}}, %74, %74)::NTuple{8,VecElement{Float32}}
│    %95  = %new(VectorizationBase.SVec{8,Float32}, %94)::VectorizationBase.SVec{8,Float32}
│    %96  = Base.llvmcall::Core.IntrinsicFunction
│    %97  = (%96)(("", "%res = fmul fast <8 x float> %0, %1\nret <8 x float> %res"), NTuple{8,VecElement{Float32}}, Tuple{NTuple{8,VecElement{Float32}},NTuple{8,VecElement{Float32}}}, %79, %79)::NTuple{8,VecElement{Float32}}
│    %98  = %new(VectorizationBase.SVec{8,Float32}, %97)::VectorizationBase.SVec{8,Float32}
│    %99  = Base.llvmcall::Core.IntrinsicFunction
│    %100 = (%99)(("", "%res = fmul fast <8 x float> %0, %1\nret <8 x float> %res"), NTuple{8,VecElement{Float32}}, Tuple{NTuple{8,VecElement{Float32}},NTuple{8,VecElement{Float32}}}, %84, %84)::NTuple{8,VecElement{Float32}}
│    %101 = %new(VectorizationBase.SVec{8,Float32}, %100)::VectorizationBase.SVec{8,Float32}
│    %102 = LoopVectorization.vadd(%39, %92)::Any
│    %103 = LoopVectorization.vadd(%40, %95)::Any
│    %104 = LoopVectorization.vadd(%41, %98)::Any
│    %105 = LoopVectorization.vadd(%42, %101)::Any
│    %106 = Base.add_int(%48, 32)::Int64
└───        goto #3
5 ── %108 = Base.slt_int(m, %49)::Bool
└───        goto #7 if not %108
6 ──        goto #15
7 ── %111 = Base.sub_int(m, 8)::Int64
│    %112 = Base.slt_int(%111, %50)::Bool
└───        goto #9 if not %112
8 ── %114 = Base.sub_int(%51, 1)::Int64
│    %115 = Base.llvmcall::Core.IntrinsicFunction
│    %116 = (%115)(("!1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\ndeclare <8 x float> @llvm.masked.load.v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>)", "%typptr = inttoptr i64 %0 to float*\n%offsetptr = getelementptr inbounds float, float* %typptr, i64 %1\n%ptr = bitcast float* %offsetptr to <8 x float>*\n%mask = bitcast i8 %2 to <8 x i1>\n%res = call <8 x float> @llvm.masked.load.v8f32(<8 x float>* %ptr, i32 4, <8 x i1> %mask, <8 x float> zeroinitializer), !alias.scope !3\nret <8 x float> %res"), NTuple{8,VecElement{Float32}}, Tuple{Ptr{Float32},Int64,UInt8}, %9, %114, %17)::NTuple{8,VecElement{Float32}}
│    %117 = %new(VectorizationBase.SVec{8,Float32}, %116)::VectorizationBase.SVec{8,Float32}
│    %118 = LoopVectorization.vadd(%35, %117)::Any
│    %119 = LoopVectorization.vifelse(%18, %118, %35)::Any
│    %120 = Base.llvmcall::Core.IntrinsicFunction
│    %121 = (%120)(("", "%res = fmul fast <8 x float> %0, %1\nret <8 x float> %res"), NTuple{8,VecElement{Float32}}, Tuple{NTuple{8,VecElement{Float32}},NTuple{8,VecElement{Float32}}}, %116, %116)::NTuple{8,VecElement{Float32}}
│    %122 = %new(VectorizationBase.SVec{8,Float32}, %121)::VectorizationBase.SVec{8,Float32}
│    %123 = LoopVectorization.vadd(%39, %122)::Any
│    %124 = LoopVectorization.vifelse(%18, %123, %39)::Any
└───        goto #15
9 ── %126 = Base.sub_int(m, 16)::Int64
│    %127 = Base.slt_int(%126, %52)::Bool
└───        goto #11 if not %127
10 ─ %129 = Base.sub_int(%53, 1)::Int64
│    %130 = Base.llvmcall::Core.IntrinsicFunction
│    %131 = (%130)(("!1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n", "%typptr = inttoptr i64 %0 to float*\n%offsetptr = getelementptr inbounds float, float* %typptr, i64 %1\n%ptr = bitcast float* %offsetptr to <8 x float>*\n%res = load <8 x float>, <8 x float>* %ptr, align 4, !alias.scope !3\nret <8 x float> %res"), NTuple{8,VecElement{Float32}}, Tuple{Ptr{Float32},Int64}, %9, %129)::NTuple{8,VecElement{Float32}}
│    %132 = %new(VectorizationBase.SVec{8,Float32}, %131)::VectorizationBase.SVec{8,Float32}
│    %133 = Base.add_int(8, %54)::Int64
│    %134 = Base.sub_int(%133, 1)::Int64
│    %135 = Base.llvmcall::Core.IntrinsicFunction
│    %136 = (%135)(("!1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\ndeclare <8 x float> @llvm.masked.load.v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>)", "%typptr = inttoptr i64 %0 to float*\n%offsetptr = getelementptr inbounds float, float* %typptr, i64 %1\n%ptr = bitcast float* %offsetptr to <8 x float>*\n%mask = bitcast i8 %2 to <8 x i1>\n%res = call <8 x float> @llvm.masked.load.v8f32(<8 x float>* %ptr, i32 4, <8 x i1> %mask, <8 x float> zeroinitializer), !alias.scope !3\nret <8 x float> %res"), NTuple{8,VecElement{Float32}}, Tuple{Ptr{Float32},Int64,UInt8}, %9, %134, %17)::NTuple{8,VecElement{Float32}}
│    %137 = %new(VectorizationBase.SVec{8,Float32}, %136)::VectorizationBase.SVec{8,Float32}
│    %138 = LoopVectorization.vadd(%35, %132)::Any
│    %139 = LoopVectorization.vadd(%36, %137)::Any
│    %140 = LoopVectorization.vifelse(%18, %139, %36)::Any
│    %141 = Base.llvmcall::Core.IntrinsicFunction
│    %142 = (%141)(("", "%res = fmul fast <8 x float> %0, %1\nret <8 x float> %res"), NTuple{8,VecElement{Float32}}, Tuple{NTuple{8,VecElement{Float32}},NTuple{8,VecElement{Float32}}}, %131, %131)::NTuple{8,VecElement{Float32}}
│    %143 = %new(VectorizationBase.SVec{8,Float32}, %142)::VectorizationBase.SVec{8,Float32}
│    %144 = Base.llvmcall::Core.IntrinsicFunction
│    %145 = (%144)(("", "%res = fmul fast <8 x float> %0, %1\nret <8 x float> %res"), NTuple{8,VecElement{Float32}}, Tuple{NTuple{8,VecElement{Float32}},NTuple{8,VecElement{Float32}}}, %136, %136)::NTuple{8,VecElement{Float32}}
│    %146 = %new(VectorizationBase.SVec{8,Float32}, %145)::VectorizationBase.SVec{8,Float32}
│    %147 = LoopVectorization.vadd(%39, %143)::Any
│    %148 = LoopVectorization.vadd(%40, %146)::Any
│    %149 = LoopVectorization.vifelse(%18, %148, %40)::Any
└───        goto #15
11 ─ %151 = Base.sub_int(m, 24)::Int64
│    %152 = Base.slt_int(%151, %55)::Bool
└───        goto #13 if not %152
12 ─ %154 = Base.sub_int(%56, 1)::Int64
│    %155 = Base.llvmcall::Core.IntrinsicFunction
│    %156 = (%155)(("!1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n", "%typptr = inttoptr i64 %0 to float*\n%offsetptr = getelementptr inbounds float, float* %typptr, i64 %1\n%ptr = bitcast float* %offsetptr to <8 x float>*\n%res = load <8 x float>, <8 x float>* %ptr, align 4, !alias.scope !3\nret <8 x float> %res"), NTuple{8,VecElement{Float32}}, Tuple{Ptr{Float32},Int64}, %9, %154)::NTuple{8,VecElement{Float32}}
│    %157 = %new(VectorizationBase.SVec{8,Float32}, %156)::VectorizationBase.SVec{8,Float32}
│    %158 = Base.add_int(8, %57)::Int64
│    %159 = Base.sub_int(%158, 1)::Int64
│    %160 = Base.llvmcall::Core.IntrinsicFunction
│    %161 = (%160)(("!1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n", "%typptr = inttoptr i64 %0 to float*\n%offsetptr = getelementptr inbounds float, float* %typptr, i64 %1\n%ptr = bitcast float* %offsetptr to <8 x float>*\n%res = load <8 x float>, <8 x float>* %ptr, align 4, !alias.scope !3\nret <8 x float> %res"), NTuple{8,VecElement{Float32}}, Tuple{Ptr{Float32},Int64}, %9, %159)::NTuple{8,VecElement{Float32}}
│    %162 = %new(VectorizationBase.SVec{8,Float32}, %161)::VectorizationBase.SVec{8,Float32}
│    %163 = Base.add_int(16, %58)::Int64
│    %164 = Base.sub_int(%163, 1)::Int64
│    %165 = Base.llvmcall::Core.IntrinsicFunction
│    %166 = (%165)(("!1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\ndeclare <8 x float> @llvm.masked.load.v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>)", "%typptr = inttoptr i64 %0 to float*\n%offsetptr = getelementptr inbounds float, float* %typptr, i64 %1\n%ptr = bitcast float* %offsetptr to <8 x float>*\n%mask = bitcast i8 %2 to <8 x i1>\n%res = call <8 x float> @llvm.masked.load.v8f32(<8 x float>* %ptr, i32 4, <8 x i1> %mask, <8 x float> zeroinitializer), !alias.scope !3\nret <8 x float> %res"), NTuple{8,VecElement{Float32}}, Tuple{Ptr{Float32},Int64,UInt8}, %9, %164, %17)::NTuple{8,VecElement{Float32}}
│    %167 = %new(VectorizationBase.SVec{8,Float32}, %166)::VectorizationBase.SVec{8,Float32}
│    %168 = LoopVectorization.vadd(%35, %157)::Any
│    %169 = LoopVectorization.vadd(%36, %162)::Any
│    %170 = LoopVectorization.vadd(%37, %167)::Any
│    %171 = LoopVectorization.vifelse(%18, %170, %37)::Any
│    %172 = Base.llvmcall::Core.IntrinsicFunction
│    %173 = (%172)(("", "%res = fmul fast <8 x float> %0, %1\nret <8 x float> %res"), NTuple{8,VecElement{Float32}}, Tuple{NTuple{8,VecElement{Float32}},NTuple{8,VecElement{Float32}}}, %156, %156)::NTuple{8,VecElement{Float32}}
│    %174 = %new(VectorizationBase.SVec{8,Float32}, %173)::VectorizationBase.SVec{8,Float32}
│    %175 = Base.llvmcall::Core.IntrinsicFunction
│    %176 = (%175)(("", "%res = fmul fast <8 x float> %0, %1\nret <8 x float> %res"), NTuple{8,VecElement{Float32}}, Tuple{NTuple{8,VecElement{Float32}},NTuple{8,VecElement{Float32}}}, %161, %161)::NTuple{8,VecElement{Float32}}
│    %177 = %new(VectorizationBase.SVec{8,Float32}, %176)::VectorizationBase.SVec{8,Float32}
│    %178 = Base.llvmcall::Core.IntrinsicFunction
│    %179 = (%178)(("", "%res = fmul fast <8 x float> %0, %1\nret <8 x float> %res"), NTuple{8,VecElement{Float32}}, Tuple{NTuple{8,VecElement{Float32}},NTuple{8,VecElement{Float32}}}, %166, %166)::NTuple{8,VecElement{Float32}}
│    %180 = %new(VectorizationBase.SVec{8,Float32}, %179)::VectorizationBase.SVec{8,Float32}
│    %181 = LoopVectorization.vadd(%39, %174)::Any
│    %182 = LoopVectorization.vadd(%40, %177)::Any
│    %183 = LoopVectorization.vadd(%41, %180)::Any
│    %184 = LoopVectorization.vifelse(%18, %183, %41)::Any
└───        goto #15
13 ─ %186 = Base.sub_int(m, 32)::Int64
│    %187 = Base.slt_int(%186, %59)::Bool
└───        goto #15 if not %187
14 ─ %189 = Base.sub_int(%60, 1)::Int64
│    %190 = Base.llvmcall::Core.IntrinsicFunction
│    %191 = (%190)(("!1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n", "%typptr = inttoptr i64 %0 to float*\n%offsetptr = getelementptr inbounds float, float* %typptr, i64 %1\n%ptr = bitcast float* %offsetptr to <8 x float>*\n%res = load <8 x float>, <8 x float>* %ptr, align 4, !alias.scope !3\nret <8 x float> %res"), NTuple{8,VecElement{Float32}}, Tuple{Ptr{Float32},Int64}, %9, %189)::NTuple{8,VecElement{Float32}}
│    %192 = %new(VectorizationBase.SVec{8,Float32}, %191)::VectorizationBase.SVec{8,Float32}
│    %193 = Base.add_int(8, %61)::Int64
│    %194 = Base.sub_int(%193, 1)::Int64
│    %195 = Base.llvmcall::Core.IntrinsicFunction
│    %196 = (%195)(("!1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n", "%typptr = inttoptr i64 %0 to float*\n%offsetptr = getelementptr inbounds float, float* %typptr, i64 %1\n%ptr = bitcast float* %offsetptr to <8 x float>*\n%res = load <8 x float>, <8 x float>* %ptr, align 4, !alias.scope !3\nret <8 x float> %res"), NTuple{8,VecElement{Float32}}, Tuple{Ptr{Float32},Int64}, %9, %194)::NTuple{8,VecElement{Float32}}
│    %197 = %new(VectorizationBase.SVec{8,Float32}, %196)::VectorizationBase.SVec{8,Float32}
│    %198 = Base.add_int(16, %62)::Int64
│    %199 = Base.sub_int(%198, 1)::Int64
│    %200 = Base.llvmcall::Core.IntrinsicFunction
│    %201 = (%200)(("!1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n", "%typptr = inttoptr i64 %0 to float*\n%offsetptr = getelementptr inbounds float, float* %typptr, i64 %1\n%ptr = bitcast float* %offsetptr to <8 x float>*\n%res = load <8 x float>, <8 x float>* %ptr, align 4, !alias.scope !3\nret <8 x float> %res"), NTuple{8,VecElement{Float32}}, Tuple{Ptr{Float32},Int64}, %9, %199)::NTuple{8,VecElement{Float32}}
│    %202 = %new(VectorizationBase.SVec{8,Float32}, %201)::VectorizationBase.SVec{8,Float32}
│    %203 = Base.add_int(24, %63)::Int64
│    %204 = Base.sub_int(%203, 1)::Int64
│    %205 = Base.llvmcall::Core.IntrinsicFunction
│    %206 = (%205)(("!1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\ndeclare <8 x float> @llvm.masked.load.v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>)", "%typptr = inttoptr i64 %0 to float*\n%offsetptr = getelementptr inbounds float, float* %typptr, i64 %1\n%ptr = bitcast float* %offsetptr to <8 x float>*\n%mask = bitcast i8 %2 to <8 x i1>\n%res = call <8 x float> @llvm.masked.load.v8f32(<8 x float>* %ptr, i32 4, <8 x i1> %mask, <8 x float> zeroinitializer), !alias.scope !3\nret <8 x float> %res"), NTuple{8,VecElement{Float32}}, Tuple{Ptr{Float32},Int64,UInt8}, %9, %204, %17)::NTuple{8,VecElement{Float32}}
│    %207 = %new(VectorizationBase.SVec{8,Float32}, %206)::VectorizationBase.SVec{8,Float32}
│    %208 = LoopVectorization.vadd(%35, %192)::Any
│    %209 = LoopVectorization.vadd(%36, %197)::Any
│    %210 = LoopVectorization.vadd(%37, %202)::Any
│    %211 = LoopVectorization.vadd(%38, %207)::Any
│    %212 = LoopVectorization.vifelse(%18, %211, %38)::Any
│    %213 = Base.llvmcall::Core.IntrinsicFunction
│    %214 = (%213)(("", "%res = fmul fast <8 x float> %0, %1\nret <8 x float> %res"), NTuple{8,VecElement{Float32}}, Tuple{NTuple{8,VecElement{Float32}},NTuple{8,VecElement{Float32}}}, %191, %191)::NTuple{8,VecElement{Float32}}
│    %215 = %new(VectorizationBase.SVec{8,Float32}, %214)::VectorizationBase.SVec{8,Float32}
│    %216 = Base.llvmcall::Core.IntrinsicFunction
│    %217 = (%216)(("", "%res = fmul fast <8 x float> %0, %1\nret <8 x float> %res"), NTuple{8,VecElement{Float32}}, Tuple{NTuple{8,VecElement{Float32}},NTuple{8,VecElement{Float32}}}, %196, %196)::NTuple{8,VecElement{Float32}}
│    %218 = %new(VectorizationBase.SVec{8,Float32}, %217)::VectorizationBase.SVec{8,Float32}
│    %219 = Base.llvmcall::Core.IntrinsicFunction
│    %220 = (%219)(("", "%res = fmul fast <8 x float> %0, %1\nret <8 x float> %res"), NTuple{8,VecElement{Float32}}, Tuple{NTuple{8,VecElement{Float32}},NTuple{8,VecElement{Float32}}}, %201, %201)::NTuple{8,VecElement{Float32}}
│    %221 = %new(VectorizationBase.SVec{8,Float32}, %220)::VectorizationBase.SVec{8,Float32}
│    %222 = Base.llvmcall::Core.IntrinsicFunction
│    %223 = (%222)(("", "%res = fmul fast <8 x float> %0, %1\nret <8 x float> %res"), NTuple{8,VecElement{Float32}}, Tuple{NTuple{8,VecElement{Float32}},NTuple{8,VecElement{Float32}}}, %206, %206)::NTuple{8,VecElement{Float32}}
│    %224 = %new(VectorizationBase.SVec{8,Float32}, %223)::VectorizationBase.SVec{8,Float32}
│    %225 = LoopVectorization.vadd(%39, %215)::Any
│    %226 = LoopVectorization.vadd(%40, %218)::Any
│    %227 = LoopVectorization.vadd(%41, %221)::Any
│    %228 = LoopVectorization.vadd(%42, %224)::Any
└─── %229 = LoopVectorization.vifelse(%18, %228, %42)::Any
15 ┄ %230 = φ (#6 => %35, #8 => %119, #10 => %138, #12 => %168, #14 => %208, #13 => %35)::Any
│    %231 = φ (#6 => %36, #8 => %36, #10 => %140, #12 => %169, #14 => %209, #13 => %36)::Any
│    %232 = φ (#6 => %37, #8 => %37, #10 => %37, #12 => %171, #14 => %210, #13 => %37)::Any
│    %233 = φ (#6 => %38, #8 => %38, #10 => %38, #12 => %38, #14 => %212, #13 => %38)::Any
│    %234 = φ (#6 => %39, #8 => %124, #10 => %147, #12 => %181, #14 => %225, #13 => %39)::Any
│    %235 = φ (#6 => %40, #8 => %40, #10 => %149, #12 => %182, #14 => %226, #13 => %40)::Any
│    %236 = φ (#6 => %41, #8 => %41, #10 => %41, #12 => %184, #14 => %227, #13 => %41)::Any
│    %237 = φ (#6 => %42, #8 => %42, #10 => %42, #12 => %42, #14 => %229, #13 => %42)::Any
│    %238 = LoopVectorization.evadd(%230, %232)::Any
│    %239 = LoopVectorization.evadd(%231, %233)::Any
│    %240 = LoopVectorization.evadd(%238, %239)::Any
│    %241 = LoopVectorization.evadd(%234, %236)::Any
│    %242 = LoopVectorization.evadd(%235, %237)::Any
│    %243 = LoopVectorization.evadd(%241, %242)::Any
│    %244 = LoopVectorization.extract_data(%240)::Any
│    %245 = LoopVectorization.extract_data(%243)::Any
└───        goto #16
16 ─        $(Expr(:gc_preserve_end, :(%11)))
│    %248 = LoopVectorization.reduced_add::Core.Compiler.Const(SIMDPirates.reduced_add, false)
│    %249 = (isa)(%244, Float32)::Bool
└───        goto #18 if not %249
17 ─ %251 = π (%244, Float32)
│           invoke %248(%251::Float32, 0.0f0::Float32)
└───        goto #19
18 ─        LoopVectorization.reduced_add(%244, 0.0f0)
└───        goto #19
19 ┄ %256 = LoopVectorization.reduced_add::Core.Compiler.Const(SIMDPirates.reduced_add, false)
│    %257 = (isa)(%245, Float32)::Bool
└───        goto #21 if not %257
20 ─ %259 = π (%245, Float32)
│           invoke %256(%259::Float32, 0.0f0::Float32)
└───        goto #22
21 ─        LoopVectorization.reduced_add(%245, 0.0f0)
└───        goto #22
22 ┄ %264 = Core.tuple(%7, %8)::Tuple{Array{Float32,1},Array{Float32,1}}
└───        return %264
23 ─ %266 = %new(Core.AssertionError, "length(x) >= m")::AssertionError
│           Base.throw(%266)
└───        $(Expr(:unreachable))

Select a call to descend into or ↩ to ascend. [q]uit.
Toggles: [o]ptimize, [w]arn, [d]ebuginfo, [s]yntax highlight for Source/LLVM/Native.
Show: [S]ource code, [A]ST, [L]LVM IR, [N]ative code
Advanced: dump [P]arams cache.

 • %19  = invoke vzero(::Type{NTuple{8,VecElement{Float32}}})::Any
   %20  = call Type(::Any)::Union{Unsigned, VectorizationBase.SVec{_A,_B} where _B where _A}
   %21  = invoke vzero(::Type{NTuple{8,VecElement{Float32}}})::Any
   %22  = call Type(::Any)::Union{Unsigned, VectorizationBase.SVec{_A,_B} where _B where _A}
   %23  = invoke vzero(::Type{NTuple{8,VecElement{Float32}}})::Any
   %24  = call Type(::Any)::Union{Unsigned, VectorizationBase.SVec{_A,_B} where _B where _A}
   %25  = invoke vzero(::Type{NTuple{8,VecElement{Float32}}})::Any
   %26  = call Type(::Any)::Union{Unsigned, VectorizationBase.SVec{_A,_B} where _B where _A}
   %27  = invoke vzero(::Type{NTuple{8,VecElement{Float32}}})::Any
v  %28  = call Type(::Any)::Union{Unsigned, VectorizationBase.SVec{_A,_B} where _B where _A}

chriselrod · 2020-05-12T14:09:10Z

I don't seem to get a call to `_avx_!`; here are the options:

I copied and pasted the call with return type ::Tuple{Any,Any} from your bad warntype, but it seems like descend took you there anyway.

This is weird:

%19  = invoke VectorizationBase.vzero(NTuple{8,VecElement{Float32}}::Type{NTuple{8,VecElement{Float32}}})::Any

Could you try:

julia> using LoopVectorization: VectorizationBase

julia> @code_warntype VectorizationBase.vzero(NTuple{8,VecElement{Float32}})
Variables
  #self#::Core.Compiler.Const(VectorizationBase.vzero, false)
  #unused#::Core.Compiler.Const(NTuple{8,VecElement{Float32}}, false)

Body::NTuple{8,VecElement{Float32}}
1 ─      nothing
│   %2 = Base.llvmcall::Core.Compiler.Const(Core.Intrinsics.llvmcall, false)
│   %3 = Core.apply_type(VectorizationBase.Vec, 8, Float32)::Core.Compiler.Const(NTuple{8,VecElement{Float32}}, false)
│   %4 = Core.apply_type(VectorizationBase.Tuple)::Core.Compiler.Const(Tuple{}, false)
│   %5 = (%2)("ret <8 x float> zeroinitializer\n", %3, %4)::NTuple{8,VecElement{Float32}}
└──      return %5

julia> @which VectorizationBase.vzero(NTuple{8,VecElement{Float32}})
vzero(::Type{Tuple{Vararg{VecElement{T},W}}}) where {W, T} in VectorizationBase at /home/chriselrod/.julia/dev/VectorizationBase/src/VectorizationBase.jl:119

baggepinnen · 2020-05-12T14:11:47Z

julia> using LoopVectorization: VectorizationBase

julia> @code_warntype VectorizationBase.vzero(NTuple{8,VecElement{Float32}})
Variables
  #self#::Core.Compiler.Const(VectorizationBase.vzero, false)
  #unused#::Core.Compiler.Const(NTuple{8,VecElement{Float32}}, false)

Body::NTuple{8,VecElement{Float32}}
1 ─      nothing
│   %2 = Base.llvmcall::Core.Compiler.Const(Core.Intrinsics.llvmcall, false)
│   %3 = Core.apply_type(VectorizationBase.Vec, 8, Float32)::Core.Compiler.Const(NTuple{8,VecElement{Float32}}, false)
│   %4 = Core.apply_type(VectorizationBase.Tuple)::Core.Compiler.Const(Tuple{}, false)
│   %5 = (%2)("ret <8 x float> zeroinitializer\n", %3, %4)::NTuple{8,VecElement{Float32}}
└──      return %5

julia> @which VectorizationBase.vzero(NTuple{8,VecElement{Float32}})
vzero(::Type{Tuple{Vararg{VecElement{T},W}}}) where {W, T} in VectorizationBase at /home/fredrikb/.julia/packages/VectorizationBase/WPwnU/src/VectorizationBase.jl:119

chriselrod · 2020-05-12T14:12:38Z

Inside your function, it was ::Any, but by itself it infers correctly?

I don't know what to make of that.

julia> foo(::Type{T}) where {T} = VectorizationBase.vzero(T)
foo (generic function with 1 method)

julia> @code_warntype foo(NTuple{8,VecElement{Float32}})
Variables
  #self#::Core.Compiler.Const(foo, false)
  #unused#::Core.Compiler.Const(NTuple{8,VecElement{Float32}}, false)

Body::NTuple{8,VecElement{Float32}}
1 ─ %1 = VectorizationBase.vzero::Core.Compiler.Const(VectorizationBase.vzero, false)
│   %2 = (%1)($(Expr(:static_parameter, 1)))::NTuple{8,VecElement{Float32}}
└──      return %2


julia> bar(T) = VectorizationBase.vzero(T)
bar (generic function with 1 method)

julia> @code_warntype bar(NTuple{8,VecElement{Float32}})
Variables
  #self#::Core.Compiler.Const(bar, false)
  T::Core.Compiler.Const(NTuple{8,VecElement{Float32}}, false)

Body::NTuple{8,VecElement{Float32}}
1 ─ %1 = VectorizationBase.vzero::Core.Compiler.Const(VectorizationBase.vzero, false)
│   %2 = (%1)(T)::NTuple{8,VecElement{Float32}}
└──      return %2

Could you try and get a minimal example?

baggepinnen · 2020-05-12T14:16:28Z

Hmm, me neither :/
I'm not sure whether it's related to other recent inference issues. I experienced similarly strange behavior in
JuliaLang/julia#35537
and there is another one in
JuliaLang/julia#35800

baggepinnen · 2020-05-12T14:16:50Z

I'll see how minimal I can make the example

chriselrod · 2020-05-12T14:22:05Z

Thanks for the linked issues.
This would be an example without broadcast or mapfold machinery.

I can reproduce your inference failures in those issues, but not this one.
One difference between versions we're using is that my 1.4.1 is a 32-bit build (for debugging 32-bit Appveyor locally).

baggepinnen · 2020-05-12T14:30:25Z

In this example, if I hard-code 3 as the loop length instead of using m (as written below), the problem goes away:

using LoopVectorization
# Minimal reproducer for the inference problem discussed in this thread.
# Sums elements of `x` into `s` inside an `@avx` loop.
# In this variant the loop bound is the literal 3 and `m` is unused; per the
# surrounding discussion, using `m` as the loop length is what makes inference fail.
function f(x::AbstractArray{T}, m) where T
    # Accumulator starts as the zero of the array's element type.
    s = zero(T)
    @avx for i = 1:3
        s  += x[i]
    end
end

@code_warntype f(randn(Float32, 10), 3)

With m as the loop length, I get

Variables
  #self#::Core.Compiler.Const(f, false)
  x::Array{Float32,1}
  m::Int64
  s_0::Any
  val::Union{}
  vptr##_x::VectorizationBase.PackedStridedPointer{Float32,0}
  @_7::Any
  s::Any
  @_9::Union{}
  i::Union{}

Body::Any
1 ─       Core.NewvarNode(:(s_0))
│         Core.NewvarNode(:(val))
│         Core.NewvarNode(:(vptr##_x))
│         Core.NewvarNode(:(@_7))
│         Core.NewvarNode(:(@_9))
│         (s = Main.zero($(Expr(:static_parameter, 1))))
│   %7  = LoopVectorization.check_args(x)::Core.Compiler.Const(true, false)
│         %7
│         (vptr##_x = LoopVectorization.stridedpointer(x))
│   %10 = $(Expr(:gc_preserve_begin, :(x)))
│   %11 = Core.apply_type(Main.Val, (0, 0, 0))::Core.Compiler.Const(Val{(0, 0, 0)}, false)
│   %12 = (%11)()::Core.Compiler.Const(Val{(0, 0, 0)}(), false)
│   %13 = Core.apply_type(Main.Tuple, :LoopVectorization, :LOOPCONSTANTINSTRUCTION, LoopVectorization.OperationStruct(0x0000000000000000, 0x0000000000000000, 0x0000000000000001, 0x0000000000000000, LoopVectorization.constant, 0x00, 0x01), :LoopVectorization, :getindex, LoopVectorization.OperationStruct(0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, LoopVectorization.memload, 0x01, 0x02), :LoopVectorization, :vadd, LoopVectorization.OperationStruct(0x0000000000000001, 0x0000000000000001, 0x0000000000000000, 0x0000000000000102, LoopVectorization.compute, 0x00, 0x01))::Core.Compiler.Const(Tuple{:LoopVectorization,:LOOPCONSTANTINSTRUCTION,LoopVectorization.OperationStruct(0x0000000000000000, 0x0000000000000000, 0x0000000000000001, 0x0000000000000000, LoopVectorization.constant, 0x00, 0x01),:LoopVectorization,:getindex,LoopVectorization.OperationStruct(0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, LoopVectorization.memload, 0x01, 0x02),:LoopVectorization,:vadd,LoopVectorization.OperationStruct(0x0000000000000001, 0x0000000000000001, 0x0000000000000000, 0x0000000000000102, LoopVectorization.compute, 0x00, 0x01)}, false)
│   %14 = Core.apply_type(Main.Tuple, LoopVectorization.ArrayRefStruct{:x,Symbol("##vptr##_x")}(0x0000000000000001, 0x0000000000000001, 0x0000000000000000))::Core.Compiler.Const(Tuple{LoopVectorization.ArrayRefStruct{:x,Symbol("##vptr##_x")}(0x0000000000000001, 0x0000000000000001, 0x0000000000000000)}, false)
│   %15 = Core.apply_type(Main.Tuple, 3)::Core.Compiler.Const(Tuple{3}, false)
│   %16 = Core.apply_type(Main.Tuple, 1)::Core.Compiler.Const(Tuple{1}, false)
│   %17 = Core.apply_type(Main.Tuple)::Core.Compiler.Const(Tuple{}, false)
│   %18 = Core.apply_type(Main.Tuple)::Core.Compiler.Const(Tuple{}, false)
│   %19 = Core.apply_type(Main.Tuple)::Core.Compiler.Const(Tuple{}, false)
│   %20 = Core.apply_type(Main.Tuple)::Core.Compiler.Const(Tuple{}, false)
│   %21 = Core.apply_type(Main.Tuple, 0, %15, %16, %17, %18, %19, %20)::Core.Compiler.Const(Tuple{0,Tuple{3},Tuple{1},Tuple{},Tuple{},Tuple{},Tuple{}}, false)
│   %22 = Core.apply_type(Main.Tuple, :i)::Core.Compiler.Const(Tuple{:i}, false)
│   %23 = Core.apply_type(LoopVectorization.StaticLowerUnitRange, 1)::Core.Compiler.Const(VectorizationBase.StaticLowerUnitRange{1}, false)
│   %24 = (%23)(m)::VectorizationBase.StaticLowerUnitRange{1}
│   %25 = Core.tuple(%24)::Tuple{VectorizationBase.StaticLowerUnitRange{1}}
│   %26 = vptr##_x::VectorizationBase.PackedStridedPointer{Float32,0}
│   %27 = LoopVectorization._avx_!(%12, %13, %14, %21, %22, %25, %26, s::Core.Compiler.Const(0.0f0, false))::Any
│         (s_0 = %27)
│         (@_7 = %27)
│         $(Expr(:gc_preserve_end, :(%10)))
│         @_7
│   %32 = LoopVectorization.reduced_add(s_0, s::Core.Compiler.Const(0.0f0, false))::Any
│         (s = %32)
└──       return %32
2 ─       $(Expr(:inbounds, true))
│         Core.Compiler.Const(:(1:m), false)
│         Core.Compiler.Const(:(@_9 = Base.iterate(%36)), false)
│         Core.Compiler.Const(:(@_9 === nothing), false)
│         Core.Compiler.Const(:(Base.not_int(%38)), false)
│         Core.Compiler.Const(:(%39), false)
│         Core.Compiler.Const(:(@_9), false)
│         Core.Compiler.Const(:(i = Core.getfield(%41, 1)), false)
│         Core.Compiler.Const(:(Core.getfield(%41, 2)), false)
│         Core.Compiler.Const(:(Base.FastMath), false)
│         Core.Compiler.Const(:(Base.getproperty(%44, :add_fast)), false)
│         Core.Compiler.Const(:(s), false)
│         Core.Compiler.Const(:(Base.getindex(x, i)), false)
│         Core.Compiler.Const(:(s = (%45)(%46, %47)), false)
│         Core.Compiler.Const(:(@_9 = Base.iterate(%36, %43)), false)
│         Core.Compiler.Const(:(@_9 === nothing), false)
│         Core.Compiler.Const(:(Base.not_int(%50)), false)
│         Core.Compiler.Const(:(%51), false)
│         Core.Compiler.Const(:(goto %41), false)
│         Core.Compiler.Const(:(val = nothing), false)
│         $(Expr(:inbounds, :pop))
└──       Core.Compiler.Const(:(return val), false)

chriselrod · 2020-05-12T14:34:52Z

%27 = LoopVectorization._avx_!(%12, %13, %14, %21, %22, %25, %26, s::Core.Compiler.Const(0.0f0, false))::Any

This doesn't help me as much as the @descend_code_warntype output, but I assume vzero is the problem.
That makes me wonder, though, why it isn't a problem when m is hard-coded as 3.

baggepinnen · 2020-05-12T14:38:35Z

Here's the descent

│ ─ %-1  = invoke f(::Array{Float32,1},::Int64)::Any
Body::Any
1 ── %1   = $(Expr(:foreigncall, :(:jl_array_ptr), Ptr{Float32}, svec(Any), 0, :(:ccall), :(x)))::Ptr{Float32}
│           Base.arraysize(x, 1)
│    %3   = $(Expr(:gc_preserve_begin, :(x)))
│    %4   = Base.trunc_int(UInt8, m)::UInt8
│    %5   = Base.sub_int(%4, 0x01)::UInt8
│    %6   = Base.and_int(%5, 0x07)::UInt8
│    %7   = Base.add_int(%6, 0x01)::UInt8
│    %8   = Base.sub_int(0x08, %7)::UInt8
│    %9   = Base.lshr_int(0xff, %8)::UInt8
│    %10  = %new(VectorizationBase.Mask{8,UInt8}, %9)::VectorizationBase.Mask{8,UInt8}
│    %11  = invoke VectorizationBase.vzero(NTuple{8,VecElement{Float32}}::Type{NTuple{8,VecElement{Float32}}})::Any
│    %12  = VectorizationBase.SVec(%11)::Union{Unsigned, VectorizationBase.SVec{_A,_B} where _B where _A}
│    %13  = invoke VectorizationBase.vzero(NTuple{8,VecElement{Float32}}::Type{NTuple{8,VecElement{Float32}}})::Any
│    %14  = VectorizationBase.SVec(%13)::Union{Unsigned, VectorizationBase.SVec{_A,_B} where _B where _A}
│    %15  = invoke VectorizationBase.vzero(NTuple{8,VecElement{Float32}}::Type{NTuple{8,VecElement{Float32}}})::Any
│    %16  = VectorizationBase.SVec(%15)::Union{Unsigned, VectorizationBase.SVec{_A,_B} where _B where _A}
│    %17  = invoke VectorizationBase.vzero(NTuple{8,VecElement{Float32}}::Type{NTuple{8,VecElement{Float32}}})::Any
│    %18  = VectorizationBase.SVec(%17)::Union{Unsigned, VectorizationBase.SVec{_A,_B} where _B where _A}
│    %19  = Base.slt_int(m, 64)::Bool
│    %20  = Base.not_int(%19)::Bool
└───        goto #6 if not %20
2 ── %22  = invoke VectorizationBase.vzero(NTuple{8,VecElement{Float32}}::Type{NTuple{8,VecElement{Float32}}})::Any
│    %23  = VectorizationBase.SVec(%22)::Union{Unsigned, VectorizationBase.SVec{_A,_B} where _B where _A}
│    %24  = invoke VectorizationBase.vzero(NTuple{8,VecElement{Float32}}::Type{NTuple{8,VecElement{Float32}}})::Any
│    %25  = VectorizationBase.SVec(%24)::Union{Unsigned, VectorizationBase.SVec{_A,_B} where _B where _A}
│    %26  = invoke VectorizationBase.vzero(NTuple{8,VecElement{Float32}}::Type{NTuple{8,VecElement{Float32}}})::Any
│    %27  = VectorizationBase.SVec(%26)::Union{Unsigned, VectorizationBase.SVec{_A,_B} where _B where _A}
│    %28  = invoke VectorizationBase.vzero(NTuple{8,VecElement{Float32}}::Type{NTuple{8,VecElement{Float32}}})::Any
└─── %29  = VectorizationBase.SVec(%28)::Union{Unsigned, VectorizationBase.SVec{_A,_B} where _B where _A}
3 ┄─ %30  = φ (#2 => %12, #4 => %111)::Any
│    %31  = φ (#2 => %14, #4 => %112)::Any
│    %32  = φ (#2 => %16, #4 => %113)::Any
│    %33  = φ (#2 => %18, #4 => %114)::Any
│    %34  = φ (#2 => 1, #4 => %119)::Int64
│    %35  = φ (#2 => 1, #4 => %119)::Int64
│    %36  = φ (#2 => 1, #4 => %119)::Int64
│    %37  = φ (#2 => 1, #4 => %119)::Int64
│    %38  = φ (#2 => 1, #4 => %119)::Int64
│    %39  = φ (#2 => 1, #4 => %119)::Int64
│    %40  = φ (#2 => 1, #4 => %119)::Int64
│    %41  = φ (#2 => 1, #4 => %119)::Int64
│    %42  = φ (#2 => 1, #4 => %119)::Int64
│    %43  = φ (#2 => 1, #4 => %119)::Int64
│    %44  = φ (#2 => 1, #4 => %119)::Int64
│    %45  = φ (#2 => 1, #4 => %119)::Int64
│    %46  = φ (#2 => 1, #4 => %119)::Int64
│    %47  = φ (#2 => 1, #4 => %119)::Int64
│    %48  = φ (#2 => 1, #4 => %119)::Int64
│    %49  = φ (#2 => 1, #4 => %119)::Int64
│    %50  = φ (#2 => 1, #4 => %119)::Int64
│    %51  = φ (#2 => 1, #4 => %119)::Int64
│    %52  = φ (#2 => 1, #4 => %119)::Int64
│    %53  = φ (#2 => 1, #4 => %119)::Int64
│    %54  = φ (#2 => 1, #4 => %119)::Int64
│    %55  = φ (#2 => 1, #4 => %119)::Int64
│    %56  = φ (#2 => 1, #4 => %119)::Int64
│    %57  = φ (#2 => 1, #4 => %119)::Int64
│    %58  = φ (#2 => 1, #4 => %119)::Int64
│    %59  = φ (#2 => 1, #4 => %119)::Int64
│    %60  = φ (#2 => 1, #4 => %119)::Int64
│    %61  = φ (#2 => 1, #4 => %119)::Int64
│    %62  = φ (#2 => 1, #4 => %119)::Int64
│    %63  = φ (#2 => 1, #4 => %119)::Int64
│    %64  = φ (#2 => 1, #4 => %119)::Int64
│    %65  = φ (#2 => %23, #4 => %115)::Any
│    %66  = φ (#2 => %25, #4 => %116)::Any
│    %67  = φ (#2 => %27, #4 => %117)::Any
│    %68  = φ (#2 => %29, #4 => %118)::Any
│    %69  = Base.sub_int(m, 62)::Int64
│    %70  = Base.slt_int(%34, %69)::Bool
└───        goto #5 if not %70
4 ── %72  = Base.sub_int(%35, 1)::Int64
│    %73  = Base.llvmcall::Core.IntrinsicFunction
│    %74  = (%73)(("!1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n", "%typptr = inttoptr i64 %0 to float*\n%offsetptr = getelementptr inbounds float, float* %typptr, i64 %1\n%ptr = bitcast float* %offsetptr to <8 x float>*\n%res = load <8 x float>, <8 x float>* %ptr, align 4, !alias.scope !3\nret <8 x float> %res"), NTuple{8,VecElement{Float32}}, Tuple{Ptr{Float32},Int64}, %1, %72)::NTuple{8,VecElement{Float32}}
│    %75  = %new(VectorizationBase.SVec{8,Float32}, %74)::VectorizationBase.SVec{8,Float32}
│    %76  = Base.add_int(8, %36)::Int64
│    %77  = Base.sub_int(%76, 1)::Int64
│    %78  = Base.llvmcall::Core.IntrinsicFunction
│    %79  = (%78)(("!1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n", "%typptr = inttoptr i64 %0 to float*\n%offsetptr = getelementptr inbounds float, float* %typptr, i64 %1\n%ptr = bitcast float* %offsetptr to <8 x float>*\n%res = load <8 x float>, <8 x float>* %ptr, align 4, !alias.scope !3\nret <8 x float> %res"), NTuple{8,VecElement{Float32}}, Tuple{Ptr{Float32},Int64}, %1, %77)::NTuple{8,VecElement{Float32}}
│    %80  = %new(VectorizationBase.SVec{8,Float32}, %79)::VectorizationBase.SVec{8,Float32}
│    %81  = Base.add_int(16, %37)::Int64
│    %82  = Base.sub_int(%81, 1)::Int64
│    %83  = Base.llvmcall::Core.IntrinsicFunction
│    %84  = (%83)(("!1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n", "%typptr = inttoptr i64 %0 to float*\n%offsetptr = getelementptr inbounds float, float* %typptr, i64 %1\n%ptr = bitcast float* %offsetptr to <8 x float>*\n%res = load <8 x float>, <8 x float>* %ptr, align 4, !alias.scope !3\nret <8 x float> %res"), NTuple{8,VecElement{Float32}}, Tuple{Ptr{Float32},Int64}, %1, %82)::NTuple{8,VecElement{Float32}}
│    %85  = %new(VectorizationBase.SVec{8,Float32}, %84)::VectorizationBase.SVec{8,Float32}
│    %86  = Base.add_int(24, %38)::Int64
│    %87  = Base.sub_int(%86, 1)::Int64
│    %88  = Base.llvmcall::Core.IntrinsicFunction
│    %89  = (%88)(("!1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n", "%typptr = inttoptr i64 %0 to float*\n%offsetptr = getelementptr inbounds float, float* %typptr, i64 %1\n%ptr = bitcast float* %offsetptr to <8 x float>*\n%res = load <8 x float>, <8 x float>* %ptr, align 4, !alias.scope !3\nret <8 x float> %res"), NTuple{8,VecElement{Float32}}, Tuple{Ptr{Float32},Int64}, %1, %87)::NTuple{8,VecElement{Float32}}
│    %90  = %new(VectorizationBase.SVec{8,Float32}, %89)::VectorizationBase.SVec{8,Float32}
│    %91  = Base.add_int(32, %39)::Int64
│    %92  = Base.sub_int(%91, 1)::Int64
│    %93  = Base.llvmcall::Core.IntrinsicFunction
│    %94  = (%93)(("!1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n", "%typptr = inttoptr i64 %0 to float*\n%offsetptr = getelementptr inbounds float, float* %typptr, i64 %1\n%ptr = bitcast float* %offsetptr to <8 x float>*\n%res = load <8 x float>, <8 x float>* %ptr, align 4, !alias.scope !3\nret <8 x float> %res"), NTuple{8,VecElement{Float32}}, Tuple{Ptr{Float32},Int64}, %1, %92)::NTuple{8,VecElement{Float32}}
│    %95  = %new(VectorizationBase.SVec{8,Float32}, %94)::VectorizationBase.SVec{8,Float32}
│    %96  = Base.add_int(40, %40)::Int64
│    %97  = Base.sub_int(%96, 1)::Int64
│    %98  = Base.llvmcall::Core.IntrinsicFunction
│    %99  = (%98)(("!1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n", "%typptr = inttoptr i64 %0 to float*\n%offsetptr = getelementptr inbounds float, float* %typptr, i64 %1\n%ptr = bitcast float* %offsetptr to <8 x float>*\n%res = load <8 x float>, <8 x float>* %ptr, align 4, !alias.scope !3\nret <8 x float> %res"), NTuple{8,VecElement{Float32}}, Tuple{Ptr{Float32},Int64}, %1, %97)::NTuple{8,VecElement{Float32}}
│    %100 = %new(VectorizationBase.SVec{8,Float32}, %99)::VectorizationBase.SVec{8,Float32}
│    %101 = Base.add_int(48, %41)::Int64
│    %102 = Base.sub_int(%101, 1)::Int64
│    %103 = Base.llvmcall::Core.IntrinsicFunction
│    %104 = (%103)(("!1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n", "%typptr = inttoptr i64 %0 to float*\n%offsetptr = getelementptr inbounds float, float* %typptr, i64 %1\n%ptr = bitcast float* %offsetptr to <8 x float>*\n%res = load <8 x float>, <8 x float>* %ptr, align 4, !alias.scope !3\nret <8 x float> %res"), NTuple{8,VecElement{Float32}}, Tuple{Ptr{Float32},Int64}, %1, %102)::NTuple{8,VecElement{Float32}}
│    %105 = %new(VectorizationBase.SVec{8,Float32}, %104)::VectorizationBase.SVec{8,Float32}
│    %106 = Base.add_int(56, %42)::Int64
│    %107 = Base.sub_int(%106, 1)::Int64
│    %108 = Base.llvmcall::Core.IntrinsicFunction
│    %109 = (%108)(("!1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n", "%typptr = inttoptr i64 %0 to float*\n%offsetptr = getelementptr inbounds float, float* %typptr, i64 %1\n%ptr = bitcast float* %offsetptr to <8 x float>*\n%res = load <8 x float>, <8 x float>* %ptr, align 4, !alias.scope !3\nret <8 x float> %res"), NTuple{8,VecElement{Float32}}, Tuple{Ptr{Float32},Int64}, %1, %107)::NTuple{8,VecElement{Float32}}
│    %110 = %new(VectorizationBase.SVec{8,Float32}, %109)::VectorizationBase.SVec{8,Float32}
│    %111 = LoopVectorization.vadd(%30, %75)::Any
│    %112 = LoopVectorization.vadd(%31, %80)::Any
│    %113 = LoopVectorization.vadd(%32, %85)::Any
│    %114 = LoopVectorization.vadd(%33, %90)::Any
│    %115 = LoopVectorization.vadd(%65, %95)::Any
│    %116 = LoopVectorization.vadd(%66, %100)::Any
│    %117 = LoopVectorization.vadd(%67, %105)::Any
│    %118 = LoopVectorization.vadd(%68, %110)::Any
│    %119 = Base.add_int(%43, 64)::Int64
└───        goto #3
5 ── %121 = LoopVectorization.evadd(%30, %65)::Any
│    %122 = LoopVectorization.evadd(%31, %66)::Any
│    %123 = LoopVectorization.evadd(%32, %67)::Any
└─── %124 = LoopVectorization.evadd(%33, %68)::Any
6 ┄─ %125 = φ (#5 => %121, #1 => %12)::Any
│    %126 = φ (#5 => %122, #1 => %14)::Any
│    %127 = φ (#5 => %123, #1 => %16)::Any
│    %128 = φ (#5 => %124, #1 => %18)::Any
│    %129 = φ (#5 => %44, #1 => 1)::Int64
│    %130 = φ (#5 => %45, #1 => 1)::Int64
│    %131 = φ (#5 => %46, #1 => 1)::Int64
│    %132 = φ (#5 => %47, #1 => 1)::Int64
│    %133 = φ (#5 => %48, #1 => 1)::Int64
│    %134 = φ (#5 => %49, #1 => 1)::Int64
│    %135 = φ (#5 => %50, #1 => 1)::Int64
│    %136 = φ (#5 => %51, #1 => 1)::Int64
│    %137 = φ (#5 => %52, #1 => 1)::Int64
│    %138 = φ (#5 => %53, #1 => 1)::Int64
│    %139 = φ (#5 => %54, #1 => 1)::Int64
│    %140 = φ (#5 => %55, #1 => 1)::Int64
│    %141 = φ (#5 => %56, #1 => 1)::Int64
│    %142 = φ (#5 => %57, #1 => 1)::Int64
│    %143 = φ (#5 => %58, #1 => 1)::Int64
│    %144 = φ (#5 => %59, #1 => 1)::Int64
│    %145 = φ (#5 => %60, #1 => 1)::Int64
│    %146 = φ (#5 => %61, #1 => 1)::Int64
│    %147 = φ (#5 => %62, #1 => 1)::Int64
│    %148 = φ (#5 => %63, #1 => 1)::Int64
│    %149 = φ (#5 => %64, #1 => 1)::Int64
│    %150 = Base.sub_int(m, 30)::Int64
│    %151 = Base.slt_int(%129, %150)::Bool
└───        goto #8 if not %151
7 ── %153 = Base.sub_int(%130, 1)::Int64
│    %154 = Base.llvmcall::Core.IntrinsicFunction
│    %155 = (%154)(("!1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n", "%typptr = inttoptr i64 %0 to float*\n%offsetptr = getelementptr inbounds float, float* %typptr, i64 %1\n%ptr = bitcast float* %offsetptr to <8 x float>*\n%res = load <8 x float>, <8 x float>* %ptr, align 4, !alias.scope !3\nret <8 x float> %res"), NTuple{8,VecElement{Float32}}, Tuple{Ptr{Float32},Int64}, %1, %153)::NTuple{8,VecElement{Float32}}
│    %156 = %new(VectorizationBase.SVec{8,Float32}, %155)::VectorizationBase.SVec{8,Float32}
│    %157 = Base.add_int(8, %131)::Int64
│    %158 = Base.sub_int(%157, 1)::Int64
│    %159 = Base.llvmcall::Core.IntrinsicFunction
│    %160 = (%159)(("!1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n", "%typptr = inttoptr i64 %0 to float*\n%offsetptr = getelementptr inbounds float, float* %typptr, i64 %1\n%ptr = bitcast float* %offsetptr to <8 x float>*\n%res = load <8 x float>, <8 x float>* %ptr, align 4, !alias.scope !3\nret <8 x float> %res"), NTuple{8,VecElement{Float32}}, Tuple{Ptr{Float32},Int64}, %1, %158)::NTuple{8,VecElement{Float32}}
│    %161 = %new(VectorizationBase.SVec{8,Float32}, %160)::VectorizationBase.SVec{8,Float32}
│    %162 = Base.add_int(16, %132)::Int64
│    %163 = Base.sub_int(%162, 1)::Int64
│    %164 = Base.llvmcall::Core.IntrinsicFunction
│    %165 = (%164)(("!1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n", "%typptr = inttoptr i64 %0 to float*\n%offsetptr = getelementptr inbounds float, float* %typptr, i64 %1\n%ptr = bitcast float* %offsetptr to <8 x float>*\n%res = load <8 x float>, <8 x float>* %ptr, align 4, !alias.scope !3\nret <8 x float> %res"), NTuple{8,VecElement{Float32}}, Tuple{Ptr{Float32},Int64}, %1, %163)::NTuple{8,VecElement{Float32}}
│    %166 = %new(VectorizationBase.SVec{8,Float32}, %165)::VectorizationBase.SVec{8,Float32}
│    %167 = Base.add_int(24, %133)::Int64
│    %168 = Base.sub_int(%167, 1)::Int64
│    %169 = Base.llvmcall::Core.IntrinsicFunction
│    %170 = (%169)(("!1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n", "%typptr = inttoptr i64 %0 to float*\n%offsetptr = getelementptr inbounds float, float* %typptr, i64 %1\n%ptr = bitcast float* %offsetptr to <8 x float>*\n%res = load <8 x float>, <8 x float>* %ptr, align 4, !alias.scope !3\nret <8 x float> %res"), NTuple{8,VecElement{Float32}}, Tuple{Ptr{Float32},Int64}, %1, %168)::NTuple{8,VecElement{Float32}}
│    %171 = %new(VectorizationBase.SVec{8,Float32}, %170)::VectorizationBase.SVec{8,Float32}
│    %172 = LoopVectorization.vadd(%125, %156)::Any
│    %173 = LoopVectorization.vadd(%126, %161)::Any
│    %174 = LoopVectorization.vadd(%127, %166)::Any
│    %175 = LoopVectorization.vadd(%128, %171)::Any
└─── %176 = Base.add_int(%134, 32)::Int64
8 ┄─ %177 = φ (#7 => %172, #6 => %125)::Any
│    %178 = φ (#7 => %173, #6 => %126)::Any
│    %179 = φ (#7 => %174, #6 => %127)::Any
│    %180 = φ (#7 => %175, #6 => %128)::Any
│    %181 = φ (#7 => %176, #6 => %135)::Int64
│    %182 = φ (#7 => %176, #6 => %136)::Int64
│    %183 = φ (#7 => %176, #6 => %137)::Int64
│    %184 = φ (#7 => %176, #6 => %138)::Int64
│    %185 = φ (#7 => %176, #6 => %139)::Int64
│    %186 = φ (#7 => %176, #6 => %140)::Int64
│    %187 = φ (#7 => %176, #6 => %141)::Int64
│    %188 = φ (#7 => %176, #6 => %142)::Int64
│    %189 = φ (#7 => %176, #6 => %143)::Int64
│    %190 = φ (#7 => %176, #6 => %144)::Int64
│    %191 = φ (#7 => %176, #6 => %145)::Int64
│    %192 = φ (#7 => %176, #6 => %146)::Int64
│    %193 = φ (#7 => %176, #6 => %147)::Int64
│    %194 = φ (#7 => %176, #6 => %148)::Int64
│    %195 = φ (#7 => %176, #6 => %149)::Int64
│    %196 = Base.slt_int(m, %181)::Bool
└───        goto #10 if not %196
9 ──        goto #18
10 ─ %199 = Base.sub_int(m, 8)::Int64
│    %200 = Base.slt_int(%199, %182)::Bool
└───        goto #12 if not %200
11 ─ %202 = Base.sub_int(%183, 1)::Int64
│    %203 = Base.llvmcall::Core.IntrinsicFunction
│    %204 = (%203)(("!1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\ndeclare <8 x float> @llvm.masked.load.v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>)", "%typptr = inttoptr i64 %0 to float*\n%offsetptr = getelementptr inbounds float, float* %typptr, i64 %1\n%ptr = bitcast float* %offsetptr to <8 x float>*\n%mask = bitcast i8 %2 to <8 x i1>\n%res = call <8 x float> @llvm.masked.load.v8f32(<8 x float>* %ptr, i32 4, <8 x i1> %mask, <8 x float> zeroinitializer), !alias.scope !3\nret <8 x float> %res"), NTuple{8,VecElement{Float32}}, Tuple{Ptr{Float32},Int64,UInt8}, %1, %202, %9)::NTuple{8,VecElement{Float32}}
│    %205 = %new(VectorizationBase.SVec{8,Float32}, %204)::VectorizationBase.SVec{8,Float32}
│    %206 = LoopVectorization.vadd(%177, %205)::Any
│    %207 = LoopVectorization.vifelse(%10, %206, %177)::Any
└───        goto #18
12 ─ %209 = Base.sub_int(m, 16)::Int64
│    %210 = Base.slt_int(%209, %184)::Bool
└───        goto #14 if not %210
13 ─ %212 = Base.sub_int(%185, 1)::Int64
│    %213 = Base.llvmcall::Core.IntrinsicFunction
│    %214 = (%213)(("!1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n", "%typptr = inttoptr i64 %0 to float*\n%offsetptr = getelementptr inbounds float, float* %typptr, i64 %1\n%ptr = bitcast float* %offsetptr to <8 x float>*\n%res = load <8 x float>, <8 x float>* %ptr, align 4, !alias.scope !3\nret <8 x float> %res"), NTuple{8,VecElement{Float32}}, Tuple{Ptr{Float32},Int64}, %1, %212)::NTuple{8,VecElement{Float32}}
│    %215 = %new(VectorizationBase.SVec{8,Float32}, %214)::VectorizationBase.SVec{8,Float32}
│    %216 = Base.add_int(8, %186)::Int64
│    %217 = Base.sub_int(%216, 1)::Int64
│    %218 = Base.llvmcall::Core.IntrinsicFunction
│    %219 = (%218)(("!1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\ndeclare <8 x float> @llvm.masked.load.v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>)", "%typptr = inttoptr i64 %0 to float*\n%offsetptr = getelementptr inbounds float, float* %typptr, i64 %1\n%ptr = bitcast float* %offsetptr to <8 x float>*\n%mask = bitcast i8 %2 to <8 x i1>\n%res = call <8 x float> @llvm.masked.load.v8f32(<8 x float>* %ptr, i32 4, <8 x i1> %mask, <8 x float> zeroinitializer), !alias.scope !3\nret <8 x float> %res"), NTuple{8,VecElement{Float32}}, Tuple{Ptr{Float32},Int64,UInt8}, %1, %217, %9)::NTuple{8,VecElement{Float32}}
│    %220 = %new(VectorizationBase.SVec{8,Float32}, %219)::VectorizationBase.SVec{8,Float32}
│    %221 = LoopVectorization.vadd(%177, %215)::Any
│    %222 = LoopVectorization.vadd(%178, %220)::Any
│    %223 = LoopVectorization.vifelse(%10, %222, %178)::Any
└───        goto #18
14 ─ %225 = Base.sub_int(m, 24)::Int64
│    %226 = Base.slt_int(%225, %187)::Bool
└───        goto #16 if not %226
15 ─ %228 = Base.sub_int(%188, 1)::Int64
│    %229 = Base.llvmcall::Core.IntrinsicFunction
│    %230 = (%229)(("!1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n", "%typptr = inttoptr i64 %0 to float*\n%offsetptr = getelementptr inbounds float, float* %typptr, i64 %1\n%ptr = bitcast float* %offsetptr to <8 x float>*\n%res = load <8 x float>, <8 x float>* %ptr, align 4, !alias.scope !3\nret <8 x float> %res"), NTuple{8,VecElement{Float32}}, Tuple{Ptr{Float32},Int64}, %1, %228)::NTuple{8,VecElement{Float32}}
│    %231 = %new(VectorizationBase.SVec{8,Float32}, %230)::VectorizationBase.SVec{8,Float32}
│    %232 = Base.add_int(8, %189)::Int64
│    %233 = Base.sub_int(%232, 1)::Int64
│    %234 = Base.llvmcall::Core.IntrinsicFunction
│    %235 = (%234)(("!1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n", "%typptr = inttoptr i64 %0 to float*\n%offsetptr = getelementptr inbounds float, float* %typptr, i64 %1\n%ptr = bitcast float* %offsetptr to <8 x float>*\n%res = load <8 x float>, <8 x float>* %ptr, align 4, !alias.scope !3\nret <8 x float> %res"), NTuple{8,VecElement{Float32}}, Tuple{Ptr{Float32},Int64}, %1, %233)::NTuple{8,VecElement{Float32}}
│    %236 = %new(VectorizationBase.SVec{8,Float32}, %235)::VectorizationBase.SVec{8,Float32}
│    %237 = Base.add_int(16, %190)::Int64
│    %238 = Base.sub_int(%237, 1)::Int64
│    %239 = Base.llvmcall::Core.IntrinsicFunction
│    %240 = (%239)(("!1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\ndeclare <8 x float> @llvm.masked.load.v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>)", "%typptr = inttoptr i64 %0 to float*\n%offsetptr = getelementptr inbounds float, float* %typptr, i64 %1\n%ptr = bitcast float* %offsetptr to <8 x float>*\n%mask = bitcast i8 %2 to <8 x i1>\n%res = call <8 x float> @llvm.masked.load.v8f32(<8 x float>* %ptr, i32 4, <8 x i1> %mask, <8 x float> zeroinitializer), !alias.scope !3\nret <8 x float> %res"), NTuple{8,VecElement{Float32}}, Tuple{Ptr{Float32},Int64,UInt8}, %1, %238, %9)::NTuple{8,VecElement{Float32}}
│    %241 = %new(VectorizationBase.SVec{8,Float32}, %240)::VectorizationBase.SVec{8,Float32}
│    %242 = LoopVectorization.vadd(%177, %231)::Any
│    %243 = LoopVectorization.vadd(%178, %236)::Any
│    %244 = LoopVectorization.vadd(%179, %241)::Any
│    %245 = LoopVectorization.vifelse(%10, %244, %179)::Any
└───        goto #18
16 ─ %247 = Base.sub_int(m, 32)::Int64
│    %248 = Base.slt_int(%247, %191)::Bool
└───        goto #18 if not %248
17 ─ %250 = Base.sub_int(%192, 1)::Int64
│    %251 = Base.llvmcall::Core.IntrinsicFunction
│    %252 = (%251)(("!1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n", "%typptr = inttoptr i64 %0 to float*\n%offsetptr = getelementptr inbounds float, float* %typptr, i64 %1\n%ptr = bitcast float* %offsetptr to <8 x float>*\n%res = load <8 x float>, <8 x float>* %ptr, align 4, !alias.scope !3\nret <8 x float> %res"), NTuple{8,VecElement{Float32}}, Tuple{Ptr{Float32},Int64}, %1, %250)::NTuple{8,VecElement{Float32}}
│    %253 = %new(VectorizationBase.SVec{8,Float32}, %252)::VectorizationBase.SVec{8,Float32}
│    %254 = Base.add_int(8, %193)::Int64
│    %255 = Base.sub_int(%254, 1)::Int64
│    %256 = Base.llvmcall::Core.IntrinsicFunction
│    %257 = (%256)(("!1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n", "%typptr = inttoptr i64 %0 to float*\n%offsetptr = getelementptr inbounds float, float* %typptr, i64 %1\n%ptr = bitcast float* %offsetptr to <8 x float>*\n%res = load <8 x float>, <8 x float>* %ptr, align 4, !alias.scope !3\nret <8 x float> %res"), NTuple{8,VecElement{Float32}}, Tuple{Ptr{Float32},Int64}, %1, %255)::NTuple{8,VecElement{Float32}}
│    %258 = %new(VectorizationBase.SVec{8,Float32}, %257)::VectorizationBase.SVec{8,Float32}
│    %259 = Base.add_int(16, %194)::Int64
│    %260 = Base.sub_int(%259, 1)::Int64
│    %261 = Base.llvmcall::Core.IntrinsicFunction
│    %262 = (%261)(("!1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\n", "%typptr = inttoptr i64 %0 to float*\n%offsetptr = getelementptr inbounds float, float* %typptr, i64 %1\n%ptr = bitcast float* %offsetptr to <8 x float>*\n%res = load <8 x float>, <8 x float>* %ptr, align 4, !alias.scope !3\nret <8 x float> %res"), NTuple{8,VecElement{Float32}}, Tuple{Ptr{Float32},Int64}, %1, %260)::NTuple{8,VecElement{Float32}}
│    %263 = %new(VectorizationBase.SVec{8,Float32}, %262)::VectorizationBase.SVec{8,Float32}
│    %264 = Base.add_int(24, %195)::Int64
│    %265 = Base.sub_int(%264, 1)::Int64
│    %266 = Base.llvmcall::Core.IntrinsicFunction
│    %267 = (%266)(("!1 = !{!\"noaliasdomain\"}\n!2 = !{!\"noaliasscope\", !1}\n!3 = !{!2}\ndeclare <8 x float> @llvm.masked.load.v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>)", "%typptr = inttoptr i64 %0 to float*\n%offsetptr = getelementptr inbounds float, float* %typptr, i64 %1\n%ptr = bitcast float* %offsetptr to <8 x float>*\n%mask = bitcast i8 %2 to <8 x i1>\n%res = call <8 x float> @llvm.masked.load.v8f32(<8 x float>* %ptr, i32 4, <8 x i1> %mask, <8 x float> zeroinitializer), !alias.scope !3\nret <8 x float> %res"), NTuple{8,VecElement{Float32}}, Tuple{Ptr{Float32},Int64,UInt8}, %1, %265, %9)::NTuple{8,VecElement{Float32}}
│    %268 = %new(VectorizationBase.SVec{8,Float32}, %267)::VectorizationBase.SVec{8,Float32}
│    %269 = LoopVectorization.vadd(%177, %253)::Any
│    %270 = LoopVectorization.vadd(%178, %258)::Any
│    %271 = LoopVectorization.vadd(%179, %263)::Any
│    %272 = LoopVectorization.vadd(%180, %268)::Any
└─── %273 = LoopVectorization.vifelse(%10, %272, %180)::Any
18 ┄ %274 = φ (#9 => %177, #11 => %207, #13 => %221, #15 => %242, #17 => %269, #16 => %177)::Any
│    %275 = φ (#9 => %178, #11 => %178, #13 => %223, #15 => %243, #17 => %270, #16 => %178)::Any
│    %276 = φ (#9 => %179, #11 => %179, #13 => %179, #15 => %245, #17 => %271, #16 => %179)::Any
│    %277 = φ (#9 => %180, #11 => %180, #13 => %180, #15 => %180, #17 => %273, #16 => %180)::Any
│    %278 = LoopVectorization.evadd(%274, %276)::Any
│    %279 = LoopVectorization.evadd(%275, %277)::Any
│    %280 = LoopVectorization.evadd(%278, %279)::Any
│    %281 = LoopVectorization.extract_data(%280)::Any
└───        goto #19
19 ─        $(Expr(:gc_preserve_end, :(%3)))
│    %284 = LoopVectorization.reduced_add::Core.Compiler.Const(SIMDPirates.reduced_add, false)
│    %285 = (isa)(%281, Float32)::Bool
└───        goto #21 if not %285
20 ─ %287 = π (%281, Float32)
│    %288 = invoke %284(%287::Float32, 0.0f0::Float32)::Any
└───        goto #22
21 ─ %290 = LoopVectorization.reduced_add(%281, 0.0f0)::Any
└───        goto #22
22 ┄ %292 = φ (#20 => %288, #21 => %290)::Any
└───        return %292

Select a call to descend into or ↩ to ascend. [q]uit.
Toggles: [o]ptimize, [w]arn, [d]ebuginfo, [s]yntax highlight for Source/LLVM/Native.
Show: [S]ource code, [A]ST, [L]LVM IR, [N]ative code
Advanced: dump [P]arams cache.

 • %11  = invoke vzero(::Type{NTuple{8,VecElement{Float32}}})::Any
   %12  = call Type(::Any)::Union{Unsigned, VectorizationBase.SVec{_A,_B} where _B where _A}
   %13  = invoke vzero(::Type{NTuple{8,VecElement{Float32}}})::Any
   %14  = call Type(::Any)::Union{Unsigned, VectorizationBase.SVec{_A,_B} where _B where _A}
   %15  = invoke vzero(::Type{NTuple{8,VecElement{Float32}}})::Any
   %16  = call Type(::Any)::Union{Unsigned, VectorizationBase.SVec{_A,_B} where _B where _A}
   %17  = invoke vzero(::Type{NTuple{8,VecElement{Float32}}})::Any
   %18  = call Type(::Any)::Union{Unsigned, VectorizationBase.SVec{_A,_B} where _B where _A}
   %22  = invoke vzero(::Type{NTuple{8,VecElement{Float32}}})::Any
v  %23  = call Type(::Any)::Union{Unsigned, VectorizationBase.SVec{_A,_B} where _B where _A}

chriselrod · 2020-05-12T14:40:03Z

What happens if you descend into the

%11  = invoke vzero(::Type{NTuple{8,VecElement{Float32}}})::Any

?

baggepinnen · 2020-05-12T14:45:00Z

It's really weird: before I descend, it infers vzero::Any, but once inside, it infers correctly.

Select a call to descend into or ↩ to ascend. [q]uit.
Toggles: [o]ptimize, [w]arn, [d]ebuginfo, [s]yntax highlight for Source/LLVM/Native.
Show: [S]ource code, [A]ST, [L]LVM IR, [N]ative code
Advanced: dump [P]arams cache.

 • %11  = invoke vzero(::Type{NTuple{8,VecElement{Float32}}})::Any
   %12  = call Type(::Any)::Union{Unsigned, VectorizationBase.SVec{_A,_B} where _B where _A}
   %13  = invoke vzero(::Type{NTuple{8,VecElement{Float32}}})::Any
   %14  = call Type(::Any)::Union{Unsigned, VectorizationBase.SVec{_A,_B} where _B where _A}
   %15  = invoke vzero(::Type{NTuple{8,VecElement{Float32}}})::Any
   %16  = call Type(::Any)::Union{Unsigned, VectorizationBase.SVec{_A,_B} where _B where _A}
   %17  = invoke vzero(::Type{NTuple{8,VecElement{Float32}}})::Any
   %18  = call Type(::Any)::Union{Unsigned, VectorizationBase.SVec{_A,_B} where _B where _A}
   %22  = invoke vzero(::Type{NTuple{8,VecElement{Float32}}})::Any
v  %23  = call Type(::Any)::Union{Unsigned, VectorizationBase.SVec{_A,_B} where _B where _A}

│ ─ %-1  = invoke vzero(::Type{NTuple{8,VecElement{Float32}}})::NTuple{8,VecElement{Float32}}
Body::NTuple{8,VecElement{Float32}}
1 ─ %1 = Base.llvmcall::Core.Compiler.Const(Core.Intrinsics.llvmcall, false)
│   %2 = (%1)("ret <8 x float> zeroinitializer\n", NTuple{8,VecElement{Float32}}, Tuple{})::NTuple{8,VecElement{Float32}}
└──      return %2

Select a call to descend into or ↩ to ascend. [q]uit.
Toggles: [o]ptimize, [w]arn, [d]ebuginfo, [s]yntax highlight for Source/LLVM/Native.
Show: [S]ource code, [A]ST, [L]LVM IR, [N]ative code
Advanced: dump [P]arams cache.

chriselrod · 2020-05-12T14:58:48Z

Any idea how to fix this?

Is there some sort of heuristic that inference could be hitting that just makes it give up?

Maybe instead of:

# Generated method returning an all-zero SIMD vector of type `Vec{W,T}`.
# The LLVM IR text is assembled from `W` and `T` when the method body is generated,
# then spliced into a `Base.llvmcall` that takes no runtime arguments.
@generated function vzero(::Type{Vec{W,T}}) where {W,T}
    # `llvmtype` is defined elsewhere; presumably it maps `T` to its LLVM type name
    # (the IR dump in this thread shows "float" for Float32).
    typ = llvmtype(T)
    # LLVM vector type string, e.g. "<8 x float>" for W = 8, T = Float32.
    vtyp = "<$W x $typ>"
    instrs = """
    ret $vtyp zeroinitializer
    """
    quote
        # Ask the compiler to inline the generated body.
        $(Expr(:meta,:inline))
        # Empty `Tuple{}` argument list: the IR only returns the zero-initialized vector.
        Base.llvmcall($instrs, Vec{$W,$T}, Tuple{}, )
    end
end
# Wrapper: obtain the raw zero vector from `vzero(Vec{W,T})` and wrap it in `SVec`.
# This adds one more call layer on top of the generated method.
@inline vzero(::Val{W}, ::Type{T}) where {W,T} = SVec(vzero(Vec{W,T}))

It would be nicer to the compiler to have

# Proposed alternative: a generated method that returns the `SVec`-wrapped zero vector
# directly, so the `SVec(...)` wrapping happens inside the generated body instead of in
# a separate wrapper method (one fewer call level).
@generated function vzero(::Val{W}, ::Type{T}) where {W,T}
    # `llvmtype` is defined elsewhere; presumably it maps `T` to its LLVM type name.
    typ = llvmtype(T)
    # LLVM vector type string of width `W`, e.g. "<8 x float>".
    vtyp = "<$W x $typ>"
    instrs = """
    ret $vtyp zeroinitializer
    """
    quote
        # Ask the compiler to inline the generated body.
        $(Expr(:meta,:inline))
        # Zero-initialized `Vec{W,T}` from LLVM, wrapped in `SVec` before returning.
        SVec(Base.llvmcall($instrs, Vec{$W,$T}, Tuple{}, ))
    end
end

for the sake of less depth?

I'd still want the old definition (the one taking a ::Type{Vec{W,T}} argument), so that would mean a lot of code duplication if applied generally.

baggepinnen · 2020-05-12T15:01:21Z

I am completely lost. It infers fine for Float64, Int64, Int32 (and produces nice vectorized code) but fails for Float32. Is there something special in how Float32 are treated in VectorizationBase?

chriselrod · 2020-05-12T15:03:59Z

Is there something special in how Float32 are treated in VectorizationBase?

No. I am also completely lost.

I assume you've tried starting a new Julia session to see if the problem magically goes away?

baggepinnen · 2020-05-12T15:15:20Z

I have tried the latest nightly binary and restarted several times, same problem. Tried in both Juno and terminal as well.
I tried Julia LTS v1.0.5 and the code inferred okay, but that installed a very old version of LV:
LoopVectorization v0.1.3.

chriselrod · 2020-05-12T15:23:01Z

You can try `add LoopVectorization#julia1` for a recent LoopVectorization version that passes tests (locally, at least) on 1.0.

baggepinnen · 2020-05-12T15:30:12Z

With the julia1 branch the problem is back also on Julia v1.0.5

chriselrod · 2020-05-12T15:42:09Z

Thanks for confirming. It was initializing zeros differently back in those days. Apparently in a way friendlier to inference.

Mind trying random things?
Maybe I shouldn't be doing it on VectorizationBase#master, but that's where my first attempt is.
Could you add it and let me know if the problem goes away?

If not, I'll create a branch and try something else.

baggepinnen · 2020-05-12T15:47:53Z

Amazing, VectorizationBase#master infers fine!

chriselrod · 2020-05-12T15:57:40Z

Great!
All I did was the change suggested here:
#114 (comment)

I don't think it should have made a difference, but it seems like inference was just giving up after inlining too many things with Float32 for some reason?
I need to read more about how Julia's compiler and inference work.

For now, I'm considering adding this:

# Define ordinary (non-generated) `vzero(::Val{W}, ::Type{T})` methods via `@eval`
# for each listed element type and each power-of-two width W = 1, 2, 4, ...,
# so that these (W, T) combinations do not go through a generated function.
for T ∈ [Float16,Float32,Float64,Int8,Int16,Int32,Int64,UInt8,UInt16,UInt32,UInt64]
    # `pick_vector_width` is defined elsewhere; presumably the widest vector width used for `T`.
    maxW = pick_vector_width(T)
    # `llvmtype` is defined elsewhere; presumably the LLVM type name for `T`.
    typ = llvmtype(T)
    # Iterate over exponents so that W takes the values 2^0 .. 2^intlog2(maxW).
    for log2W ∈ 0:intlog2(maxW)
        W = 1 << log2W
        # Same IR as the generated version, but built once here at definition time.
        instrs = "ret <$W x $typ> zeroinitializer"
        @eval @inline vzero(::Val{$W}, ::Type{$T}) = SVec(Base.llvmcall($instrs, Vec{$W,$T}, Tuple{}, ))
    end
end

So that most uses avoid a generated function altogether. Which may or may not help, but I saw an example before where inference gave up with a generated function but worked through the equivalent normal code.
Not knowing the details about the internals, I can only speculate.

chriselrod · 2020-05-13T17:11:58Z

I tagged a new release of VectorizationBase. Good to close this?

baggepinnen · 2020-05-13T19:06:34Z

Great, thanks as always for excellent attention to issues!

baggepinnen mentioned this issue May 12, 2020

problem with the unreliable approximation of Core.Compiler.return_type JuliaLang/julia#35800

Open

baggepinnen closed this as completed May 13, 2020

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

@avx messes with type inference #114

@avx messes with type inference #114

baggepinnen commented May 12, 2020

chriselrod commented May 12, 2020 •

edited

Loading

baggepinnen commented May 12, 2020

baggepinnen commented May 12, 2020

chriselrod commented May 12, 2020 •

edited

Loading

baggepinnen commented May 12, 2020 •

edited

Loading

chriselrod commented May 12, 2020 •

edited

Loading

baggepinnen commented May 12, 2020

chriselrod commented May 12, 2020 •

edited

Loading

baggepinnen commented May 12, 2020

chriselrod commented May 12, 2020 •

edited

Loading

baggepinnen commented May 12, 2020

baggepinnen commented May 12, 2020

chriselrod commented May 12, 2020 •

edited

Loading

baggepinnen commented May 12, 2020

chriselrod commented May 12, 2020

baggepinnen commented May 12, 2020

chriselrod commented May 12, 2020

baggepinnen commented May 12, 2020

chriselrod commented May 12, 2020 •

edited

Loading

baggepinnen commented May 12, 2020

chriselrod commented May 12, 2020 •

edited

Loading

baggepinnen commented May 12, 2020

chriselrod commented May 12, 2020 •

edited

Loading

baggepinnen commented May 12, 2020

chriselrod commented May 12, 2020 •

edited

Loading

baggepinnen commented May 12, 2020

chriselrod commented May 12, 2020

chriselrod commented May 13, 2020

baggepinnen commented May 13, 2020

@avx messes with type inference #114

@avx messes with type inference #114

Comments

baggepinnen commented May 12, 2020

chriselrod commented May 12, 2020 • edited Loading

baggepinnen commented May 12, 2020

baggepinnen commented May 12, 2020

chriselrod commented May 12, 2020 • edited Loading

baggepinnen commented May 12, 2020 • edited Loading

chriselrod commented May 12, 2020 • edited Loading

baggepinnen commented May 12, 2020

chriselrod commented May 12, 2020 • edited Loading

baggepinnen commented May 12, 2020

chriselrod commented May 12, 2020 • edited Loading

baggepinnen commented May 12, 2020

baggepinnen commented May 12, 2020

chriselrod commented May 12, 2020 • edited Loading

baggepinnen commented May 12, 2020

chriselrod commented May 12, 2020

baggepinnen commented May 12, 2020

chriselrod commented May 12, 2020

baggepinnen commented May 12, 2020

chriselrod commented May 12, 2020 • edited Loading

baggepinnen commented May 12, 2020

chriselrod commented May 12, 2020 • edited Loading

baggepinnen commented May 12, 2020

chriselrod commented May 12, 2020 • edited Loading

baggepinnen commented May 12, 2020

chriselrod commented May 12, 2020 • edited Loading

baggepinnen commented May 12, 2020

chriselrod commented May 12, 2020

chriselrod commented May 13, 2020

baggepinnen commented May 13, 2020

chriselrod commented May 12, 2020 •

edited

Loading

chriselrod commented May 12, 2020 •

edited

Loading

baggepinnen commented May 12, 2020 •

edited

Loading

chriselrod commented May 12, 2020 •

edited

Loading

chriselrod commented May 12, 2020 •

edited

Loading

chriselrod commented May 12, 2020 •

edited

Loading

chriselrod commented May 12, 2020 •

edited

Loading

chriselrod commented May 12, 2020 •

edited

Loading

chriselrod commented May 12, 2020 •

edited

Loading

chriselrod commented May 12, 2020 •

edited

Loading

chriselrod commented May 12, 2020 •

edited

Loading