From b3f52189c6e766ea3be7a469f481c87523f0a782 Mon Sep 17 00:00:00 2001
From: Dahua Lin <lindahua@gmail.com>
Date: Tue, 3 Jun 2014 15:47:23 -0500
Subject: [PATCH 1/4] Refactoring reducedim and related functions

---
 base/bitarray.jl  |   3 +-
 base/reducedim.jl | 311 ++++++++++++++++++++++++----------------------
 test/reducedim.jl |   4 +-
 3 files changed, 167 insertions(+), 151 deletions(-)

diff --git a/base/bitarray.jl b/base/bitarray.jl
index b17a2c11c3290..e3a523dc9ff33 100644
--- a/base/bitarray.jl
+++ b/base/bitarray.jl
@@ -1385,8 +1385,7 @@ end
 
 ## Reductions ##
 
-sum(A::BitArray, region) = reducedim(+, A, region, 0, Array(Int,reduced_dims(A,region)))
-
+sum(A::BitArray, region) = reducedim(AddFun(), A, region)
 sum(B::BitArray) = countnz(B)
 
 function all(B::BitArray)
diff --git a/base/reducedim.jl b/base/reducedim.jl
index 5e562ace3f1e8..807e153f6cc5a 100644
--- a/base/reducedim.jl
+++ b/base/reducedim.jl
@@ -5,14 +5,11 @@ reduced_dims(a::AbstractArray, region) = reduced_dims(size(a), region)
 
 # for reductions that keep 0 dims as 0
 reduced_dims0(a::AbstractArray, region) = reduced_dims0(size(a), region)
-
 reduced_dims{N}(siz::NTuple{N,Int}, d::Int, rd::Int) = (d == 1 ? tuple(rd, siz[d+1:N]...) :
                                                         d == N ? tuple(siz[1:N-1]..., rd) :
                                                         1 < d < N ? tuple(siz[1:d-1]..., rd, siz[d+1:N]...) : 
                                                         siz)::typeof(siz)
-
 reduced_dims{N}(siz::NTuple{N,Int}, d::Int) = reduced_dims(siz, d, 1)
-
 reduced_dims0{N}(siz::NTuple{N,Int}, d::Int) = 1 <= d <= N ? reduced_dims(siz, d, (siz[d] == 0 ? 0 : 1)) : siz
 
 function reduced_dims{N}(siz::NTuple{N,Int}, region)
@@ -46,184 +43,202 @@ end
 
 ###### Generic reduction functions #####
 
-reducedim(f::Function, A, region, initial) = reducedim!(f, reduction_init(A, region, initial), A)
+## initialization
 
-reducedim(f::Function, A, region, initial, R) = reducedim!(f, fill!(R, initial), A)
+for (Op, initfun) in ((:AddFun, :zero), (:MulFun, :one), (:MaxFun, :typemin), (:MinFun, :typemax))
+    @eval initarray!{T}(a::AbstractArray{T}, ::$(Op), init::Bool) = (init && fill!(a, $(initfun)(T)); a)
+end
 
-function reducedim!_function(N::Int, f::Function)
-    body = gen_reduction_body(N, f)
-    @eval begin
-        local _F_
-        function _F_(R, A)
-            $body
-        end
-        _F_
-    end
+for (Op, initval) in ((:AndFun, true), (:OrFun, false))
+    @eval initarray!(a::AbstractArray, ::$(Op), init::Bool) = (init && fill!(a, $initval); a)
 end
 
-let reducedim_cache = Dict()
-# reducedim! assumes that R has already been initialized with a seed value
-global reducedim!
-function reducedim!(f::Function, R, A)
-    if isempty(R)
-        return R
+reducedim_initarray{R}(A::AbstractArray, region, v0, ::Type{R}) = fill!(similar(A,R,reduced_dims(A,region)), v0)
+reducedim_initarray{T}(A::AbstractArray, region, v0::T) = reducedim_initarray(A, region, v0, T)
+
+reducedim_initarray0{R}(A::AbstractArray, region, v0, ::Type{R}) = fill!(similar(A,R,reduced_dims0(A,region)), v0)
+reducedim_initarray0{T}(A::AbstractArray, region, v0::T) = reducedim_initarray0(A, region, v0, T)
+
+# TODO: better way to handle reducedim initialization
+#
+# The current scheme is basically following Steven G. Johnson's original implementation
+#
+function reducedim_init{T}(f, op::AddFun, A::AbstractArray{T}, region)
+    if method_exists(zero, (Type{T},))
+        x = evaluate(f, zero(T))
+        z = zero(x) + zero(x)
+        Tr = typeof(z) == typeof(x) && !isbits(T) ? T : typeof(z)
+    else
+        z = zero(sum(f, A))
+        Tr = typeof(z)
     end
-    ndimsA = ndims(A)
-    key = (ndimsA, f)
-    if !haskey(reducedim_cache,key)
-        func = reducedim!_function(ndimsA, f)
-        reducedim_cache[key] = func
+    return reducedim_initarray(A, region, z, Tr)
+end
+
+function reducedim_init{T}(f, op::MulFun, A::AbstractArray{T}, region)
+    if method_exists(zero, (Type{T},))
+        x = evaluate(f, zero(T))
+        z = one(x) * one(x)
+        Tr = typeof(z) == typeof(x) && !isbits(T) ? T : typeof(z)
     else
-        func = reducedim_cache[key]
+        z = one(prod(f, A))
+        Tr = typeof(z)
     end
-    func(R, A)::typeof(R)
+    return reducedim_initarray(A, region, z, Tr)
 end
-end  # let reducedim_cache
 
-# Generate the body for a reduction function reduce!(f, R, A), using binary operation f,
-# where R is the output and A is the input.
-# R must have already been set to the appropriate size and initialized with the seed value
-function gen_reduction_body(N, f::Function)
-    F = Expr(:quote, f)
-    quote
-        (isempty(R) || isempty(A)) && return R
-        for i = 1:$N
-            (size(R, i) == size(A, i) || size(R, i) == 1) || throw(DimensionMismatch("Reduction on array of size $(size(A)) with output of size $(size(R))"))
-        end
-        @nextract $N sizeR d->size(R,d)
-        # If we're reducing along dimension 1, for efficiency we can make use of a temporary.
-        # Otherwise, keep the result in R so that we traverse A in storage order.
-        if size(R, 1) < size(A, 1)
-            @nloops $N i d->(d>1? (1:size(A,d)) : (1:1)) d->(j_d = sizeR_d==1 ? 1 : i_d) begin
-                @inbounds tmp = (@nref $N R j)
-                for i_1 = 1:size(A,1)
-                    @inbounds tmp = ($F)(tmp, (@nref $N A i))
+reducedim_init{T}(f, op::MaxFun, A::AbstractArray{T}, region) = reducedim_initarray0(A, region, typemin(evaluate(f, zero(T))))
+reducedim_init{T}(f, op::MinFun, A::AbstractArray{T}, region) = reducedim_initarray0(A, region, typemax(evaluate(f, zero(T))))
+reducedim_init{T}(f::Union(AbsFun,Abs2Fun), op::MaxFun, A::AbstractArray{T}, region) = 
+    reducedim_initarray(A, region, zero(evaluate(f, zero(T))))
+
+reducedim_init(f, op::AndFun, A::AbstractArray, region) = reducedim_initarray(A, region, true)
+reducedim_init(f, op::OrFun, A::AbstractArray, region) = reducedim_initarray(A, region, false)
+
+
+## generic (map)reduction
+
+has_fast_linear_indexing(a::AbstractArray) = false
+has_fast_linear_indexing(a::Array) = true
+
+function check_reducdims(R, A)
+    # Check that R has compatible dimensions w.r.t. A for reduction; throws
+    # DimensionMismatch when size(R, i) is neither 1 nor size(A, i).
+    # It returns an integer value (useful for choosing implementation)
+    # - If it reduces only along leading dimensions, e.g. sum(A, 1) or sum(A, (1, 2)),
+    #   it returns the length of the leading slice. For the two examples above,
+    #   it will be size(A, 1) or size(A, 1) * size(A, 2).
+    # - If a reduced dimension follows a kept one, e.g. sum(A, 2) or sum(A, (1, 3)), it returns 0.
+    # - If no dimension is reduced, it returns 1.
+    lsiz = 1
+    had_nonreduc = false
+    for i = 1:ndims(A)
+        sRi = size(R, i)
+        sAi = size(A, i)
+        if sRi == 1
+            if sAi > 1 
+                if had_nonreduc
+                    lsiz = 0  # reducing along i after a kept dimension: not a leading slice
+                else
+                    lsiz *= sAi  # only leading dimensions reduced so far: grow the slice
                 end
-                @inbounds (@nref $N R j) = tmp
             end
         else
-            @nloops $N i A d->(j_d = sizeR_d==1 ? 1 : i_d) begin
-                @inbounds (@nref $N R j) = ($F)((@nref $N R j), (@nref $N A i))
-            end
+            sRi == sAi || 
+                throw(DimensionMismatch("Reduction on array of size $(size(A)) with output of size $(size(R))"))
+            had_nonreduc = true
         end
-        R
     end
+    return lsiz
 end
 
-reduction_init{T}(A::AbstractArray, region, initial::T, Tr=T) = fill!(similar(A,Tr,reduced_dims(A,region)), initial)
-
-function initarray!{T}(a::AbstractArray{T}, v::T, init::Bool) 
-    if init 
-        fill!(a, v)
-    end
-    return a
-end
-
-
-##### Specific reduction functions #####
-
-## sum
-
-@ngenerate N typeof(R) function _sum!{T,N}(f, R::AbstractArray, A::AbstractArray{T,N})
-    (isempty(R) || isempty(A)) && return R
-    rdims = 0
-    for i = 1:N
-        if size(R, i) == size(A, i)
-        elseif size(R, i) == 1
-            rdims += 1
-        else
-            throw(DimensionMismatch("sum of array of size $(size(A)) with output of size $(size(R))"))
-        end
-    end
-    @nextract N sizeR d->size(R,d)
-    # If we're reducing along dimension 1 and dimension 1 is
-    # sufficiently large, use the pairwise implementation. Otherwise,
-    # keep the result in R so that we traverse A in storage order.
-    sz1 = size(A, 1)
-    if size(R, 1) < sz1 && sz1 >= 16 && rdims == 1
-        for i = 1:div(length(A), sz1)
-            @inbounds R[i] = mapreduce_impl(f, AddFun(), A, (i-1)*sz1+1, i*sz1)
-        end
-    else
-        @nloops N i A d->(j_d = sizeR_d==1 ? 1 : i_d) begin
-            @inbounds (@nref N R j) += evaluate(f, @nref N A i)
+function mapreducedim!_func(N::Int)
+    @eval begin
+        local _F_
+        function _F_(f, op, R, A)
+            lsiz = check_reducdims(R, A)
+            isempty(A) && return R
+            @nextract $N sizeR d->size(R,d)
+            sizA1 = size(A, 1)
+
+            if has_fast_linear_indexing(A) && lsiz > 16
+                # use mapreduce_impl, which is probably better tuned to achieve higher performance
+                nslices = div(length(A), lsiz)
+                ibase = 0
+                for i = 1:nslices
+                    @inbounds R[i] = mapreduce_impl(f, op, A, ibase+1, ibase+lsiz)
+                    ibase += lsiz
+                end
+            elseif size(R, 1) == 1 && sizA1 > 1
+                # keep the accumulator as a local variable when reducing along the first dimension
+                @nloops $N i d->(d>1? (1:size(A,d)) : (1:1)) d->(j_d = sizeR_d==1 ? 1 : i_d) begin
+                    @inbounds r = (@nref $N R j)
+                    for i_1 = 1:sizA1
+                        @inbounds v = evaluate(f, (@nref $N A i))
+                        r = evaluate(op, r, v)
+                    end
+                    @inbounds (@nref $N R j) = r
+                end 
+            else
+                # general implementation
+                @nloops $N i A d->(j_d = sizeR_d==1 ? 1 : i_d) begin
+                    @inbounds v = evaluate(f, (@nref $N A i))
+                    @inbounds (@nref $N R j) = evaluate(op, (@nref $N R j), v)
+                end
+            end
+            return R
         end
-    end
-    R
+        _F_
+    end  # @eval
 end
 
-function sum{T}(f::Union(Function,Func{1}), A::AbstractArray{T}, region)
-    if method_exists(zero, (Type{T},))
-        fz = evaluate(f, zero(T))
-        z = fz + fz
-        Tr = typeof(z) == typeof(fz) && !isbits(T) ? T : typeof(z)
+let mapreducedim_fcache = (Int=>Function)[]
+global _mapreducedim!
+function _mapreducedim!(f, op, R::AbstractArray, A::AbstractArray)
+    isempty(R) && return R
+    ndimsA = ndims(A)
+    if !haskey(mapreducedim_fcache, ndimsA)
+        func! = mapreducedim!_func(ndimsA)
+        mapreducedim_fcache[ndimsA] = func!
     else
-        # TODO: handle more heterogeneous sums.  e.g. sum(A, 1) where
-        # A is a Matrix{Any} with one column of numbers and one of vectors
-        z = zero(sum(f, A))
-        Tr = typeof(z)
+        func! = mapreducedim_fcache[ndimsA]
     end
-    _sum!(f, reduction_init(A, region, z, Tr), A)
+    func!(f, op, R, A)::typeof(R)
 end
+end # let mapreducedim_fcache
 
-sum!{R}(f::Union(Function,Func{1}), r::AbstractArray{R}, A::AbstractArray; init::Bool=true) =
-    _sum!(f, initarray!(r, zero(R), init), A)
+mapreducedim!(f, op, R::AbstractArray, A::AbstractArray) = _mapreducedim!(f, op, R, A)
 
-for (fname, func) in ((:sum, :IdFun), (:sumabs, :AbsFun), (:sumabs2, :Abs2Fun))
-    @eval begin
-        $fname(A::AbstractArray, region) = sum($func(), A, region)
-        $(symbol("$(fname)!")){R}(r::AbstractArray{R}, A::AbstractArray; init::Bool=true) =
-            sum!($func(), r, A; init=init)
-    end
+function mapreducedim!(f::Function, op, R::AbstractArray, A::AbstractArray)
+    is(op, +) ? _mapreducedim!(f, AddFun(), R, A) :
+    is(op, *) ? _mapreducedim!(f, MulFun(), R, A) :
+    is(op, &) ? _mapreducedim!(f, AndFun(), R, A) :
+    is(op, |) ? _mapreducedim!(f, OrFun(), R, A) :
+    _mapreducedim!(f, op, R, A)
 end
 
+reducedim!{RT}(op, R::AbstractArray{RT}, A::AbstractArray) = mapreducedim!(IdFun(), op, R, A)  # R must be pre-seeded
 
-## prod
-
-eval(ngenerate(:N, :(typeof(R)), :(_prod!{T,N}(R::AbstractArray, A::AbstractArray{T,N})), N->gen_reduction_body(N, *)))
-prod!{R}(r::AbstractArray{R}, A::AbstractArray; init::Bool=true) = _prod!(initarray!(r, one(R), init), A)
-
-function prod{T}(A::AbstractArray{T}, region)
-    if method_exists(one, (Type{T},))
-        z = one(T) * one(T)
-        Tr = typeof(z) == typeof(one(T)) ? T : typeof(z)
-    else
-        # TODO: handle more heterogeneous products.  e.g. prod(A, 1) where
-        # A is a Matrix{Any} with one column of numbers and one of vectors
-        z = one(prod(A))
-        Tr = typeof(z)
-    end
-    _prod!(reduction_init(A, region, z, Tr), A)
-end
-
-prod(A::AbstractArray{Bool}, region) = error("use all() instead of prod() for boolean arrays")
+mapreducedim(f, op, A::AbstractArray, region, v0) = mapreducedim!(f, op, reducedim_initarray(A, region, v0), A)
+mapreducedim{T}(f, op, A::AbstractArray{T}, region) = mapreducedim!(f, op, reducedim_init(f, op, A, region), A)
 
+reducedim(op, A::AbstractArray, region, v0) = mapreducedim(IdFun(), op, A, region, v0)
+reducedim(op, A::AbstractArray, region) = mapreducedim(IdFun(), op, A, region)
 
-## maximum & minimum
 
-eval(ngenerate(:N, :(typeof(R)), :(_maximum!{T,N}(R::AbstractArray, A::AbstractArray{T,N})), N->gen_reduction_body(N, scalarmax)))
-maximum!{R}(r::AbstractArray{R}, A::AbstractArray; init::Bool=true) = _maximum!(initarray!(r, typemin(R), init), A)
-maximum{T}(A::AbstractArray{T}, region) =
-    isempty(A) ? similar(A,reduced_dims0(A,region)) : _maximum!(reduction_init(A, region, typemin(T)), A)
-
-eval(ngenerate(:N, :(typeof(R)), :(_minimum!{T,N}(R::AbstractArray, A::AbstractArray{T,N})), N->gen_reduction_body(N, scalarmin)))
-minimum!{R}(r::AbstractArray{R}, A::AbstractArray; init::Bool=true) = _minimum!(initarray!(r, typemax(R), init), A)
-minimum{T}(A::AbstractArray{T}, region) =
-    isempty(A) ? similar(A, reduced_dims0(A, region)) : _minimum!(reduction_init(A, region, typemax(T)), A)
+##### Specific reduction functions #####
 
+for (fname, Op) in [(:sum, :AddFun), (:prod, :MulFun), 
+                    (:maximum, :MaxFun), (:minimum, :MinFun), 
+                    (:all, :AndFun), (:any, :OrFun)]
 
-## all & any
+    fname! = symbol(string(fname, '!'))
+    @eval begin 
+        $(fname!)(f::Union(Function,Func{1}), r::AbstractArray, A::AbstractArray; init::Bool=true) = 
+            mapreducedim!(f, $(Op)(), initarray!(r, $(Op)(), init), A)
+        $(fname!)(r::AbstractArray, A::AbstractArray; init::Bool=true) = $(fname!)(IdFun(), r, A; init=init)
 
-eval(ngenerate(:N, :(typeof(R)), :(_all!{N}(R::AbstractArray, A::AbstractArray{Bool,N})), N->gen_reduction_body(N, &)))
-all!(r::AbstractArray, A::AbstractArray{Bool}; init::Bool=true) = _all!(initarray!(r, true, init), A)
-all(A::AbstractArray{Bool}, region) = _all!(reduction_init(A, region, true), A)
+        $(fname)(f::Union(Function,Func{1}), A::AbstractArray, region) = 
+            mapreducedim(f, $(Op)(), A, region)
+        $(fname)(A::AbstractArray, region) = $(fname)(IdFun(), A, region)
+    end
+end
 
-eval(ngenerate(:N, :(typeof(R)), :(_any!{N}(R::AbstractArray, A::AbstractArray{Bool,N})), N->gen_reduction_body(N, |)))
-any!(r::AbstractArray, A::AbstractArray{Bool}; init::Bool=true) = _any!(initarray!(r, false, init), A)
-any(A::AbstractArray{Bool}, region) = _any!(reduction_init(A, region, false), A)
+for (fname, fbase, Fun) in [(:sumabs, :sum, :AbsFun), 
+                            (:sumabs2, :sum, :Abs2Fun), 
+                            (:maxabs, :maximum, :AbsFun), 
+                            (:minabs, :minimum, :AbsFun)]
+    fname! = symbol(string(fname, '!'))
+    fbase! = symbol(string(fbase, '!'))
+    @eval begin 
+        $(fname!)(r::AbstractArray, A::AbstractArray; init::Bool=true) = 
+            $(fbase!)($(Fun)(), r, A; init=init)
+        $(fname)(A::AbstractArray, region) = $(fbase)($(Fun)(), A, region)
+    end
+end
 
 
-## findmin & findmax
+##### findmin & findmax #####
 
 # Generate the body for a reduction function reduce!(f, Rval, Rind, A), using a comparison operator f
 # Rind contains the index of A from which Rval was taken
@@ -272,10 +287,12 @@ eval(ngenerate(:N, :(typeof((Rval,Rind))), :(_findmin!{T,N}(Rval::AbstractArray,
 findmin!{R}(rval::AbstractArray{R}, rind::AbstractArray, A::AbstractArray; init::Bool=true) = _findmin!(initarray!(rval, typemax(R), init), rind, A)
 findmin{T}(A::AbstractArray{T}, region) = 
     isempty(A) ? (similar(A,reduced_dims0(A,region)), zeros(Int,reduced_dims0(A,region))) :
-                  _findmin!(reduction_init(A, region, typemax(T)), zeros(Int,reduced_dims0(A,region)), A)
+                  _findmin!(reducedim_initarray0(A, region, typemax(T)), zeros(Int,reduced_dims0(A,region)), A)
 
 eval(ngenerate(:N, :(typeof((Rval,Rind))), :(_findmax!{T,N}(Rval::AbstractArray, Rind::AbstractArray, A::AbstractArray{T,N})), N->gen_findreduction_body(N, >)))
 findmax!{R}(rval::AbstractArray{R}, rind::AbstractArray, A::AbstractArray; init::Bool=true) = _findmax!(initarray!(rval, typemin(R), init), rind, A)
 findmax{T}(A::AbstractArray{T}, region) = 
     isempty(A) ? (similar(A,reduced_dims0(A,region)), zeros(Int,reduced_dims0(A,region))) :
-                  _findmax!(reduction_init(A, region, typemin(T)), zeros(Int,reduced_dims0(A,region)), A)
+                  _findmax!(reducedim_initarray0(A, region, typemin(T)), zeros(Int,reduced_dims0(A,region)), A)
+
+
diff --git a/test/reducedim.jl b/test/reducedim.jl
index 2d35070c35fb7..6f8b9b18b004a 100644
--- a/test/reducedim.jl
+++ b/test/reducedim.jl
@@ -13,9 +13,9 @@ safe_sumabs2{T}(A::Array{T}, region) = safe_mapslices(sum, abs2(A), region)
 
 Areduc = rand(3, 4, 5, 6)
 for region in {
-        1, 2, 3, 4, 5, (1, 2), (1, 3), (1, 4), (2, 3), (2, 4), (3, 4),
+    1, 2, 3, 4, 5, (1, 2), (1, 3), (1, 4), (2, 3), (2, 4), (3, 4),
     (1, 2, 3), (1, 3, 4), (2, 3, 4), (1, 2, 3, 4)}
-
+    # println("region = $region")
     r = fill(NaN, Base.reduced_dims(size(Areduc), region))
     @test_approx_eq sum!(r, Areduc) safe_sum(Areduc, region)
     @test_approx_eq prod!(r, Areduc) safe_prod(Areduc, region)

From 34410d11ef8b012856ac106e95b323fc7e8c7880 Mon Sep 17 00:00:00 2001
From: Dahua Lin <lindahua@gmail.com>
Date: Wed, 4 Jun 2014 09:59:46 -0500
Subject: [PATCH 2/4] use ngenerate for mapreducedim

---
 base/reducedim.jl | 79 +++++++++++++++++++----------------------------
 1 file changed, 31 insertions(+), 48 deletions(-)

diff --git a/base/reducedim.jl b/base/reducedim.jl
index 807e153f6cc5a..0a9fe46067561 100644
--- a/base/reducedim.jl
+++ b/base/reducedim.jl
@@ -132,60 +132,43 @@ function check_reducdims(R, A)
     return lsiz
 end
 
-function mapreducedim!_func(N::Int)
-    @eval begin
-        local _F_
-        function _F_(f, op, R, A)
-            lsiz = check_reducdims(R, A)
-            isempty(A) && return R
-            @nextract $N sizeR d->size(R,d)
-            sizA1 = size(A, 1)
-
-            if has_fast_linear_indexing(A) && lsiz > 16
-                # use mapreduce_impl, which is probably better tuned to achieve higher performance
-                nslices = div(length(A), lsiz)
-                ibase = 0
-                for i = 1:nslices
-                    @inbounds R[i] = mapreduce_impl(f, op, A, ibase+1, ibase+lsiz)
-                    ibase += lsiz
-                end
-            elseif size(R, 1) == 1 && sizA1 > 1
-                # keep the accumulator as a local variable when reducing along the first dimension
-                @nloops $N i d->(d>1? (1:size(A,d)) : (1:1)) d->(j_d = sizeR_d==1 ? 1 : i_d) begin
-                    @inbounds r = (@nref $N R j)
-                    for i_1 = 1:sizA1
-                        @inbounds v = evaluate(f, (@nref $N A i))
-                        r = evaluate(op, r, v)
-                    end
-                    @inbounds (@nref $N R j) = r
-                end 
-            else
-                # general implementation
-                @nloops $N i A d->(j_d = sizeR_d==1 ? 1 : i_d) begin
+function mapreducedim_body(N::Int)
+    quote
+        lsiz = check_reducdims(R, A)
+        isempty(A) && return R
+        @nextract $N sizeR d->size(R,d)
+        sizA1 = size(A, 1)
+
+        if has_fast_linear_indexing(A) && lsiz > 16
+            # use mapreduce_impl, which is probably better tuned to achieve higher performance
+            nslices = div(length(A), lsiz)
+            ibase = 0
+            for i = 1:nslices
+                @inbounds R[i] = mapreduce_impl(f, op, A, ibase+1, ibase+lsiz)
+                ibase += lsiz
+            end
+        elseif size(R, 1) == 1 && sizA1 > 1
+            # keep the accumulator as a local variable when reducing along the first dimension
+            @nloops $N i d->(d>1? (1:size(A,d)) : (1:1)) d->(j_d = sizeR_d==1 ? 1 : i_d) begin
+                @inbounds r = (@nref $N R j)
+                for i_1 = 1:sizA1
                     @inbounds v = evaluate(f, (@nref $N A i))
-                    @inbounds (@nref $N R j) = evaluate(op, (@nref $N R j), v)
+                    r = evaluate(op, r, v)
                 end
+                @inbounds (@nref $N R j) = r
+            end 
+        else
+            # general implementation
+            @nloops $N i A d->(j_d = sizeR_d==1 ? 1 : i_d) begin
+                @inbounds v = evaluate(f, (@nref $N A i))
+                @inbounds (@nref $N R j) = evaluate(op, (@nref $N R j), v)
             end
-            return R
         end
-        _F_
-    end  # @eval
-end
-
-let mapreducedim_fcache = (Int=>Function)[]
-global _mapreducedim!
-function _mapreducedim!(f, op, R::AbstractArray, A::AbstractArray)
-    isempty(R) && return R
-    ndimsA = ndims(A)
-    if !haskey(mapreducedim_fcache, ndimsA)
-        func! = mapreducedim!_func(ndimsA)
-        mapreducedim_fcache[ndimsA] = func!
-    else
-        func! = mapreducedim_fcache[ndimsA]
+        return R        
     end
-    func!(f, op, R, A)::typeof(R)
 end
-end # let mapreducedim_fcache
+eval(ngenerate(:N, :(typeof(R)), 
+    :(_mapreducedim!{T,N}(f, op, R::AbstractArray, A::AbstractArray{T,N})), mapreducedim_body))
 
 mapreducedim!(f, op, R::AbstractArray, A::AbstractArray) = _mapreducedim!(f, op, R, A)
 

From fe35eeee2f500f0f65ea0ee7fe526292f6b08157 Mon Sep 17 00:00:00 2001
From: Dahua Lin <lindahua@gmail.com>
Date: Wed, 4 Jun 2014 10:18:11 -0500
Subject: [PATCH 3/4] specialized methods to make the initialization for
 reducedim more efficient for common cases

---
 base/reducedim.jl | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/base/reducedim.jl b/base/reducedim.jl
index 0a9fe46067561..59ce195fe343f 100644
--- a/base/reducedim.jl
+++ b/base/reducedim.jl
@@ -95,6 +95,21 @@ reducedim_init{T}(f::Union(AbsFun,Abs2Fun), op::MaxFun, A::AbstractArray{T}, reg
 reducedim_init(f, op::AndFun, A::AbstractArray, region) = reducedim_initarray(A, region, true)
 reducedim_init(f, op::OrFun, A::AbstractArray, region) = reducedim_initarray(A, region, false)
 
+# Specialize initialization for common element types: the seed type is known up front,
+# so the method_exists probing done by the generic reducedim_init methods is skipped.
+typealias CommonReduceResult Union(Uint64,Uint128,Int64,Int128,Float32,Float64,Complex64,Complex128)
+# The seed is passed through `f` so that it has the type of the mapped values (abs of a complex is real).
+for (IT, RT) in ((:CommonReduceResult, :T), (:SmallSigned, :Int), (:SmallUnsigned, :Uint))
+    @eval begin
+        reducedim_init{T<:$IT}(f::Union(IdFun,AbsFun,Abs2Fun), op::AddFun, A::AbstractArray{T}, region) =
+            reducedim_initarray(A, region, evaluate(f, zero($RT)))
+        reducedim_init{T<:$IT}(f::Union(IdFun,AbsFun,Abs2Fun), op::MulFun, A::AbstractArray{T}, region) =
+            reducedim_initarray(A, region, evaluate(f, one($RT)))
+    end
+end
+reducedim_init(f::Union(IdFun,AbsFun,Abs2Fun), op::AddFun, A::AbstractArray{Bool}, region) = 
+    reducedim_initarray(A, region, 0)
+
 
 ## generic (map)reduction
 

From 99193d97a2e18f02f7f6212ad9ac9348b7959ad1 Mon Sep 17 00:00:00 2001
From: Dahua Lin <lindahua@gmail.com>
Date: Wed, 4 Jun 2014 11:38:03 -0500
Subject: [PATCH 4/4] use `@ngenerate` macro for mapreducedim!

---
 base/reducedim.jl | 62 ++++++++++++++++++++++-------------------------
 1 file changed, 29 insertions(+), 33 deletions(-)

diff --git a/base/reducedim.jl b/base/reducedim.jl
index 59ce195fe343f..bc9450fc5d560 100644
--- a/base/reducedim.jl
+++ b/base/reducedim.jl
@@ -147,43 +147,39 @@ function check_reducdims(R, A)
     return lsiz
 end
 
-function mapreducedim_body(N::Int)
-    quote
-        lsiz = check_reducdims(R, A)
-        isempty(A) && return R
-        @nextract $N sizeR d->size(R,d)
-        sizA1 = size(A, 1)
-
-        if has_fast_linear_indexing(A) && lsiz > 16
-            # use mapreduce_impl, which is probably better tuned to achieve higher performance
-            nslices = div(length(A), lsiz)
-            ibase = 0
-            for i = 1:nslices
-                @inbounds R[i] = mapreduce_impl(f, op, A, ibase+1, ibase+lsiz)
-                ibase += lsiz
-            end
-        elseif size(R, 1) == 1 && sizA1 > 1
-            # keep the accumulator as a local variable when reducing along the first dimension
-            @nloops $N i d->(d>1? (1:size(A,d)) : (1:1)) d->(j_d = sizeR_d==1 ? 1 : i_d) begin
-                @inbounds r = (@nref $N R j)
-                for i_1 = 1:sizA1
-                    @inbounds v = evaluate(f, (@nref $N A i))
-                    r = evaluate(op, r, v)
-                end
-                @inbounds (@nref $N R j) = r
-            end 
-        else
-            # general implementation
-            @nloops $N i A d->(j_d = sizeR_d==1 ? 1 : i_d) begin
-                @inbounds v = evaluate(f, (@nref $N A i))
-                @inbounds (@nref $N R j) = evaluate(op, (@nref $N R j), v)
+# Fold f(A[i]) into R with op, in place; dimensions where size(R, d) == 1 are the reduced ones.
+# R must already hold the seed values on entry; R is returned after the update.
+@ngenerate N typeof(R) function _mapreducedim!{T,N}(f, op, R::AbstractArray, A::AbstractArray{T,N})
+    lsiz = check_reducdims(R, A)
+    isempty(A) && return R
+    @nextract N sizeR d->size(R,d)
+    sizA1 = size(A, 1)
+    if has_fast_linear_indexing(A) && lsiz > 16
+        # Each output covers a run of lsiz consecutive inputs: reduce the run with mapreduce_impl,
+        # then fold it into the existing R[i] so that the seed / an init=false accumulator is kept.
+        for i = 1:div(length(A), lsiz)
+            ibase = (i-1)*lsiz
+            @inbounds R[i] = evaluate(op, R[i], mapreduce_impl(f, op, A, ibase+1, ibase+lsiz))
+        end
+    elseif size(R, 1) == 1 && sizA1 > 1
+        # keep the accumulator as a local variable when reducing along the first dimension
+        @nloops N i d->(d>1? (1:size(A,d)) : (1:1)) d->(j_d = sizeR_d==1 ? 1 : i_d) begin
+            @inbounds r = (@nref N R j)
+            for i_1 = 1:sizA1
+                @inbounds v = evaluate(f, (@nref N A i))
+                r = evaluate(op, r, v)
             end
+            @inbounds (@nref N R j) = r
+        end
+    else
+        # general implementation: update R element by element while walking A in storage order
+        @nloops N i A d->(j_d = sizeR_d==1 ? 1 : i_d) begin
+            @inbounds v = evaluate(f, (@nref N A i))
+            @inbounds (@nref N R j) = evaluate(op, (@nref N R j), v)
         end
-        return R        
     end
+    return R
 end
-eval(ngenerate(:N, :(typeof(R)), 
-    :(_mapreducedim!{T,N}(f, op, R::AbstractArray, A::AbstractArray{T,N})), mapreducedim_body))
 
 mapreducedim!(f, op, R::AbstractArray, A::AbstractArray) = _mapreducedim!(f, op, R, A)