From 9446d4c22a8f95b767f8f5c0b2b88c071ac225b7 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Tue, 19 Jun 2018 11:47:27 -0700 Subject: [PATCH 1/4] Add tests that fail due to issue 11241 --- tests/python/gpu/test_operator_gpu.py | 41 +++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index 9e9cc608b299..1f39fd12c040 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -532,6 +532,47 @@ def test_convolution_options(): sym_no_cudnn = mx.sym.Convolution(num_filter=3, kernel=(1,1,1), pad=(0,0,0), cudnn_off=True, name='conv') check_consistency_NxM([sym, sym_no_cudnn], ctx_list) +# This test is designed to expose an issue with cudnn v7.1.4 algo find() when invoked with large c. +# Algos returned by find() can fail to run with grad_req='add' (wgrad kernel beta parameter == 1.0f). +@with_seed() +def test_convolution_large_c(): + def test_with_1D_size(width, grad_req): + ctx_list = [{'ctx': mx.gpu(0), 'conv_data': (1, 65536, width), 'type_dict': {'conv_data': np.float32}}, + {'ctx': mx.gpu(0), 'conv_data': (1, 65536, width), 'type_dict': {'conv_data': np.float64}}] + sym = mx.sym.Convolution(layout='NCW', num_filter=8, kernel=(2,), name='conv') + # The convolution accumulates many values, so set a large tolerance. + tol = {np.dtype(np.float32): 1, + np.dtype(np.float64): 1} + check_consistency([sym, sym], ctx_list, tol=tol, grad_req='add') + + # Run with different data tensor shapes to run cudnnFind() multiple times. + # First, populate algo and op caches with models that use cudnnFind() exclusively. + for trial in range(0,6): + test_with_1D_size(2**(trial+1), 'write') + # Then create symbols that must avoid cached cudnnFind() results in some cases. + for trial in range(0,6): + test_with_1D_size(2**(trial+1), 'add') + +# This test is designed to expose an issue with cudnn v7.1.4 algo find() when invoked with large c. +# Algos returned by find() can fail to run with grad_req='add' (wgrad kernel beta parameter == 1.0f). +@with_seed() +def test_deconvolution_large_c(): + def test_with_1D_size(width, grad_req): + ctx_list = [{'ctx': mx.gpu(0), 'deconv_data': (1, 8, width), 'type_dict': {'deconv_data': np.float32}}, + {'ctx': mx.gpu(0), 'deconv_data': (1, 8, width), 'type_dict': {'deconv_data': np.float64}}] + sym = mx.sym.Deconvolution(layout='NCW', num_filter=65536, kernel=(2,), name='deconv') + # The convolution accumulates many values, so set a large tolerance. + tol = {np.dtype(np.float32): 1, + np.dtype(np.float64): 1} + check_consistency([sym, sym], ctx_list, tol=tol, grad_req=grad_req) + + # Run with different data tensor shapes to run cudnnFind() multiple times. + # First, populate algo and op caches with models that use cudnnFind() exclusively. + for trial in range(0,6): + test_with_1D_size(2**(trial+1), 'write') + # Then create symbols that must avoid cached cudnnFind() results in some cases. + for trial in range(0,6): + test_with_1D_size(2**(trial+1), 'add') @with_seed() def test_convolution_versions(): From ca60250ce9df0bb839821ff5d4a5a553ce2bbb39 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Tue, 19 Jun 2018 12:28:32 -0700 Subject: [PATCH 2/4] Fix #11241 Conv1D throws CUDNN_STATUS_EXECUTION_FAILED --- src/operator/nn/convolution.cu | 20 ++++++++++++----- src/operator/nn/cudnn/cudnn_algoreg-inl.h | 11 +++++++--- src/operator/nn/cudnn/cudnn_convolution-inl.h | 22 +++++++++++++------ .../nn/cudnn/cudnn_deconvolution-inl.h | 20 ++++++++++++----- src/operator/nn/deconvolution.cu | 20 ++++++++++++----- src/operator/operator_common.h | 2 +- 6 files changed, 66 insertions(+), 29 deletions(-) diff --git a/src/operator/nn/convolution.cu b/src/operator/nn/convolution.cu index 797557e35df9..daccc5518efc 100644 --- a/src/operator/nn/convolution.cu +++ b/src/operator/nn/convolution.cu @@ -41,7 +41,8 @@ static CuDNNConvolutionOp& GetCuDNNConvOp(const ConvolutionParam& param, int backward_compute_type, const std::vector& in_shape, const std::vector& out_shape, - const RunContext& rctx) { + const RunContext& rctx, + bool add_to_weight) { #if DMLC_CXX11_THREAD_LOCAL static thread_local std::unordered_map >, @@ -57,14 +58,18 @@ static CuDNNConvolutionOp& GetCuDNNConvOp(const ConvolutionParam& param, ndim += s.ndim(); for (auto &s : out_shape) ndim += s.ndim(); - key.Reserve(1 /* for forward_compute_type */ + 1 /* for backward_compute_type */ - + ndim + 1 /* for dev_id */); + key.Reserve(1 /* for forward_compute_type */ + + 1 /* for backward_compute_type */ + + ndim /* for in and out shapes */ + + 1 /* for dev_id */ + + 1 /* for add_to_weight */); key.AddSign(forward_compute_type); key.AddSign(backward_compute_type); key.AddSign(in_shape); key.AddSign(out_shape); key.AddSign(rctx.ctx.dev_id); + key.AddSign(add_to_weight ? 1 : 0); auto it = ops.find(key); if (it == ops.end()) { @@ -74,7 +79,7 @@ static CuDNNConvolutionOp& GetCuDNNConvOp(const ConvolutionParam& param, CHECK(ins_ret.second); it = ins_ret.first; it->second->Init(param, forward_compute_type, backward_compute_type, in_shape, - out_shape, rctx); + out_shape, rctx, add_to_weight); } return *it->second; } @@ -141,8 +146,10 @@ void ConvolutionCompute(const nnvm::NodeAttrs& attrs, std::vector out_shape(1, outputs[0].shape_); for (size_t i = 0; i < in_shape.size(); i++) in_shape[i] = inputs[i].shape_; + // req[conv::kWeight] is only set for backward, so assume the typical 'write' for now. + auto add_to_weight = false; CuDNNConvolutionOp &op = GetCuDNNConvOp(param, - compute_type, compute_type, in_shape, out_shape, ctx.run_ctx); + compute_type, compute_type, in_shape, out_shape, ctx.run_ctx, add_to_weight); op.Forward(ctx, inputs, req, outputs); } }) @@ -220,8 +227,9 @@ void ConvolutionGradCompute(const nnvm::NodeAttrs& attrs, std::vector out_shape(1, out_grad.shape_); for (size_t i = 0; i < in_shape.size(); i++) in_shape[i] = in_data[i].shape_; + auto add_to_weight = req[conv::kWeight] == kAddTo; CuDNNConvolutionOp &op = GetCuDNNConvOp(param, - compute_type, compute_type, in_shape, out_shape, ctx.run_ctx); + compute_type, compute_type, in_shape, out_shape, ctx.run_ctx, add_to_weight); op.Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); } }) diff --git a/src/operator/nn/cudnn/cudnn_algoreg-inl.h b/src/operator/nn/cudnn/cudnn_algoreg-inl.h index e029c837bd4d..3b59fd1c3ced 100644 --- a/src/operator/nn/cudnn/cudnn_algoreg-inl.h +++ b/src/operator/nn/cudnn/cudnn_algoreg-inl.h @@ -72,12 +72,13 @@ class CuDNNAlgoReg { cudnnDataType_t cudnn_forward_compute_type, cudnnDataType_t cudnn_backward_compute_type, int sm_arch, + bool add_to_weight, CuDNNAlgo *fwd, CuDNNAlgo *bwd, CuDNNAlgo *flt) { CHECK(in_shape.size() == 2 || in_shape.size() == 3); ParamKey key{param, in_shape[0], in_shape[1], out_shape[0], cudnn_data_type, - cudnn_forward_compute_type, cudnn_backward_compute_type, sm_arch}; + cudnn_forward_compute_type, cudnn_backward_compute_type, sm_arch, add_to_weight}; std::lock_guard guard(lock_); auto i = reg_.find(key); if (i != reg_.end()) { @@ -96,12 +97,13 @@ class CuDNNAlgoReg { cudnnDataType_t cudnn_forward_compute_type, cudnnDataType_t cudnn_backward_compute_type, int sm_arch, + bool add_to_weight, const CuDNNAlgo &fwd, const CuDNNAlgo &bwd, const CuDNNAlgo &flt) { CHECK(in_shape.size() == 2 || in_shape.size() == 3); ParamKey key{param, in_shape[0], in_shape[1], out_shape[0], cudnn_data_type, - cudnn_forward_compute_type, cudnn_backward_compute_type, sm_arch}; + cudnn_forward_compute_type, cudnn_backward_compute_type, sm_arch, add_to_weight}; std::lock_guard guard(lock_); if (param.cudnn_tune.value() && reg_.size() % 50 == 0) { LOG(INFO) << "Running performance tests to find the best convolution " @@ -140,6 +142,7 @@ class CuDNNAlgoReg { cudnnDataType_t cudnn_forward_compute_type; cudnnDataType_t cudnn_backward_compute_type; int sm_arch; + bool add_to_weight; bool operator==(const ParamKey& other) const { return this->param == other.param && @@ -149,7 +152,8 @@ class CuDNNAlgoReg { this->cudnn_data_type == other.cudnn_data_type && this->cudnn_forward_compute_type == other.cudnn_forward_compute_type && this->cudnn_backward_compute_type == other.cudnn_backward_compute_type && - this->sm_arch == other.sm_arch; + this->sm_arch == other.sm_arch && + this->add_to_weight == other.add_to_weight; } }; @@ -164,6 +168,7 @@ class CuDNNAlgoReg { ret = dmlc::HashCombine(ret, static_cast(key.cudnn_forward_compute_type)); ret = dmlc::HashCombine(ret, static_cast(key.cudnn_backward_compute_type)); ret = dmlc::HashCombine(ret, key.sm_arch); + ret = dmlc::HashCombine(ret, key.add_to_weight); return ret; } }; diff --git a/src/operator/nn/cudnn/cudnn_convolution-inl.h b/src/operator/nn/cudnn/cudnn_convolution-inl.h index 4b1cbbe7057b..c5fdf3d42b81 100644 --- a/src/operator/nn/cudnn/cudnn_convolution-inl.h +++ b/src/operator/nn/cudnn/cudnn_convolution-inl.h @@ -59,9 +59,11 @@ class CuDNNConvolutionOp { int backward_compute_type, const std::vector& in_shape, const std::vector& out_shape, - const RunContext& rctx) { + const RunContext& rctx, + bool add_to_weight) { using namespace mshadow; this->param_ = param; + this->add_to_weight_ = add_to_weight; InitBufferForParam(); auto cudnn_forward_compute_type = convertToCuDNNDataType(forward_compute_type); auto cudnn_backward_compute_type = convertToCuDNNDataType(backward_compute_type); @@ -247,6 +249,7 @@ class CuDNNConvolutionOp { gbias.dptr_)); } if (req[conv::kWeight] != kNullOp) { + CHECK_EQ(add_to_weight_, req[conv::kWeight] == kAddTo); CUDNN_CALL(cudnnConvolutionBackwardFilter(s->dnn_handle_, &alpha, in_desc_, @@ -610,8 +613,8 @@ class CuDNNConvolutionOp { cudnnDataType_t cudnn_backward_compute_type) { if (!CuDNNConvAlgoReg::Get()->Find(param_, in_shape, out_shape, dtype_, cudnn_forward_compute_type, cudnn_backward_compute_type, - SMArch(rctx.ctx.dev_id), &forward_algo_, &back_algo_, - &back_algo_w_)) { + SMArch(rctx.ctx.dev_id), add_to_weight_, + &forward_algo_, &back_algo_, &back_algo_w_)) { mshadow::Stream *s = rctx.get_stream(); CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream::OwnHandle); size_t workspace_byte = static_cast(param_.workspace * sizeof(DType)); @@ -645,9 +648,11 @@ class CuDNNConvolutionOp { auto max_bwd_filt_algos = MaxBackwardFilterAlgos(s->dnn_handle_); std::vector bwd_filt_results(max_bwd_filt_algos); int actual_bwd_filter_algos = 0; - auto bwd_filter_algo_discoverer = - param_.cudnn_tune.value() == conv::kOff ? cudnnGetConvolutionBackwardFilterAlgorithm_v7 - : cudnnFindConvolutionBackwardFilterAlgorithm; + // In cudnn v7.1.4, find() returned wgrad algos that could fail for large c if we + // were summing into the output (i.e. beta != 0). Get() returned OK algos though. + auto bwd_filter_algo_discoverer = (add_to_weight_ || + param_.cudnn_tune.value() == conv::kOff) ? cudnnGetConvolutionBackwardFilterAlgorithm_v7 + : cudnnFindConvolutionBackwardFilterAlgorithm; CUDNN_CALL((*bwd_filter_algo_discoverer)(s->dnn_handle_, in_desc_, out_desc_, @@ -799,7 +804,8 @@ class CuDNNConvolutionOp { CuDNNConvAlgoReg::Get()->Register(param_, in_shape, out_shape, dtype_, cudnn_forward_compute_type, cudnn_backward_compute_type, - SMArch(rctx.ctx.dev_id), this->forward_algo_, + SMArch(rctx.ctx.dev_id), this->add_to_weight_, + this->forward_algo_, this->back_algo_, this->back_algo_w_); } // If we're allowing Tensor Core variants of the algos to be considered in @@ -953,6 +959,8 @@ class CuDNNConvolutionOp { cudnnTensorFormat_t format_; // Allow TensorCore algo policy bool cudnn_tensor_core_; + // Is req[kWeight] == conv::kAddTo ? + bool add_to_weight_; ConvolutionParam param_; }; #endif // __CUDACC__ && CUDNN diff --git a/src/operator/nn/cudnn/cudnn_deconvolution-inl.h b/src/operator/nn/cudnn/cudnn_deconvolution-inl.h index cb0de4c961bf..85fe604b0817 100644 --- a/src/operator/nn/cudnn/cudnn_deconvolution-inl.h +++ b/src/operator/nn/cudnn/cudnn_deconvolution-inl.h @@ -56,9 +56,11 @@ class CuDNNDeconvolutionOp { int backward_compute_type, const std::vector& in_shape, const std::vector& out_shape, - const RunContext& rctx) { + const RunContext& rctx, + bool add_to_weight) { using namespace mshadow; this->param_ = param; + this->add_to_weight_ = add_to_weight; InitBufferForParam(); auto cudnn_forward_compute_type = convertToCuDNNDataType(forward_compute_type); auto cudnn_backward_compute_type = convertToCuDNNDataType(backward_compute_type); @@ -257,6 +259,7 @@ class CuDNNDeconvolutionOp { filter_desc_, gwmat.dptr_ + weight_offset_ * g)); #elif CUDNN_MAJOR >= 5 + CHECK_EQ(add_to_weight_, req[deconv::kWeight] == kAddTo); CUDNN_CALL(cudnnConvolutionBackwardFilter( s->dnn_handle_, &alpha, @@ -543,8 +546,8 @@ class CuDNNDeconvolutionOp { if (!CuDNNDeconvAlgoReg::Get()->Find(param_, in_shape, out_shape, dtype_, cudnn_forward_compute_type, cudnn_backward_compute_type, - SMArch(rctx.ctx.dev_id), &forward_algo_, - &back_algo_, &back_algo_w_)) { + SMArch(rctx.ctx.dev_id), add_to_weight_, + &forward_algo_, &back_algo_, &back_algo_w_)) { mshadow::Stream *s = rctx.get_stream(); CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream::OwnHandle); size_t workspace_byte = static_cast(param_.workspace * sizeof(DType)); @@ -578,8 +581,10 @@ class CuDNNDeconvolutionOp { auto max_bwd_filt_algos = MaxBackwardFilterAlgos(s->dnn_handle_); std::vector bwd_filt_results(max_bwd_filt_algos); int actual_bwd_filter_algos = 0; - auto bwd_filter_algo_discoverer = - param_.cudnn_tune.value() == conv::kOff ? cudnnGetConvolutionBackwardFilterAlgorithm_v7 + // In cudnn v7.1.4, find() returned wgrad algos that could fail for large c if we + // were summing into the output (i.e. beta != 0). Get() returned OK algos though. + auto bwd_filter_algo_discoverer = (add_to_weight_ || + param_.cudnn_tune.value() == conv::kOff) ? cudnnGetConvolutionBackwardFilterAlgorithm_v7 : cudnnFindConvolutionBackwardFilterAlgorithm; CUDNN_CALL((*bwd_filter_algo_discoverer)(s->dnn_handle_, out_desc_, @@ -735,7 +740,8 @@ class CuDNNDeconvolutionOp { CuDNNDeconvAlgoReg::Get()->Register(param_, in_shape, out_shape, dtype_, cudnn_forward_compute_type, cudnn_backward_compute_type, - SMArch(rctx.ctx.dev_id), this->forward_algo_, + SMArch(rctx.ctx.dev_id), this->add_to_weight_, + this->forward_algo_, this->back_algo_, this->back_algo_w_); } // If we're allowing Tensor Core variants of the algos to be considered in @@ -912,6 +918,8 @@ class CuDNNDeconvolutionOp { cudnnTensorFormat_t format_; // Allow TensorCore algo policy bool cudnn_tensor_core_; + // Is req[kWeight] == deconv::kAddTo ? + bool add_to_weight_; DeconvolutionParam param_; }; #endif // CUDNN diff --git a/src/operator/nn/deconvolution.cu b/src/operator/nn/deconvolution.cu index cdfb606900bf..1c3970b9e716 100644 --- a/src/operator/nn/deconvolution.cu +++ b/src/operator/nn/deconvolution.cu @@ -39,7 +39,8 @@ static CuDNNDeconvolutionOp &GetCuDNNDeconvOp(const DeconvolutionParam& p int backward_compute_type, const std::vector& in_shape, const std::vector& out_shape, - const RunContext& rctx) { + const RunContext& rctx, + bool add_to_weight) { #if DMLC_CXX11_THREAD_LOCAL static thread_local std::unordered_map >, @@ -55,14 +56,18 @@ static CuDNNDeconvolutionOp &GetCuDNNDeconvOp(const DeconvolutionParam& p ndim += s.ndim(); for (auto &s : out_shape) ndim += s.ndim(); - key.Reserve(1 /* for forward_compute_type */ + 1 /* for backward_compute_type */ - + ndim + 1 /* for dev_id */); + key.Reserve(1 /* for forward_compute_type */ + + 1 /* for backward_compute_type */ + + ndim /* for in and out shapes */ + + 1 /* for dev_id */ + + 1 /* for add_to_weight */); key.AddSign(forward_compute_type); key.AddSign(backward_compute_type); key.AddSign(in_shape); key.AddSign(out_shape); key.AddSign(rctx.ctx.dev_id); + key.AddSign(add_to_weight ? 1 : 0); auto it = ops.find(key); if (it == ops.end()) { @@ -72,7 +77,7 @@ static CuDNNDeconvolutionOp &GetCuDNNDeconvOp(const DeconvolutionParam& p CHECK(ins_ret.second); it = ins_ret.first; it->second->Init(param, forward_compute_type, backward_compute_type, in_shape, - out_shape, rctx); + out_shape, rctx, add_to_weight); } return *it->second; } @@ -109,8 +114,10 @@ void DeconvolutionCompute(const nnvm::NodeAttrs& attrs, for (size_t i = 0; i < in_shape.size(); i++) { in_shape[i] = inputs[i].shape_; } + // req[deconv::kWeight] is only set for backward, so assume the typical 'write' for now. + auto add_to_weight = false; GetCuDNNDeconvOp(param, compute_type, compute_type, - in_shape, out_shape, ctx.run_ctx).Forward(ctx, inputs, req, outputs); + in_shape, out_shape, ctx.run_ctx, add_to_weight).Forward(ctx, inputs, req, outputs); } }) #else @@ -156,8 +163,9 @@ void DeconvolutionGradCompute(const nnvm::NodeAttrs& attrs, for (size_t i = 0; i < in_shape.size(); i++) { in_shape[i] = in_data[i].shape_; } + auto add_to_weight = req[deconv::kWeight] == kAddTo; GetCuDNNDeconvOp(param, compute_type, compute_type, - in_shape, out_shape, ctx.run_ctx).Backward(ctx, + in_shape, out_shape, ctx.run_ctx, add_to_weight).Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); } }) diff --git a/src/operator/operator_common.h b/src/operator/operator_common.h index 02130eb32e51..29112939a22f 100644 --- a/src/operator/operator_common.h +++ b/src/operator/operator_common.h @@ -494,7 +494,7 @@ inline void LogUnimplementedOp(const nnvm::NodeAttrs& attrs, } class OpSignature { - std::vector eles; + std::vector eles; uint64_t hash; public: From 99563ec2ea85a5e8b8c1dc02784965a18dd361ed Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Thu, 21 Jun 2018 17:29:19 -0700 Subject: [PATCH 3/4] Force algo 1 when grad_req==add with large c. Expand tests. --- src/operator/nn/cudnn/cudnn_convolution-inl.h | 26 +++++++- .../nn/cudnn/cudnn_deconvolution-inl.h | 26 +++++++- tests/python/gpu/test_operator_gpu.py | 60 ++++++++++++++----- 3 files changed, 91 insertions(+), 21 deletions(-) diff --git a/src/operator/nn/cudnn/cudnn_convolution-inl.h b/src/operator/nn/cudnn/cudnn_convolution-inl.h index c5fdf3d42b81..827c89faad1f 100644 --- a/src/operator/nn/cudnn/cudnn_convolution-inl.h +++ b/src/operator/nn/cudnn/cudnn_convolution-inl.h @@ -650,9 +650,9 @@ class CuDNNConvolutionOp { int actual_bwd_filter_algos = 0; // In cudnn v7.1.4, find() returned wgrad algos that could fail for large c if we // were summing into the output (i.e. beta != 0). Get() returned OK algos though. - auto bwd_filter_algo_discoverer = (add_to_weight_ || - param_.cudnn_tune.value() == conv::kOff) ? cudnnGetConvolutionBackwardFilterAlgorithm_v7 - : cudnnFindConvolutionBackwardFilterAlgorithm; + auto bwd_filter_algo_discoverer = + param_.cudnn_tune.value() == conv::kOff ? cudnnGetConvolutionBackwardFilterAlgorithm_v7 + : cudnnFindConvolutionBackwardFilterAlgorithm; CUDNN_CALL((*bwd_filter_algo_discoverer)(s->dnn_handle_, in_desc_, out_desc_, @@ -797,6 +797,13 @@ class CuDNNConvolutionOp { } } #endif // CUDNN_MAJOR < 7 + + // Fix for issue #11241 + int cudnn_find_issue_max_features = 64 * 1024; + if (add_to_weight_ && Features(in_shape[conv::kData]) >= cudnn_find_issue_max_features) { + this->back_algo_w_.Set(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1, true); + } + // An algo specification by the user may be cached here, but another // convolution will match only if identically specified. // We're caching results of *Get* as well as *Find*, but these records @@ -927,6 +934,19 @@ class CuDNNConvolutionOp { return tensor.MSize() * sizeof(DType); } + // Given a tensor shape of this operation, return the number of features 'c' + int64_t Features(const TShape &dshape) { + int c = 0; + switch (dshape.ndim()) { + case 3: c = ConvertLayout(dshape.get<3>(), param_.layout.value(), kNCW)[1]; break; + case 4: c = ConvertLayout(dshape.get<4>(), param_.layout.value(), kNCHW)[1]; break; + case 5: c = ConvertLayout(dshape.get<5>(), param_.layout.value(), kNCDHW)[1]; break; + default: + LOG(FATAL) << "Unexpected convolution data dimension " << dshape.ndim(); + } + return c; + } + std::vector param_stride_; std::vector param_dilate_; std::vector param_pad_; diff --git a/src/operator/nn/cudnn/cudnn_deconvolution-inl.h b/src/operator/nn/cudnn/cudnn_deconvolution-inl.h index 85fe604b0817..f1b40cce27f1 100644 --- a/src/operator/nn/cudnn/cudnn_deconvolution-inl.h +++ b/src/operator/nn/cudnn/cudnn_deconvolution-inl.h @@ -583,8 +583,8 @@ class CuDNNDeconvolutionOp { int actual_bwd_filter_algos = 0; // In cudnn v7.1.4, find() returned wgrad algos that could fail for large c if we // were summing into the output (i.e. beta != 0). Get() returned OK algos though. - auto bwd_filter_algo_discoverer = (add_to_weight_ || - param_.cudnn_tune.value() == conv::kOff) ? cudnnGetConvolutionBackwardFilterAlgorithm_v7 + auto bwd_filter_algo_discoverer = + param_.cudnn_tune.value() == conv::kOff ? cudnnGetConvolutionBackwardFilterAlgorithm_v7 : cudnnFindConvolutionBackwardFilterAlgorithm; CUDNN_CALL((*bwd_filter_algo_discoverer)(s->dnn_handle_, out_desc_, @@ -733,6 +733,14 @@ class CuDNNDeconvolutionOp { } } #endif // CUDNN_MAJOR < 7 + + // Fix for issue #11241 + int cudnn_find_issue_max_features = 64 * 1024; + // With deconvolution, the algo sensitivity is to a large number of output features + if (add_to_weight_ && Features(out_shape[deconv::kOut]) >= cudnn_find_issue_max_features) { + this->back_algo_w_.Set(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1, true); + } + // An algo specification by the user may be cached here, but another // convolution will match only if identically specified. // We're caching results of *Get* as well as *Find*, but these records @@ -872,6 +880,20 @@ class CuDNNDeconvolutionOp { return tensor.MSize() * sizeof(DType); } + + // Given a tensor shape of this operation, return the number of features 'c' + int64_t Features(const TShape &dshape) { + int c = 0; + switch (dshape.ndim()) { + case 3: c = ConvertLayout(dshape.get<3>(), param_.layout.value(), kNCW)[1]; break; + case 4: c = ConvertLayout(dshape.get<4>(), param_.layout.value(), kNCHW)[1]; break; + case 5: c = ConvertLayout(dshape.get<5>(), param_.layout.value(), kNCDHW)[1]; break; + default: + LOG(FATAL) << "Unexpected deconvolution data dimension " << dshape.ndim(); + } + return c; + } + std::vector param_stride_; std::vector param_dilate_; diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index 1f39fd12c040..11aacdf72bc7 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -536,43 +536,71 @@ def test_convolution_options(): # Algos returned by find() can fail to run with grad_req='add' (wgrad kernel beta parameter == 1.0f). @with_seed() def test_convolution_large_c(): - def test_with_1D_size(width, grad_req): - ctx_list = [{'ctx': mx.gpu(0), 'conv_data': (1, 65536, width), 'type_dict': {'conv_data': np.float32}}, - {'ctx': mx.gpu(0), 'conv_data': (1, 65536, width), 'type_dict': {'conv_data': np.float64}}] + problematic_c = 64 * 1024 + # The convolution accumulates many values, so set large tolerances. + tol = {np.dtype(np.float32): 1, + np.dtype(np.float64): 1} + def test_1D_with_width(width, grad_req): + ctx_list = [{'ctx': mx.gpu(0), 'conv_data': (1, problematic_c, width), 'type_dict': {'conv_data': np.float32}}, + {'ctx': mx.gpu(0), 'conv_data': (1, problematic_c, width), 'type_dict': {'conv_data': np.float64}}] sym = mx.sym.Convolution(layout='NCW', num_filter=8, kernel=(2,), name='conv') - # The convolution accumulates many values, so set a large tolerance. - tol = {np.dtype(np.float32): 1, - np.dtype(np.float64): 1} - check_consistency([sym, sym], ctx_list, tol=tol, grad_req='add') + check_consistency([sym, sym], ctx_list, tol=tol, grad_req=grad_req) + + def test_2D_with_width(width, grad_req): + ctx_list = [{'ctx': mx.gpu(0), 'conv_data': (1, problematic_c, 2, width), 'type_dict': {'conv_data': np.float32}}, + {'ctx': mx.gpu(0), 'conv_data': (1, problematic_c, 2, width), 'type_dict': {'conv_data': np.float64}}] + sym = mx.sym.Convolution(layout='NCHW', num_filter=4, kernel=(2,2), name='conv') + check_consistency([sym, sym], ctx_list, tol=tol, grad_req=grad_req) # Run with different data tensor shapes to run cudnnFind() multiple times. # First, populate algo and op caches with models that use cudnnFind() exclusively. for trial in range(0,6): - test_with_1D_size(2**(trial+1), 'write') + test_1D_with_width(2**(trial+1), 'write') # Then create symbols that must avoid cached cudnnFind() results in some cases. for trial in range(0,6): - test_with_1D_size(2**(trial+1), 'add') + test_1D_with_width(2**(trial+1), 'add') + #2D + for trial in range(0,6): + test_2D_with_width(2**(trial+1), 'write') + # Then create symbols that must avoid cached cudnnFind() results in some cases. + for trial in range(0,6): + test_2D_with_width(2**(trial+1), 'add') + # This test is designed to expose an issue with cudnn v7.1.4 algo find() when invoked with large c. # Algos returned by find() can fail to run with grad_req='add' (wgrad kernel beta parameter == 1.0f). @with_seed() def test_deconvolution_large_c(): - def test_with_1D_size(width, grad_req): + problematic_c = 64 * 1024 + # The deconvolution accumulates many values, so set large tolerances. + tol = {np.dtype(np.float32): 1, + np.dtype(np.float64): 1} + def test_1D_with_width(width, grad_req): ctx_list = [{'ctx': mx.gpu(0), 'deconv_data': (1, 8, width), 'type_dict': {'deconv_data': np.float32}}, {'ctx': mx.gpu(0), 'deconv_data': (1, 8, width), 'type_dict': {'deconv_data': np.float64}}] - sym = mx.sym.Deconvolution(layout='NCW', num_filter=65536, kernel=(2,), name='deconv') - # The convolution accumulates many values, so set a large tolerance. - tol = {np.dtype(np.float32): 1, - np.dtype(np.float64): 1} + sym = mx.sym.Deconvolution(layout='NCW', num_filter=problematic_c, kernel=(2,), name='deconv') + check_consistency([sym, sym], ctx_list, tol=tol, grad_req=grad_req) + + def test_2D_with_width(width, grad_req): + ctx_list = [{'ctx': mx.gpu(0), 'deconv_data': (1, 8, 2, width), 'type_dict': {'deconv_data': np.float32}}, + {'ctx': mx.gpu(0), 'deconv_data': (1, 8, 2, width), 'type_dict': {'deconv_data': np.float64}}] + sym = mx.sym.Deconvolution(layout='NCHW', num_filter=problematic_c, kernel=(2,2), name='deconv') check_consistency([sym, sym], ctx_list, tol=tol, grad_req=grad_req) # Run with different data tensor shapes to run cudnnFind() multiple times. # First, populate algo and op caches with models that use cudnnFind() exclusively. for trial in range(0,6): - test_with_1D_size(2**(trial+1), 'write') + test_1D_with_width(2**(trial+1), 'write') + # Then create symbols that must avoid cached cudnnFind() results in some cases. + for trial in range(0,6): + test_1D_with_width(2**(trial+1), 'add') + #2D + for trial in range(0,6): + test_2D_with_width(2**(trial+1), 'write') # Then create symbols that must avoid cached cudnnFind() results in some cases. for trial in range(0,6): - test_with_1D_size(2**(trial+1), 'add') + test_2D_with_width(2**(trial+1), 'add') + @with_seed() def test_convolution_versions(): From 7dfe9e8ce410dbd0564461bb71b6455d63ebc997 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Mon, 16 Jul 2018 14:35:22 -0700 Subject: [PATCH 4/4] Shorten test runtimes. --- tests/python/gpu/test_operator_gpu.py | 38 ++++++++++----------------- 1 file changed, 14 insertions(+), 24 deletions(-) diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index 11aacdf72bc7..acf9710e66cf 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -553,18 +553,13 @@ def test_2D_with_width(width, grad_req): check_consistency([sym, sym], ctx_list, tol=tol, grad_req=grad_req) # Run with different data tensor shapes to run cudnnFind() multiple times. - # First, populate algo and op caches with models that use cudnnFind() exclusively. - for trial in range(0,6): - test_1D_with_width(2**(trial+1), 'write') - # Then create symbols that must avoid cached cudnnFind() results in some cases. - for trial in range(0,6): - test_1D_with_width(2**(trial+1), 'add') - #2D - for trial in range(0,6): - test_2D_with_width(2**(trial+1), 'write') - # Then create symbols that must avoid cached cudnnFind() results in some cases. - for trial in range(0,6): - test_2D_with_width(2**(trial+1), 'add') + # First, populate algo and op caches with models that always use cudnnFind() (req == 'write'). + # Then run models that must avoid cached cudnnFind() results in some cases (req == 'add'). + widths = [4, 16, 64] + for req in ['write', 'add']: + for width in widths: + test_1D_with_width(width, req) + test_2D_with_width(width, req) # This test is designed to expose an issue with cudnn v7.1.4 algo find() when invoked with large c. @@ -588,18 +583,13 @@ def test_2D_with_width(width, grad_req): check_consistency([sym, sym], ctx_list, tol=tol, grad_req=grad_req) # Run with different data tensor shapes to run cudnnFind() multiple times. - # First, populate algo and op caches with models that use cudnnFind() exclusively. - for trial in range(0,6): - test_1D_with_width(2**(trial+1), 'write') - # Then create symbols that must avoid cached cudnnFind() results in some cases. - for trial in range(0,6): - test_1D_with_width(2**(trial+1), 'add') - #2D - for trial in range(0,6): - test_2D_with_width(2**(trial+1), 'write') - # Then create symbols that must avoid cached cudnnFind() results in some cases. - for trial in range(0,6): - test_2D_with_width(2**(trial+1), 'add') + # First, populate algo and op caches with models that always use cudnnFind() (req == 'write'). + # Then run models that must avoid cached cudnnFind() results in some cases (req == 'add'). + widths = [4, 16, 64] + for req in ['write', 'add']: + for width in widths: + test_1D_with_width(width, req) + test_2D_with_width(width, req) @with_seed()