diff --git a/src/gpu/nvidia/cudnn_convolution.cpp b/src/gpu/nvidia/cudnn_convolution.cpp
index 8aa5be577ea..0bbebc76c57 100644
--- a/src/gpu/nvidia/cudnn_convolution.cpp
+++ b/src/gpu/nvidia/cudnn_convolution.cpp
@@ -54,11 +54,8 @@ status_t cudnn_convolution_fwd_t::execute_convolution(
         if (pd()->use_temp_dst()) {
             memory_storage_t *temp_dst_mem = scratch_storage.get();
-            memory_storage_t *temp_reorder_mem = scratch_storage_2.get();
             temp_dst = xpu::sycl::interop_memory_arg_t<
                     ::sycl::access::mode::read_write>(temp_dst_mem, cgh);
-            temp_reorder = xpu::sycl::interop_memory_arg_t<
-                    ::sycl::access::mode::read_write>(temp_reorder_mem, cgh);
         }
 
         xpu::sycl::interop_memory_arg_t<::sycl::access::mode::read_write>
@@ -85,7 +82,6 @@ status_t cudnn_convolution_fwd_t::execute_convolution(
             args.push_back(arg_scratch.get_native_pointer(ih));
             args.push_back(arg_filter_scratch.get_native_pointer(ih));
             args.push_back(temp_dst.get_native_pointer(ih));
-            args.push_back(temp_reorder.get_native_pointer(ih));
             args.push_back(arg_src_scale.get_native_pointer(ih));
             args.push_back(arg_wei_scale.get_native_pointer(ih));
             args.push_back(arg_dst_scale.get_native_pointer(ih));
diff --git a/src/gpu/nvidia/cudnn_convolution.hpp b/src/gpu/nvidia/cudnn_convolution.hpp
index 01f4f2f7fbe..329081c4633 100644
--- a/src/gpu/nvidia/cudnn_convolution.hpp
+++ b/src/gpu/nvidia/cudnn_convolution.hpp
@@ -176,10 +176,6 @@ struct cudnn_convolution_fwd_t : public gpu::primitive_t {
             CHECK(sycl_engine->create_memory_storage(
                     &scratch_ptr, memory_flags_t::alloc, wrap.size(), nullptr));
             scratch_storage.reset(scratch_ptr);
-
-            CHECK(sycl_engine->create_memory_storage(
-                    &scratch_ptr, memory_flags_t::alloc, wrap.size(), nullptr));
-            scratch_storage_2.reset(scratch_ptr);
         }
         if (impl && impl->use_scales_dst()) {
             CHECK(sycl_engine->create_memory_storage(
diff --git a/src/gpu/nvidia/cudnn_convolution_impl.hpp b/src/gpu/nvidia/cudnn_convolution_impl.hpp
index c4d91d61206..ebb81db6b16 100644
--- a/src/gpu/nvidia/cudnn_convolution_impl.hpp
+++ b/src/gpu/nvidia/cudnn_convolution_impl.hpp
@@ -511,14 +511,14 @@ struct cudnn_convolution_impl_fwd_t : public cudnn_convolution_impl_base_t {
         const float beta = 0.0f;
         if (flip_formats) {
             CUDNN_EXECUTE_FUNC_V(cudnnTransformTensor, handle, &alpha,
-                    reorder_dst_desc, src, &beta, descs[y], dst);
+                    reorder_dst_desc, src, &beta, y_fp32_desc, dst);
         } else {
-            CUDNN_EXECUTE_FUNC_V(cudnnTransformTensor, handle, &alpha, descs[y],
-                    src, &beta, reorder_dst_desc, dst);
+            CUDNN_EXECUTE_FUNC_V(cudnnTransformTensor, handle, &alpha,
+                    y_fp32_desc, src, &beta, reorder_dst_desc, dst);
         }
     }
 
-    void execute_f32_sum(cudnnHandle_t handle, void *y, void *y_fp32_data,
+    void execute_f32_dst_sum(cudnnHandle_t handle, void *y, void *y_fp32_data,
             float alpha_, float beta_) const {
         float alpha1 = 0.0f;
         float alpha2 = alpha_;
@@ -528,6 +528,14 @@ struct cudnn_convolution_impl_fwd_t : public cudnn_convolution_impl_base_t {
                 y_fp32_data);
     }
 
+    void execute_f32_src_sum(cudnnHandle_t handle, void *x, void *y,
+            float alpha_, float beta_) const {
+        float alpha = alpha_;
+        float beta = beta_;
+        CUDNN_EXECUTE_FUNC_V(cudnnAddTensor, handle, &alpha, descs[io::y], x,
+                &beta, y_fp32_desc, y);
+    }
+
     void execute_eltwise(cudnnHandle_t handle, void *src, void *dst) const {
         float alpha = 1.0f;
         float beta = 0.0f;
@@ -551,8 +559,7 @@ struct cudnn_convolution_impl_fwd_t : public cudnn_convolution_impl_base_t {
             const std::vector<void *> &args) const override {
         auto x = args[0], weights = args[1], y = args[2], bias = args[3],
              scratchpad = args[4], post_op_scratch = args[6],
-             post_op_reorder = args[7], src_scale = args[8],
-             wei_scale = args[9], dst_scale = args[10];
+             src_scale = args[7], wei_scale = args[8], dst_scale = args[9];
         void *output = use_temp_dst_ ? post_op_scratch : y;
         if (using_transformed_filter()) {
             auto w_scratch = args[5];
@@ -561,7 +568,7 @@ struct cudnn_convolution_impl_fwd_t : public cudnn_convolution_impl_base_t {
         }
 
         float *y_fp32_data = nullptr;
-        if (y_f32_is_required()) { y_fp32_data = (float *)args[11]; }
+        if (y_f32_is_required()) { y_fp32_data = (float *)args[10]; }
 
         bool fused = conv_bias || conv_bias_eltwise;
 
@@ -581,7 +588,8 @@ struct cudnn_convolution_impl_fwd_t : public cudnn_convolution_impl_base_t {
             }
         }
 
-        auto &y_desc = y_f32_is_required() ? y_fp32_desc : descs[io::y];
+        auto &y_desc = (y_f32_is_required() || use_temp_dst_) ? y_fp32_desc
+                                                              : descs[io::y];
         void *y_data = y_f32_is_required() ? y_fp32_data : output;
 
         if (fused) {
@@ -619,12 +627,11 @@ struct cudnn_convolution_impl_fwd_t : public cudnn_convolution_impl_base_t {
             switch (post_ops[i]) {
                 case dnnl_sum:
                     if (need_reorder) {
-                        execute_reorder(handle, y, post_op_reorder, true);
-                        execute_sum(handle, post_op_reorder, post_op_scratch,
-                                sum_scale, 1.0f);
+                        execute_f32_src_sum(
+                                handle, y, post_op_scratch, sum_scale, 1.0f);
                     } else if (last_op) {
                         if (y_f32_is_required()) {
-                            execute_f32_sum(
+                            execute_f32_dst_sum(
                                     handle, y, y_fp32_data, 1.0f, sum_scale);
                         } else {
                             execute_sum(handle, post_op_scratch, y, 1.0f,
@@ -687,7 +694,7 @@ struct cudnn_convolution_impl_fwd_t : public cudnn_convolution_impl_base_t {
                 // The scratchpad size will need to be modified in
                 // cases where the dst_scaling is used and the output
                 // uses s8 values.
-                if (use_scales_dst_) {
+                if (use_scales_dst_ || use_temp_dst_) {
                     CHECK(create_and_set_tensor_descriptor(&y_fp32_desc,
                             CUDNN_DATA_FLOAT, ndims[y], dims[y], strides[y]));
                     CHECK(CUDNN_EXECUTE_FUNC_S(cudnnGetConvolutionForwardWorkspaceSize,
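For reference, below is a minimal standalone sketch (not oneDNN code) of the cudnnAddTensor semantics that the new execute_f32_src_sum relies on: cudnnAddTensor computes C = alpha * A + beta * C, so a single call can accumulate the scaled sum post-op source directly into the f32 temporary, which is what lets the patch drop the separate reorder step and the second scratch buffer. The tensor shape, values, and sum_scale here are illustrative only; in the patch the A side uses the convolution dst descriptor and the C side uses y_fp32_desc.

#include <cstdio>
#include <vector>
#include <cuda_runtime.h>
#include <cudnn.h>

#define CHECK_CUDNN(call) \
    do { \
        cudnnStatus_t s_ = (call); \
        if (s_ != CUDNN_STATUS_SUCCESS) { \
            std::printf("cuDNN error: %s\n", cudnnGetErrorString(s_)); \
            return 1; \
        } \
    } while (0)

int main() {
    const int n = 1, c = 2, h = 2, w = 2;
    const size_t nelems = size_t(n) * c * h * w;

    cudnnHandle_t handle;
    CHECK_CUDNN(cudnnCreate(&handle));

    // Both tensors are plain f32 NCHW in this sketch.
    cudnnTensorDescriptor_t desc;
    CHECK_CUDNN(cudnnCreateTensorDescriptor(&desc));
    CHECK_CUDNN(cudnnSetTensor4dDescriptor(
            desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, c, h, w));

    std::vector<float> host_x(nelems, 2.0f), host_y(nelems, 3.0f);
    float *x = nullptr, *y = nullptr;
    cudaMalloc(&x, nelems * sizeof(float));
    cudaMalloc(&y, nelems * sizeof(float));
    cudaMemcpy(x, host_x.data(), nelems * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(y, host_y.data(), nelems * sizeof(float), cudaMemcpyHostToDevice);

    // y = alpha * x + beta * y, i.e. the sum post-op folded into one call:
    // sum_scale * original_dst accumulated onto the convolution result.
    float alpha = 0.5f /* stand-in for sum_scale */, beta = 1.0f;
    CHECK_CUDNN(cudnnAddTensor(handle, &alpha, desc, x, &beta, desc, y));

    cudaMemcpy(host_y.data(), y, nelems * sizeof(float), cudaMemcpyDeviceToHost);
    std::printf("y[0] = %f (expect 0.5*2 + 1*3 = 4)\n", host_y[0]);

    cudaFree(x);
    cudaFree(y);
    cudnnDestroyTensorDescriptor(desc);
    cudnnDestroy(handle);
    return 0;
}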