From 7486ed83f72fa60ad84f5cbf549c4128ea8de8be Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= <tadej.ciglaric@codeplay.com>
Date: Wed, 9 Oct 2024 14:29:19 +0200
Subject: [PATCH] gpu: nvidia: convolution: bugfix sum post op with int8

---
 src/gpu/nvidia/cudnn_convolution.cpp      |  4 ---
 src/gpu/nvidia/cudnn_convolution.hpp      |  4 ---
 src/gpu/nvidia/cudnn_convolution_impl.hpp | 33 ++++++++++++++---------
 3 files changed, 20 insertions(+), 21 deletions(-)

diff --git a/src/gpu/nvidia/cudnn_convolution.cpp b/src/gpu/nvidia/cudnn_convolution.cpp
index 8aa5be577ea..0bbebc76c57 100644
--- a/src/gpu/nvidia/cudnn_convolution.cpp
+++ b/src/gpu/nvidia/cudnn_convolution.cpp
@@ -54,11 +54,8 @@ status_t cudnn_convolution_fwd_t::execute_convolution(
 
         if (pd()->use_temp_dst()) {
             memory_storage_t *temp_dst_mem = scratch_storage.get();
-            memory_storage_t *temp_reorder_mem = scratch_storage_2.get();
             temp_dst = xpu::sycl::interop_memory_arg_t<
                     ::sycl::access::mode::read_write>(temp_dst_mem, cgh);
-            temp_reorder = xpu::sycl::interop_memory_arg_t<
-                    ::sycl::access::mode::read_write>(temp_reorder_mem, cgh);
         }
 
         xpu::sycl::interop_memory_arg_t<::sycl::access::mode::read_write>
@@ -85,7 +82,6 @@ status_t cudnn_convolution_fwd_t::execute_convolution(
             args.push_back(arg_scratch.get_native_pointer(ih));
             args.push_back(arg_filter_scratch.get_native_pointer(ih));
             args.push_back(temp_dst.get_native_pointer(ih));
-            args.push_back(temp_reorder.get_native_pointer(ih));
             args.push_back(arg_src_scale.get_native_pointer(ih));
             args.push_back(arg_wei_scale.get_native_pointer(ih));
             args.push_back(arg_dst_scale.get_native_pointer(ih));
diff --git a/src/gpu/nvidia/cudnn_convolution.hpp b/src/gpu/nvidia/cudnn_convolution.hpp
index 01f4f2f7fbe..329081c4633 100644
--- a/src/gpu/nvidia/cudnn_convolution.hpp
+++ b/src/gpu/nvidia/cudnn_convolution.hpp
@@ -176,10 +176,6 @@ struct cudnn_convolution_fwd_t : public gpu::primitive_t {
             CHECK(sycl_engine->create_memory_storage(
                     &scratch_ptr, memory_flags_t::alloc, wrap.size(), nullptr));
             scratch_storage.reset(scratch_ptr);
-
-            CHECK(sycl_engine->create_memory_storage(
-                    &scratch_ptr, memory_flags_t::alloc, wrap.size(), nullptr));
-            scratch_storage_2.reset(scratch_ptr);
         }
         if (impl && impl->use_scales_dst()) {
             CHECK(sycl_engine->create_memory_storage(
diff --git a/src/gpu/nvidia/cudnn_convolution_impl.hpp b/src/gpu/nvidia/cudnn_convolution_impl.hpp
index c4d91d61206..ebb81db6b16 100644
--- a/src/gpu/nvidia/cudnn_convolution_impl.hpp
+++ b/src/gpu/nvidia/cudnn_convolution_impl.hpp
@@ -511,14 +511,14 @@ struct cudnn_convolution_impl_fwd_t : public cudnn_convolution_impl_base_t {
         const float beta = 0.0f;
         if (flip_formats) {
             CUDNN_EXECUTE_FUNC_V(cudnnTransformTensor, handle, &alpha,
-                    reorder_dst_desc, src, &beta, descs[y], dst);
+                    reorder_dst_desc, src, &beta, y_fp32_desc, dst);
         } else {
-            CUDNN_EXECUTE_FUNC_V(cudnnTransformTensor, handle, &alpha, descs[y],
-                    src, &beta, reorder_dst_desc, dst);
+            CUDNN_EXECUTE_FUNC_V(cudnnTransformTensor, handle, &alpha,
+                    y_fp32_desc, src, &beta, reorder_dst_desc, dst);
         }
     }
 
-    void execute_f32_sum(cudnnHandle_t handle, void *y, void *y_fp32_data,
+    void execute_f32_dst_sum(cudnnHandle_t handle, void *y, void *y_fp32_data,
             float alpha_, float beta_) const {
         float alpha1 = 0.0f;
         float alpha2 = alpha_;
@@ -528,6 +528,14 @@ struct cudnn_convolution_impl_fwd_t : public cudnn_convolution_impl_base_t {
                 y_fp32_data);
     }
 
+    void execute_f32_src_sum(cudnnHandle_t handle, void *x, void *y,
+            float alpha_, float beta_) const {
+        float alpha = alpha_;
+        float beta = beta_;
+        CUDNN_EXECUTE_FUNC_V(cudnnAddTensor, handle, &alpha, descs[io::y], x,
+                &beta, y_fp32_desc, y);
+    }
+
     void execute_eltwise(cudnnHandle_t handle, void *src, void *dst) const {
         float alpha = 1.0f;
         float beta = 0.0f;
@@ -551,8 +559,7 @@ struct cudnn_convolution_impl_fwd_t : public cudnn_convolution_impl_base_t {
             const std::vector<void *> &args) const override {
         auto x = args[0], weights = args[1], y = args[2], bias = args[3],
              scratchpad = args[4], post_op_scratch = args[6],
-             post_op_reorder = args[7], src_scale = args[8],
-             wei_scale = args[9], dst_scale = args[10];
+             src_scale = args[7], wei_scale = args[8], dst_scale = args[9];
         void *output = use_temp_dst_ ? post_op_scratch : y;
         if (using_transformed_filter()) {
             auto w_scratch = args[5];
@@ -561,7 +568,7 @@ struct cudnn_convolution_impl_fwd_t : public cudnn_convolution_impl_base_t {
         }
 
         float *y_fp32_data = nullptr;
-        if (y_f32_is_required()) { y_fp32_data = (float *)args[11]; }
+        if (y_f32_is_required()) { y_fp32_data = (float *)args[10]; }
 
         bool fused = conv_bias || conv_bias_eltwise;
 
@@ -581,7 +588,8 @@ struct cudnn_convolution_impl_fwd_t : public cudnn_convolution_impl_base_t {
             }
         }
 
-        auto &y_desc = y_f32_is_required() ? y_fp32_desc : descs[io::y];
+        auto &y_desc = (y_f32_is_required() || use_temp_dst_) ? y_fp32_desc
+                                                              : descs[io::y];
         void *y_data = y_f32_is_required() ? y_fp32_data : output;
 
         if (fused) {
@@ -619,12 +627,11 @@ struct cudnn_convolution_impl_fwd_t : public cudnn_convolution_impl_base_t {
             switch (post_ops[i]) {
                 case dnnl_sum:
                     if (need_reorder) {
-                        execute_reorder(handle, y, post_op_reorder, true);
-                        execute_sum(handle, post_op_reorder, post_op_scratch,
-                                sum_scale, 1.0f);
+                        execute_f32_src_sum(
+                                handle, y, post_op_scratch, sum_scale, 1.0f);
                     } else if (last_op) {
                         if (y_f32_is_required()) {
-                            execute_f32_sum(
+                            execute_f32_dst_sum(
                                     handle, y, y_fp32_data, 1.0f, sum_scale);
                         } else {
                             execute_sum(handle, post_op_scratch, y, 1.0f,
@@ -687,7 +694,7 @@ struct cudnn_convolution_impl_fwd_t : public cudnn_convolution_impl_base_t {
         // The scratchpad size will need to be modified in
         // cases where the dst_scaling is used and the output
         // uses s8 values.
-        if (use_scales_dst_) {
+        if (use_scales_dst_ || use_temp_dst_) {
             CHECK(create_and_set_tensor_descriptor(&y_fp32_desc,
                     CUDNN_DATA_FLOAT, ndims[y], dims[y], strides[y]));
             CHECK(CUDNN_EXECUTE_FUNC_S(cudnnGetConvolutionForwardWorkspaceSize,