From 0d9b3bd68405b9fb3606c498c46df74c4170cead Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= Date: Tue, 15 Oct 2024 09:12:38 +0200 Subject: [PATCH] generic: sycl: fix accessor types --- src/gpu/generic/sycl/binary_kernels.hpp | 4 ++-- src/gpu/generic/sycl/convolution_kernels.hpp | 8 ++++---- src/gpu/generic/sycl/eltwise_kernels.hpp | 4 ++-- src/gpu/generic/sycl/matmul_kernels.hpp | 14 +++++++------- src/gpu/generic/sycl/ref_pooling.hpp | 1 + src/gpu/generic/sycl/ref_resampling.hpp | 1 + src/gpu/generic/sycl/reorder_kernels.hpp | 4 ++-- src/gpu/generic/sycl/resampling_kernels.hpp | 4 ++-- src/gpu/generic/sycl/softmax_kernels.hpp | 4 ++-- src/gpu/generic/sycl/sycl_post_ops.hpp | 16 ++++++++-------- src/xpu/sycl/types.hpp | 8 ++++++++ 11 files changed, 39 insertions(+), 29 deletions(-) diff --git a/src/gpu/generic/sycl/binary_kernels.hpp b/src/gpu/generic/sycl/binary_kernels.hpp index cf731efcc47..1b64ba8eff2 100644 --- a/src/gpu/generic/sycl/binary_kernels.hpp +++ b/src/gpu/generic/sycl/binary_kernels.hpp @@ -39,7 +39,7 @@ struct binary_kernel_vec_t { : conf_(conf) , src0_(CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_SRC_0)) , src1_(CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_SRC_1)) - , dst_(CTX_OUT_SYCL_KERNEL_MEMORY(DNNL_ARG_DST)) + , dst_(CTX_INOUT_SYCL_KERNEL_MEMORY(DNNL_ARG_DST)) , src0_scale_(CTX_IN_SYCL_KERNEL_MEMORY( DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC_0)) , src1_scale_(CTX_IN_SYCL_KERNEL_MEMORY( @@ -200,7 +200,7 @@ struct binary_kernel_vec_t { xpu::sycl::in_memory_arg_t src0_; xpu::sycl::in_memory_arg_t src1_; - xpu::sycl::out_memory_arg_t dst_; + xpu::sycl::inout_memory_arg_t dst_; xpu::sycl::in_memory_arg_t src0_scale_; xpu::sycl::in_memory_arg_t src1_scale_; data_type_t scales_dt_; diff --git a/src/gpu/generic/sycl/convolution_kernels.hpp b/src/gpu/generic/sycl/convolution_kernels.hpp index 2f74d7914f1..507c576f349 100644 --- a/src/gpu/generic/sycl/convolution_kernels.hpp +++ b/src/gpu/generic/sycl/convolution_kernels.hpp @@ -39,7 +39,7 @@ struct convolution_kernel_fwd_t { , data_(CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_SRC_0)) , weights_(CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_WEIGHTS)) , bias_(CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_BIAS)) - , dst_(CTX_OUT_SYCL_KERNEL_MEMORY(DNNL_ARG_DST)) + , dst_(CTX_INOUT_SYCL_KERNEL_MEMORY(DNNL_ARG_DST)) , data_scale_(CTX_IN_SYCL_KERNEL_MEMORY( DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC_0)) , weights_scale_(CTX_IN_SYCL_KERNEL_MEMORY( @@ -244,7 +244,7 @@ struct convolution_kernel_fwd_t { xpu::sycl::in_memory_arg_t data_; xpu::sycl::in_memory_arg_t weights_; xpu::sycl::in_memory_arg_t bias_; - xpu::sycl::out_memory_arg_t dst_; + xpu::sycl::inout_memory_arg_t dst_; xpu::sycl::in_memory_arg_t data_scale_; xpu::sycl::in_memory_arg_t weights_scale_; xpu::sycl::in_memory_arg_t dst_scale_; @@ -262,7 +262,7 @@ struct convolution_kernel_bwd_data_t { convolution_kernel_bwd_data_t(const sycl_convolution_conf_t &conf, ::sycl::handler &cgh, const exec_ctx_t &ctx) : conf_(conf) - , diff_data_(CTX_OUT_SYCL_KERNEL_MEMORY(DNNL_ARG_DIFF_SRC)) + , diff_data_(CTX_INOUT_SYCL_KERNEL_MEMORY(DNNL_ARG_DIFF_SRC)) , weights_(CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_WEIGHTS)) , bias_(CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_BIAS)) , diff_dst_(CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_DIFF_DST)) @@ -473,7 +473,7 @@ struct convolution_kernel_bwd_data_t { sycl_convolution_conf_t conf_; - xpu::sycl::out_memory_arg_t diff_data_; + xpu::sycl::inout_memory_arg_t diff_data_; xpu::sycl::in_memory_arg_t weights_; xpu::sycl::in_memory_arg_t bias_; xpu::sycl::in_memory_arg_t diff_dst_; diff --git a/src/gpu/generic/sycl/eltwise_kernels.hpp b/src/gpu/generic/sycl/eltwise_kernels.hpp index 0f1151ff7b6..19af141e4fb 100644 --- a/src/gpu/generic/sycl/eltwise_kernels.hpp +++ b/src/gpu/generic/sycl/eltwise_kernels.hpp @@ -37,7 +37,7 @@ struct eltwise_fwd_kernel_vec_t { : conf_(conf) , src_(CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_SRC)) , po_args_(cgh, ctx) - , dst_(CTX_OUT_SYCL_KERNEL_MEMORY(DNNL_ARG_DST)) {} + , dst_(CTX_INOUT_SYCL_KERNEL_MEMORY(DNNL_ARG_DST)) {} void operator()(::sycl::nd_item<1> item) const { memory_tensor_t src_mem(src_, conf_.src_md); @@ -204,7 +204,7 @@ struct eltwise_fwd_kernel_vec_t { sycl_eltwise_conf_t conf_; xpu::sycl::in_memory_arg_t src_; post_op_input_args po_args_; - xpu::sycl::out_memory_arg_t dst_; + xpu::sycl::inout_memory_arg_t dst_; }; struct eltwise_bwd_kernel_vec_t { diff --git a/src/gpu/generic/sycl/matmul_kernels.hpp b/src/gpu/generic/sycl/matmul_kernels.hpp index c0d54b32b7e..0b3989056b7 100644 --- a/src/gpu/generic/sycl/matmul_kernels.hpp +++ b/src/gpu/generic/sycl/matmul_kernels.hpp @@ -83,7 +83,7 @@ struct matmul_kernel_fwd_t { } static void store_vec_helper( - out_memory_tensor_t &output, Vec data, int offset) { + inout_memory_tensor_t &output, Vec data, int offset) { data_type_t type = output.md().data_type(); char *offset_ptr = static_cast(output.ptr()) + data_type_size(type) * offset; @@ -189,7 +189,7 @@ struct matmul_kernel_fwd_t { } } - void store(out_memory_tensor_t &output, int offset, int row_stride) { + void store(inout_memory_tensor_t &output, int offset, int row_stride) { for (int row = 0; row < Rows; row++) { for (int col = 0; col < Cols / vec_len; col++) { store_vec_helper(output, data[row][col], @@ -198,8 +198,8 @@ struct matmul_kernel_fwd_t { } } - void store_edge(out_memory_tensor_t &output, int offset, int row_stride, - int rows, int cols) { + void store_edge(inout_memory_tensor_t &output, int offset, + int row_stride, int rows, int cols) { for (int row = 0; row < rows; row++) { int col; for (col = 0; col < cols / vec_len; col++) { @@ -215,7 +215,7 @@ struct matmul_kernel_fwd_t { } } - void store_generic(out_memory_tensor_t &output, int offset, + void store_generic(inout_memory_tensor_t &output, int offset, int row_stride, bool transpose, bool is_edge_block, int rows, int cols) { if (is_edge_block) { @@ -331,7 +331,7 @@ struct matmul_kernel_fwd_t { , data_(CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_SRC_0)) , weights_(CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_WEIGHTS)) , bias_(CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_BIAS)) - , dst_(CTX_OUT_SYCL_KERNEL_MEMORY(DNNL_ARG_DST)) + , dst_(CTX_INOUT_SYCL_KERNEL_MEMORY(DNNL_ARG_DST)) , data_scale_(CTX_IN_SYCL_KERNEL_MEMORY( DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC_0)) , data_scales_dt_((conf_.do_scale_data) @@ -618,7 +618,7 @@ struct matmul_kernel_fwd_t { xpu::sycl::in_memory_arg_t data_; xpu::sycl::in_memory_arg_t weights_; xpu::sycl::in_memory_arg_t bias_; - xpu::sycl::out_memory_arg_t dst_; + xpu::sycl::inout_memory_arg_t dst_; xpu::sycl::in_memory_arg_t data_scale_; data_type_t data_scales_dt_; xpu::sycl::in_memory_arg_t weights_scale_; diff --git a/src/gpu/generic/sycl/ref_pooling.hpp b/src/gpu/generic/sycl/ref_pooling.hpp index 692dfa589de..c5b670b700a 100644 --- a/src/gpu/generic/sycl/ref_pooling.hpp +++ b/src/gpu/generic/sycl/ref_pooling.hpp @@ -66,6 +66,7 @@ struct ref_pooling_fwd_t : public gpu::generic::sycl::primitive_t { src_md(0)->data_type != dst_md(0)->data_type, desc()->prop_kind == forward_inference)) && attr()->has_default_values(sm::post_ops) + && sycl_post_ops_t::post_ops_ok(attr(), true, false) && attr_.set_default_formats(dst_md(0)) == status::success && md_dims_in_range(src_md()); if (!ok) return status::unimplemented; diff --git a/src/gpu/generic/sycl/ref_resampling.hpp b/src/gpu/generic/sycl/ref_resampling.hpp index cc5a7c86605..b8921e053c9 100644 --- a/src/gpu/generic/sycl/ref_resampling.hpp +++ b/src/gpu/generic/sycl/ref_resampling.hpp @@ -50,6 +50,7 @@ struct ref_resampling_fwd_t : public gpu::generic::sycl::primitive_t { const bool ok = is_fwd() && is_supported_type(src_md(0)->data_type) && is_supported_type(dst_md(0)->data_type) && attr()->has_default_values(sm::post_ops) + && sycl_post_ops_t::post_ops_ok(attr()) && set_default_params() == status::success && attr_.set_default_formats(dst_md(0)) == status::success && (src_md(0)->format_desc.blocking.inner_nblks == 0) diff --git a/src/gpu/generic/sycl/reorder_kernels.hpp b/src/gpu/generic/sycl/reorder_kernels.hpp index 22181408b7b..4f6e08773fc 100644 --- a/src/gpu/generic/sycl/reorder_kernels.hpp +++ b/src/gpu/generic/sycl/reorder_kernels.hpp @@ -38,7 +38,7 @@ struct reorder_kernel_t { const exec_ctx_t &ctx) : conf_(conf) , src_(CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_SRC_0)) - , dst_(CTX_OUT_SYCL_KERNEL_MEMORY(DNNL_ARG_DST)) + , dst_(CTX_INOUT_SYCL_KERNEL_MEMORY(DNNL_ARG_DST)) , src_scale_(CTX_IN_SYCL_KERNEL_MEMORY( DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC_0)) , dst_scale_(CTX_IN_SYCL_KERNEL_MEMORY( @@ -153,7 +153,7 @@ struct reorder_kernel_t { sycl_reorder_conf_t conf_; xpu::sycl::in_memory_arg_t src_; - xpu::sycl::out_memory_arg_t dst_; + xpu::sycl::inout_memory_arg_t dst_; xpu::sycl::in_memory_arg_t src_scale_; xpu::sycl::in_memory_arg_t dst_scale_; data_type_t scales_src_dt_; diff --git a/src/gpu/generic/sycl/resampling_kernels.hpp b/src/gpu/generic/sycl/resampling_kernels.hpp index ad5838320b0..9a8454402d3 100644 --- a/src/gpu/generic/sycl/resampling_kernels.hpp +++ b/src/gpu/generic/sycl/resampling_kernels.hpp @@ -38,7 +38,7 @@ struct resampling_kernel_fwd_vec_t { ::sycl::handler &cgh, const exec_ctx_t &ctx) : conf_(conf) , src_(CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_SRC)) - , dst_(CTX_OUT_SYCL_KERNEL_MEMORY(DNNL_ARG_DST)) + , dst_(CTX_INOUT_SYCL_KERNEL_MEMORY(DNNL_ARG_DST)) , po_args_(cgh, ctx) {} void operator()(::sycl::nd_item<1> item) const { @@ -142,7 +142,7 @@ struct resampling_kernel_fwd_vec_t { sycl_resampling_conf_t conf_; xpu::sycl::in_memory_arg_t src_; - xpu::sycl::out_memory_arg_t dst_; + xpu::sycl::inout_memory_arg_t dst_; post_op_input_args po_args_; }; diff --git a/src/gpu/generic/sycl/softmax_kernels.hpp b/src/gpu/generic/sycl/softmax_kernels.hpp index 6fcbd4b205d..2421e6dcb56 100644 --- a/src/gpu/generic/sycl/softmax_kernels.hpp +++ b/src/gpu/generic/sycl/softmax_kernels.hpp @@ -41,7 +41,7 @@ struct softmax_fwd_kernel_vec_t { DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC)) , scale_dst_(CTX_IN_SYCL_KERNEL_MEMORY( DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST)) - , dst_(CTX_OUT_SYCL_KERNEL_MEMORY(DNNL_ARG_DST)) + , dst_(CTX_INOUT_SYCL_KERNEL_MEMORY(DNNL_ARG_DST)) , po_args_(cgh, ctx) {} void operator()(::sycl::nd_item<1> item) const { @@ -150,7 +150,7 @@ struct softmax_fwd_kernel_vec_t { xpu::sycl::in_memory_arg_t src_; xpu::sycl::in_memory_arg_t scale_src_; xpu::sycl::in_memory_arg_t scale_dst_; - xpu::sycl::out_memory_arg_t dst_; + xpu::sycl::inout_memory_arg_t dst_; post_op_input_args po_args_; }; diff --git a/src/gpu/generic/sycl/sycl_post_ops.hpp b/src/gpu/generic/sycl/sycl_post_ops.hpp index 2de1d1d1588..34254c801b6 100644 --- a/src/gpu/generic/sycl/sycl_post_ops.hpp +++ b/src/gpu/generic/sycl/sycl_post_ops.hpp @@ -174,7 +174,7 @@ struct ref_sum_op_t { ref_sum_op_t(float scale, float zeropoint) : scale_(scale), zeropoint_(zeropoint) {} - float load_and_compute(float acc, const xpu::sycl::out_memory_arg_t &dst, + float load_and_compute(float acc, const xpu::sycl::inout_memory_arg_t &dst, dnnl::impl::data_type_t sum_dt_, dim_t offset) const { // TODO dims32_t memory_plain_t dst_mem(dst, sum_dt_); @@ -265,14 +265,14 @@ struct sycl_post_ops_t { n_post_ops_ = attr_po.len(); } - inline float apply(float acc, const xpu::sycl::out_memory_arg_t &dst, + inline float apply(float acc, const xpu::sycl::inout_memory_arg_t &dst, dim_t dst_offset, const post_op_input_args &po_args, dims_t src_offset) const; inline float apply(float acc, float dst, const post_op_input_args &po_args, dims_t src_offset) const; inline float apply(float acc, const post_op_input_args &po_args, dims_t src_offset) const; - inline float apply(float acc, const xpu::sycl::out_memory_arg_t &dst, + inline float apply(float acc, const xpu::sycl::inout_memory_arg_t &dst, dim_t dst_offset) const; inline int get_post_op() const { return n_post_ops_; } @@ -311,9 +311,9 @@ struct post_op_input_args { xpu::sycl::in_memory_arg_t args_[sycl_post_ops_t::max_post_ops]; }; -float sycl_post_ops_t::apply(float acc, const xpu::sycl::out_memory_arg_t &dst, - dim_t dst_offset, const post_op_input_args &po_args, - dims_t src_offset) const { +float sycl_post_ops_t::apply(float acc, + const xpu::sycl::inout_memory_arg_t &dst, dim_t dst_offset, + const post_op_input_args &po_args, dims_t src_offset) const { using namespace primitive_kind; for (auto i = 0; i < n_post_ops_; ++i) { @@ -368,8 +368,8 @@ float sycl_post_ops_t::apply( return acc; } -float sycl_post_ops_t::apply(float acc, const xpu::sycl::out_memory_arg_t &dst, - dim_t dst_offset) const { +float sycl_post_ops_t::apply(float acc, + const xpu::sycl::inout_memory_arg_t &dst, dim_t dst_offset) const { using namespace primitive_kind; for (auto i = 0; i < n_post_ops_; ++i) { diff --git a/src/xpu/sycl/types.hpp b/src/xpu/sycl/types.hpp index 2852de1ecfc..a87bfe25221 100644 --- a/src/xpu/sycl/types.hpp +++ b/src/xpu/sycl/types.hpp @@ -48,6 +48,14 @@ namespace sycl { &CTX_OUT_STORAGE(arg)) \ ->get_out_memory_arg(ctx.stream(), cgh) +#define CTX_INOUT_SYCL_KERNEL_MEMORY(arg) \ + CTX_OUT_STORAGE(arg).is_null() \ + ? xpu::sycl::memory_storage_base_t::empty_inout_memory_arg( \ + ctx.stream(), cgh) \ + : utils::downcast( \ + &CTX_OUT_STORAGE(arg)) \ + ->get_inout_memory_arg(ctx.stream(), cgh) + #define CHECK_SYCL_KERNEL_ARG_TYPE(type) \ static_assert(::sycl::is_device_copyable_v)