diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 092e613616d..313523197e4 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -57,7 +57,6 @@ if(DNNL_SYCL_CUDA) ${CMAKE_CURRENT_SOURCE_DIR}/primitives/binary.cpp ${CMAKE_CURRENT_SOURCE_DIR}/primitives/lstm.cpp ${CMAKE_CURRENT_SOURCE_DIR}/primitives/layer_normalization.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/primitives/prelu.cpp ${CMAKE_CURRENT_SOURCE_DIR}/primitives/reorder.cpp ${CMAKE_CURRENT_SOURCE_DIR}/primitives/shuffle.cpp) endif() diff --git a/src/gpu/nvidia/README.md b/src/gpu/nvidia/README.md index 6e9f0234708..828e61680df 100644 --- a/src/gpu/nvidia/README.md +++ b/src/gpu/nvidia/README.md @@ -276,6 +276,11 @@ backward propagation respectively. * Supported data type are `f32`, `f16`, `bf16` and `s8`. +### PReLU +The PReLU primitive (Leaky ReLU with a trainable alpha parameter) is implemented +using SYCL kernels. The primitive supports both forward and backward +propagations for the data types f32, s32, bf16, f16, s8 and u8. + ### Reorder The `cudnnTransform` function is the equivalent of oneDNN reorder function. diff --git a/src/gpu/nvidia/sycl_cuda_engine.cpp b/src/gpu/nvidia/sycl_cuda_engine.cpp index 125d7952874..f06acc5cd88 100644 --- a/src/gpu/nvidia/sycl_cuda_engine.cpp +++ b/src/gpu/nvidia/sycl_cuda_engine.cpp @@ -37,8 +37,8 @@ #include "gpu/nvidia/sycl_cuda_engine.hpp" #include "gpu/nvidia/sycl_cuda_scoped_context.hpp" #include "gpu/nvidia/sycl_cuda_stream.hpp" - #include "gpu/sycl/ref_binary.hpp" +#include "gpu/sycl/ref_prelu.hpp" namespace dnnl { namespace impl { @@ -215,6 +215,10 @@ constexpr dnnl::impl::impl_list_item_t sycl_cuda_impl_list[] = { INSTANCE(cudnn_batch_normalization_fwd_t) INSTANCE(cudnn_batch_normalization_bwd_t) + // PReLU + INSTANCE(sycl::ref_prelu_fwd_t) + INSTANCE(sycl::ref_prelu_bwd_t) + // Pooling INSTANCE(cudnn_pooling_fwd_t) INSTANCE(cudnn_pooling_bwd_t) diff --git a/src/gpu/sycl/prelu_kernels.hpp b/src/gpu/sycl/prelu_kernels.hpp new file mode 100644 index 00000000000..f03658f637a --- /dev/null +++ b/src/gpu/sycl/prelu_kernels.hpp @@ -0,0 +1,556 @@ +/******************************************************************************* +* Copyright 2023 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#ifndef GPU_SYCL_PRELU_KERNELS_HPP +#define GPU_SYCL_PRELU_KERNELS_HPP + +#include "common/c_types_map.hpp" +#include "common/dnnl_thread.hpp" +#include "common/dnnl_traits.hpp" +#include "common/math_utils.hpp" +#include "common/memory_storage.hpp" +#include "common/utils.hpp" + +#include "gpu/sycl/sycl_io_helper.hpp" +#include "gpu/sycl/sycl_math_utils.hpp" +#include "gpu/sycl/sycl_post_ops.hpp" +#include "gpu/sycl/sycl_primitive_conf.hpp" +#include "gpu/sycl/sycl_q10n.hpp" +#include "gpu/sycl/sycl_types.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace sycl { + +static constexpr int max_supported_ndims = 5; + +struct prelu_fwd_kernel_vec_t { + static constexpr int vec_len = 8; + + prelu_fwd_kernel_vec_t(const sycl_prelu_conf_t &conf, + sycl_in_memory_arg_t &data, sycl_in_memory_arg_t &weights, + sycl_out_memory_arg_t &dst) + : conf_(conf), data_(data), weights_(weights), dst_(dst) {} + + void operator()(::sycl::nd_item<1> item) const { + + size_t ithr = item.get_group(0) * conf_.wg_size + item.get_local_id(); + + const int mask = conf_.mask; + const dim_t work_amount = conf_.work_amount; + size_t nthr = conf_.n_thr; + if ((dim_t)ithr >= work_amount) return; + dim_t start {0}, end {0}; + dims_t dims_d, off; + + for (int j = 0; j < max_supported_ndims; j++) { + off[j] = 0; + dims_d[j] = (data_md().dims()[j] != 0) ? data_md().dims()[j] : 1; + } + + balance211(work_amount, nthr, ithr, start, end); + if (conf_.ndims == 1) { + utils::nd_iterator_init(start, off[0], dims_d[0]); + } else if (conf_.ndims == 2) { + utils::nd_iterator_init( + start, off[0], dims_d[0], off[1], dims_d[1]); + + } else if (conf_.ndims == 3) { + utils::nd_iterator_init(start, off[0], dims_d[0], off[1], dims_d[1], + off[2], dims_d[2]); + } else if (conf_.ndims == 4) { + utils::nd_iterator_init(start, off[0], dims_d[0], off[1], dims_d[1], + off[2], dims_d[2], off[3], dims_d[3]); + } else if (conf_.ndims == 5) { + utils::nd_iterator_init(start, off[0], dims_d[0], off[1], dims_d[1], + off[2], dims_d[2], off[3], dims_d[3], off[4], dims_d[4]); + } + + for (dim_t iwork = start; iwork < end; ++iwork) { + dim_t data_off = offset(data_md(), off); + ; + dim_t weight_off = weights_offset(mask, weights_md(), off); + auto src_val = load_float_value( + data_md().data_type(), data_ptr(), data_off); + auto weights_val = load_float_value( + weights_md().data_type(), weights_ptr(), weight_off); + auto res = math::relu_fwd(src_val, weights_val); + store_float_value(data_md().data_type(), res, dst_ptr(), data_off); + if (conf_.ndims == 1) { + utils::nd_iterator_step(off[0], dims_d[0]); + } + if (conf_.ndims == 2) { + utils::nd_iterator_step(off[0], dims_d[0], off[1], dims_d[1]); + } + if (conf_.ndims == 3) { + utils::nd_iterator_step(off[0], dims_d[0], off[1], dims_d[1], + off[2], dims_d[2]); + } + if (conf_.ndims == 4) { + utils::nd_iterator_step(off[0], dims_d[0], off[1], dims_d[1], + off[2], dims_d[2], off[3], dims_d[3]); + } + if (conf_.ndims == 5) { + utils::nd_iterator_step(off[0], dims_d[0], off[1], dims_d[1], + off[2], dims_d[2], off[3], dims_d[3], off[4], + dims_d[4]); + } + } + } + +private: + const sycl_md_t &data_md() const { return conf_.data_md; } + const sycl_md_t &weights_md() const { return conf_.weights_md; } + const sycl_md_t &dst_md() const { return conf_.dst_md; } + + void *data_ptr() const { return data_.get_pointer(); } + void *weights_ptr() const { return weights_.get_pointer(); } + void *dst_ptr() const { 
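+        // Type-erased pointers into the kernel memory arguments; element data types are resolved via the sycl_md_t descriptors at load/store time.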
return dst_.get_pointer(); } + + static dim_t offset(const sycl_md_t &mem, dims_t dims) { + const int ndims = mem.ndims(); + switch (ndims) { + case 1: return mem.off(dims[0]); + case 2: return mem.off(dims[0], dims[1]); + case 3: return mem.off(dims[0], dims[1], dims[2]); + case 4: return mem.off(dims[0], dims[1], dims[2], dims[3]); + case 5: return mem.off(dims[0], dims[1], dims[2], dims[3], dims[4]); + default: assert(!"Unsupported ndims count"); + } + return -1; + } + + static dim_t weights_offset( + const int mask, const sycl_md_t &mem, dims_t &dims) { + dims_t dims_w {}; + std::copy(dims, dims + max_supported_ndims, dims_w); + utils::apply_mask_on_dims(dims_w, mem.ndims(), mask); + return offset(mem, dims_w); + } + + sycl_prelu_conf_t conf_; + sycl_in_memory_arg_t data_; + sycl_in_memory_arg_t weights_; + sycl_out_memory_arg_t dst_; +}; + +struct prelu_bwd_kernel_vec_t { + static constexpr int vec_len = 8; + + prelu_bwd_kernel_vec_t(const sycl_prelu_conf_t &conf, + sycl_in_memory_arg_t &data, sycl_out_memory_arg_t &diff_data, + sycl_in_memory_arg_t &weights, sycl_out_memory_arg_t &diff_weights, + sycl_in_memory_arg_t &diff_dst, sycl_out_memory_arg_t &scratchpad) + : conf_(conf) + , data_(data) + , diff_data_(diff_data) + , weights_(weights) + , diff_weights_(diff_weights) + , diff_dst_(diff_dst) + , scratchpad_(scratchpad) {} + + void operator()(::sycl::nd_item<1> item) const { + size_t ithr = item.get_group(0) * conf_.wg_size + item.get_local_id(); + switch (conf_.bcast_type) { + case broadcasting_strategy_t::scalar: + calculate_scalar(data_ptr(), weights_ptr(), diff_weights_ptr(), + diff_dst_ptr(), diff_data_ptr(), ithr); + reduce_scalar(diff_weights_ptr(), ithr); + break; + case broadcasting_strategy_t::no_broadcast: + calculate_no_broadcast(data_ptr(), weights_ptr(), + diff_weights_ptr(), diff_dst_ptr(), diff_data_ptr(), + ithr); + break; + case broadcasting_strategy_t::per_oc: + case broadcasting_strategy_t::per_oc_spatial: + case broadcasting_strategy_t::per_mb_spatial: + case broadcasting_strategy_t::per_mb_w: + case broadcasting_strategy_t::per_w: + case broadcasting_strategy_t::shared_axes: + calculate_shared_axes(data_ptr(), weights_ptr(), + diff_weights_ptr(), diff_dst_ptr(), diff_data_ptr(), + ithr, item); + break; + default: assert(!"unsupported broadcast type"); + } + } + + float ker(const float *src, const float *weights, const float *diff_dst, + float *diff_src, dim_t data_off, dim_t weight_off) const { + + float src_val + = load_float_value(data_md().data_type(), data_ptr(), data_off); + float diff_dst_val = load_float_value( + diff_dst_md().data_type(), diff_dst_ptr(), data_off); + float weights_val = load_float_value( + weights_md().data_type(), weights_ptr(), weight_off); + + float diff_src_res = ::dnnl::impl::math::relu_bwd_use_dst( + diff_dst_val, src_val, weights_val); + float diff_weight_res = src_val > 0 ? 
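+        // dL/dalpha contribution: zero where src > 0 (identity region), diff_dst * src where src <= 0.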
0 : (diff_dst_val * src_val); + store_float_value( + data_md().data_type(), diff_src_res, diff_src, data_off); + return diff_weight_res; + } + + void reduce_scalar(float *diff_weights, size_t i) const { + const size_t nthr = conf_.n_thr; + const dim_t work_amount = conf_.work_amount_src; + const int thread_count = nstl::min((dim_t)nthr, work_amount); + double s = 0; + int c = 0; + for (dim_t a = 0; a < thread_count; a++) { + auto la = load_float_value( + weights_md().data_type(), scratchpad_ptr(), i); + if (!std::isnan(la)) c = c + 1; + } + if (c == (thread_count)) { + for (dim_t j = 0; j < thread_count; j++) { + auto la = load_float_value( + weights_md().data_type(), scratchpad_ptr(), j); + s = s + la; + } + store_float_value( + weights_md().data_type(), s, diff_weights_ptr(), 0); + } + } + + void set_reduction_buffers( + const dim_t work_amount, dim_t &group_size, dim_t &buf_size) const { + float sqrt = std::sqrt(work_amount); + group_size = std::ceil(sqrt); + buf_size = std::floor(sqrt); + if (group_size * buf_size < work_amount) group_size++; + } + +private: + const sycl_md_t &data_md() const { return conf_.data_md; } + const sycl_md_t &weights_md() const { return conf_.weights_md; } + const sycl_md_t &diff_data_md() const { return conf_.diff_data_md; } + const sycl_md_t &diff_weights_md() const { return conf_.diff_weights_md; } + const sycl_md_t &diff_dst_md() const { return conf_.diff_dst_md; } + + float *data_ptr() const { return (float *)(data_.get_pointer()); } + float *weights_ptr() const { return (float *)(weights_.get_pointer()); } + float *diff_data_ptr() const { return (float *)(diff_data_.get_pointer()); } + float *diff_weights_ptr() const { + return (float *)(diff_weights_.get_pointer()); + } + float *diff_dst_ptr() const { return (float *)(diff_dst_.get_pointer()); } + float *scratchpad_ptr() const { + return (float *)(scratchpad_.get_pointer()); + } + + static dim_t offset(const sycl_md_t &mem, dims_t dims) { + const int ndims = mem.ndims(); + switch (ndims) { + case 1: return mem.off(dims[0]); + case 2: return mem.off(dims[0], dims[1]); + case 3: return mem.off(dims[0], dims[1], dims[2]); + case 4: return mem.off(dims[0], dims[1], dims[2], dims[3]); + case 5: return mem.off(dims[0], dims[1], dims[2], dims[3], dims[4]); + default: assert(!"Unsupported ndims count"); + } + return -1; + } + + static dim_t weights_offset( + const int mask, const sycl_md_t &mem, dims_t &dims) { + dims_t dims_w {}; + std::copy(dims, dims + max_supported_ndims, dims_w); + utils::apply_mask_on_dims(dims_w, mem.ndims(), mask); + return offset(mem, dims_w); + } + + void calculate_scalar(const float *src, const float *weights, + float *diff_weights, const float *diff_dst, float *diff_src, + size_t ithr) const { + + const size_t nthr = conf_.n_thr; + const dim_t work_amount = conf_.work_amount_src; + + if ((dim_t)ithr >= work_amount) return; + + dim_t start {0}, end {0}; + dims_t dims_d, off; + for (int i = 0; i < max_supported_ndims; i++) { + off[i] = 0; + dims_d[i] = (data_md().dims()[i] != 0) ? 
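+            // Pad unused trailing dims to 1 so the fixed-depth (up to 5D) iterator helpers below work for any ndims.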
data_md().dims()[i] : 1; + } + + balance211(work_amount, nthr, ithr, start, end); + const dim_t workload = end - start; + + if (conf_.ndims == 1) { + utils::nd_iterator_init(start, off[0], dims_d[0]); + } + if (conf_.ndims == 2) { + utils::nd_iterator_init( + start, off[0], dims_d[0], off[1], dims_d[1]); + } + if (conf_.ndims == 3) { + utils::nd_iterator_init(start, off[0], dims_d[0], off[1], dims_d[1], + off[2], dims_d[2]); + } + if (conf_.ndims == 4) { + utils::nd_iterator_init(start, off[0], dims_d[0], off[1], dims_d[1], + off[2], dims_d[2], off[3], dims_d[3]); + } + if (conf_.ndims == 5) { + utils::nd_iterator_init(start, off[0], dims_d[0], off[1], dims_d[1], + off[2], dims_d[2], off[3], dims_d[3], off[4], dims_d[4]); + } + + dim_t group_size, buf_size; + set_reduction_buffers(workload, group_size, buf_size); + dim_t offset_buf {0}, group_off {0}, data_size {buf_size}; + float s = 0; + float r = 0; + for (dim_t iwork = start; iwork < end; ++iwork) { + const auto data_off = offset(data_md(), off); + const auto weight_off = 0; + float src_val = load_float_value( + data_md().data_type(), data_ptr(), data_off); + float diff_dst_val = load_float_value( + diff_dst_md().data_type(), diff_dst_ptr(), data_off); + float weights_val = load_float_value( + weights_md().data_type(), weights_ptr(), weight_off); + float diff_src_res = ::dnnl::impl::math::relu_bwd_use_dst( + diff_dst_val, src_val, weights_val); + float diff_weight_res = src_val > 0 ? 0 : (diff_dst_val * src_val); + store_float_value( + data_md().data_type(), diff_src_res, diff_src, data_off); + + s = s + diff_weight_res; + if (++offset_buf == data_size) { + r = r + s; + offset_buf = 0; + group_off++; + s = 0; + data_size = ((group_off + 1) * buf_size <= workload) + ? buf_size + : workload - (group_off * buf_size); + } + if (conf_.ndims == 1) { + utils::nd_iterator_step(off[0], dims_d[0]); + } + if (conf_.ndims == 2) { + utils::nd_iterator_step(off[0], dims_d[0], off[1], dims_d[1]); + } + if (conf_.ndims == 3) { + utils::nd_iterator_step(off[0], dims_d[0], off[1], dims_d[1], + off[2], dims_d[2]); + } + if (conf_.ndims == 4) { + utils::nd_iterator_step(off[0], dims_d[0], off[1], dims_d[1], + off[2], dims_d[2], off[3], dims_d[3]); + } + if (conf_.ndims == 5) { + utils::nd_iterator_step(off[0], dims_d[0], off[1], dims_d[1], + off[2], dims_d[2], off[3], dims_d[3], off[4], + dims_d[4]); + } + } + store_float_value(weights_md().data_type(), r, scratchpad_ptr(), ithr); + } + + void calculate_no_broadcast(const float *src, const float *weights, + float *diff_weights, const float *diff_dst, float *diff_src, + size_t ithr) const { + const size_t nthr = conf_.n_thr; + const dim_t work_amount = conf_.work_amount_src; + const int mask = conf_.mask; + + if ((dim_t)ithr >= work_amount) return; + + dim_t start {0}, end {0}; + dims_t dims_d, off; + for (int i = 0; i < max_supported_ndims; i++) { + off[i] = 0; + dims_d[i] = (data_md().dims()[i] != 0) ? 
data_md().dims()[i] : 1; + } + + balance211(work_amount, nthr, ithr, start, end); + if (conf_.ndims == 1) { + utils::nd_iterator_init(start, off[0], dims_d[0]); + } + if (conf_.ndims == 2) { + utils::nd_iterator_init( + start, off[0], dims_d[0], off[1], dims_d[1]); + } + if (conf_.ndims == 3) { + utils::nd_iterator_init(start, off[0], dims_d[0], off[1], dims_d[1], + off[2], dims_d[2]); + } + if (conf_.ndims == 4) { + utils::nd_iterator_init(start, off[0], dims_d[0], off[1], dims_d[1], + off[2], dims_d[2], off[3], dims_d[3]); + } + if (conf_.ndims == 5) { + utils::nd_iterator_init(start, off[0], dims_d[0], off[1], dims_d[1], + off[2], dims_d[2], off[3], dims_d[3], off[4], dims_d[4]); + } + + for (dim_t iwork = start; iwork < end; ++iwork) { + const auto data_off = offset(data_md(), off); + const auto weight_off = weights_offset(mask, weights_md(), off); + const auto res = ker( + src, weights, diff_dst, diff_src, data_off, weight_off); + + store_float_value( + weights_md().data_type(), res, diff_weights, weight_off); + if (conf_.ndims == 1) { + utils::nd_iterator_step(off[0], dims_d[0]); + } + if (conf_.ndims == 2) { + utils::nd_iterator_step(off[0], dims_d[0], off[1], dims_d[1]); + } + if (conf_.ndims == 3) { + utils::nd_iterator_step(off[0], dims_d[0], off[1], dims_d[1], + off[2], dims_d[2]); + } + if (conf_.ndims == 4) { + utils::nd_iterator_step(off[0], dims_d[0], off[1], dims_d[1], + off[2], dims_d[2], off[3], dims_d[3]); + } + if (conf_.ndims == 5) { + utils::nd_iterator_step(off[0], dims_d[0], off[1], dims_d[1], + off[2], dims_d[2], off[3], dims_d[3], off[4], + dims_d[4]); + } + } + } + + void calculate_shared_axes(const float *src, const float *weights, + float *diff_weights, const float *diff_dst, float *diff_src, + size_t ith, ::sycl::nd_item<1> item) const { + + size_t ithr = item.get_group(0) * conf_.wg_size + item.get_local_id(); + dims_t dims_d, dims_w; + for (int i = 0; i < max_supported_ndims; i++) { + dims_d[i] = (data_md().dims()[i] != 0) ? data_md().dims()[i] : 1; + dims_w[i] = (weights_md().dims()[i] != 0) ? weights_md().dims()[i] + : 1; + } + + const size_t nthr = conf_.n_thr; + const dim_t work_amount = conf_.work_amount; + if ((dim_t)ithr >= work_amount) return; + dim_t start {0}, end {0}; + balance211(work_amount, nthr, ithr, start, end); + dim_t group_size, buf_size; + const dim_t workload = conf_.work_load; + set_reduction_buffers(workload, group_size, buf_size); + + dims_t off_w, off_d, dims_start, dims_end; + + if (conf_.ndims == 1) { + utils::nd_iterator_init(start, off_w[0], dims_w[0]); + } + if (conf_.ndims == 2) { + utils::nd_iterator_init( + start, off_w[0], dims_w[0], off_w[1], dims_w[1]); + } + if (conf_.ndims == 3) { + utils::nd_iterator_init(start, off_w[0], dims_w[0], off_w[1], + dims_w[1], off_w[2], dims_w[2]); + } + if (conf_.ndims == 4) { + utils::nd_iterator_init(start, off_w[0], dims_w[0], off_w[1], + dims_w[1], off_w[2], dims_w[2], off_w[3], dims_w[3]); + } + if (conf_.ndims == 5) { + utils::nd_iterator_init(start, off_w[0], dims_w[0], off_w[1], + dims_w[1], off_w[2], dims_w[2], off_w[3], dims_w[3], + off_w[4], dims_w[4]); + } + + for (dim_t iwork = start; iwork < end; ++iwork) { + auto weight_off = offset(weights_md(), off_w); + + for (int i = 0; i < max_supported_ndims; i++) { + dims_start[i] = (dims_d[i] == dims_w[i]) ? off_w[i] : 0; + dims_end[i] + = (dims_d[i] == dims_w[i]) ? 
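+                // Axes where data and weights extents match stay pinned to the current weights index; broadcast axes (weights extent 1) sweep the full data range.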
off_w[i] + 1 : dims_d[i]; + } + dim_t buf_off {0}, group_off {0}, data_size {buf_size}; + + float res = 0; + float st = 0; + for_(off_d[0] = dims_start[0]; off_d[0] < dims_end[0]; ++off_d[0]) + for_(off_d[1] = dims_start[1]; off_d[1] < dims_end[1]; ++off_d[1]) + for_(off_d[2] = dims_start[2]; off_d[2] < dims_end[2]; ++off_d[2]) + for_(off_d[3] = dims_start[3]; off_d[3] < dims_end[3]; ++off_d[3]) + for (off_d[4] = dims_start[4]; off_d[4] < dims_end[4]; ++off_d[4]) { + auto data_off = offset(data_md(), off_d); + const auto diff_weight = ker( + src, weights, diff_dst, diff_src, data_off, weight_off); + st = st + diff_weight; + buf_off = buf_off + 1; + if (buf_off == data_size) { + group_off = group_off + 1; + buf_off = 0; + data_size = ((group_off + 1) * buf_size <= workload) + ? buf_size + : workload - (group_off * buf_size); + + res = res + st; + st = 0; + } + } + + store_float_value(weights_md().data_type(), res, diff_weights_ptr(), + weight_off); + if (conf_.ndims == 1) { + utils::nd_iterator_step(off_w[0], dims_w[0]); + } + if (conf_.ndims == 2) { + utils::nd_iterator_step( + off_w[0], dims_w[0], off_w[1], dims_w[1]); + } + if (conf_.ndims == 3) { + utils::nd_iterator_step(off_w[0], dims_w[0], off_w[1], + dims_w[1], off_w[2], dims_w[2]); + } + if (conf_.ndims == 4) { + utils::nd_iterator_step(off_w[0], dims_w[0], off_w[1], + dims_w[1], off_w[2], dims_w[2], off_w[3], dims_w[3]); + } + if (conf_.ndims == 5) { + utils::nd_iterator_step(off_w[0], dims_w[0], off_w[1], + dims_w[1], off_w[2], dims_w[2], off_w[3], dims_w[3], + off_w[4], dims_w[4]); + } + } + } + + sycl_prelu_conf_t conf_; + sycl_in_memory_arg_t data_; + sycl_out_memory_arg_t diff_data_; + sycl_in_memory_arg_t weights_; + sycl_out_memory_arg_t diff_weights_; + sycl_in_memory_arg_t diff_dst_; + sycl_out_memory_arg_t scratchpad_; +}; + +} // namespace sycl +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/gpu/sycl/ref_prelu.cpp b/src/gpu/sycl/ref_prelu.cpp new file mode 100644 index 00000000000..f2128694ecb --- /dev/null +++ b/src/gpu/sycl/ref_prelu.cpp @@ -0,0 +1,136 @@ +/******************************************************************************* +* Copyright 2023 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#include "gpu/sycl/ref_prelu.hpp" +#include "common/utils.hpp" +#include "gpu/sycl/prelu_kernels.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace sycl { + +using namespace impl::sycl; + +status_t ref_prelu_fwd_t::pd_t::init_conf() { + if (has_zero_dim_memory()) return status::success; + + conf_ = sycl_prelu_conf_t(); + + const memory_desc_wrapper data_d(src_md(0)); + const memory_desc_wrapper weights_d(weights_md(0)); + conf_.data_md = sycl_md_t(src_md(0)); + conf_.weights_md = sycl_md_t(weights_md(0)); + conf_.dst_md = sycl_md_t(dst_md(0)); + conf_.ndims = ndims(); + conf_.mask = utils::get_dims_mask(data_d.dims(), weights_d.dims(), ndims()); + + conf_.block_size = 16; + conf_.wg_size = 32; + conf_.work_amount = memory_desc_wrapper(src_md(0)).nelems(); + conf_.work_amount_wei = memory_desc_wrapper(weights_md(0)).nelems(); + int work_per_wg = conf_.wg_size * conf_.block_size; + int n_wgs = (conf_.work_amount + work_per_wg - 1) / work_per_wg; + conf_.n_thr = n_wgs * conf_.wg_size; + + return status::success; +} + +status_t ref_prelu_fwd_t::init(engine_t *engine) { + const auto kid = ::sycl::get_kernel_id<prelu_fwd_kernel_vec_t>(); + return create_kernel(engine, kid, &kernel_); +} + +status_t ref_prelu_fwd_t::execute_forward(const exec_ctx_t &ctx) const { + if (pd()->has_zero_dim_memory()) return status::success; + + return parallel_for(ctx, kernel_, [&](::sycl::handler &cgh) { + auto data = CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_SRC); + auto weights = CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_WEIGHTS); + auto dst = CTX_OUT_SYCL_KERNEL_MEMORY(DNNL_ARG_DST); + auto nelems_A = memory_desc_wrapper(pd()->src_md(0)).nelems(); + int tot_work = nelems_A; + prelu_fwd_kernel_vec_t prelu_fwd_kernel( + pd()->conf_, data, weights, dst); + const int block_size = pd()->conf_.block_size; + const int wg_size = pd()->conf_.wg_size; + int work_per_wg = wg_size * block_size; + int n_wgs = (tot_work + work_per_wg - 1) / work_per_wg; + int n_thr = n_wgs * wg_size; + cgh.parallel_for(::sycl::nd_range<1>(n_thr, wg_size), prelu_fwd_kernel); + }); +} + +status_t ref_prelu_bwd_t::pd_t::init_conf() { + if (has_zero_dim_memory()) return status::success; + conf_ = sycl_prelu_conf_t(); + conf_.data_md = sycl_md_t(src_md(0)); + conf_.weights_md = sycl_md_t(weights_md(0)); + conf_.diff_data_md = sycl_md_t(diff_src_md(0)); + conf_.diff_weights_md = sycl_md_t(diff_weights_md(0)); + conf_.diff_dst_md = sycl_md_t(diff_dst_md(0)); + conf_.ndims = ndims(); + + const memory_desc_wrapper weights_d(weights_md(0)); + const memory_desc_wrapper data_d(src_md(0)); + conf_.bcast_type = dnnl::impl::get_rhs_arg_broadcasting_strategy( + *weights_d.md_, data_d); + conf_.mask = utils::get_dims_mask(data_d.dims(), weights_d.dims(), ndims()); + conf_.block_size = 16; + conf_.wg_size = 32; + conf_.work_amount_src = memory_desc_wrapper(src_md(0)).nelems(); + conf_.work_amount = memory_desc_wrapper(weights_md(0)).nelems(); + conf_.work_load = conf_.work_amount_src / conf_.work_amount; + int work_per_wg = conf_.wg_size * conf_.block_size; + int n_wgs = (conf_.work_amount_src + work_per_wg - 1) / work_per_wg; + conf_.n_thr = n_wgs * conf_.wg_size; + + return status::success; +} + +status_t ref_prelu_bwd_t::init(engine_t *engine) { + const auto kid = ::sycl::get_kernel_id<prelu_bwd_kernel_vec_t>(); + return create_kernel(engine, kid, &kernel_); +} + +status_t ref_prelu_bwd_t::execute_backward(const exec_ctx_t &ctx) const { + if (pd()->has_zero_dim_memory()) return status::success; + + return
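+    // Single ND-range launch; the kernel branches on conf_.bcast_type, and the scalar strategy accumulates per-thread partial diff_weights in the scratchpad before reduction.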
parallel_for(ctx, kernel_, [&](::sycl::handler &cgh) { + auto data = CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_SRC); + auto weights = CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_WEIGHTS); + auto diff_data = CTX_OUT_SYCL_KERNEL_MEMORY(DNNL_ARG_DIFF_SRC); + auto diff_weights = CTX_OUT_SYCL_KERNEL_MEMORY(DNNL_ARG_DIFF_WEIGHTS); + auto diff_dst = CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_DIFF_DST); + auto scratchpad = CTX_OUT_SYCL_KERNEL_MEMORY(DNNL_ARG_SCRATCHPAD); + auto nelems_A = memory_desc_wrapper(pd()->src_md(0)).nelems(); + int tot_work = nelems_A; + + prelu_bwd_kernel_vec_t prelu_bwd_kernel(pd()->conf_, data, diff_data, + weights, diff_weights, diff_dst, scratchpad); + const int block_size = pd()->conf_.block_size; + const int wg_size = pd()->conf_.wg_size; + int work_per_wg = wg_size * block_size; + int n_wgs = (tot_work + work_per_wg - 1) / work_per_wg; + int n_thr = n_wgs * wg_size; + cgh.parallel_for(::sycl::nd_range<1>(n_thr, wg_size), prelu_bwd_kernel); + }); +} + +} // namespace sycl +} // namespace gpu +} // namespace impl +} // namespace dnnl diff --git a/src/gpu/sycl/ref_prelu.hpp b/src/gpu/sycl/ref_prelu.hpp new file mode 100644 index 00000000000..5ce0ae22090 --- /dev/null +++ b/src/gpu/sycl/ref_prelu.hpp @@ -0,0 +1,121 @@ +/******************************************************************************* +* Copyright 2023 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#ifndef GPU_SYCL_REF_PRELU_HPP +#define GPU_SYCL_REF_PRELU_HPP + +#include "common/broadcast_strategy.hpp" +#include "gpu/gpu_prelu_pd.hpp" +#include "gpu/sycl/prelu_kernels.hpp" +#include "gpu/sycl/sycl_gpu_primitive.hpp" +#include "gpu/sycl/sycl_io_helper.hpp" +#include "gpu/sycl/sycl_post_ops.hpp" +#include "gpu/sycl/sycl_primitive_conf.hpp" +#include "gpu/sycl/sycl_q10n.hpp" +#include "gpu/sycl/sycl_types.hpp" +#include "sycl/sycl_stream.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace sycl { + +struct ref_prelu_fwd_t : public sycl_gpu_primitive_t { + using sycl_gpu_primitive_t::sycl_gpu_primitive_t; + + struct pd_t : public gpu_prelu_fwd_pd_t { + using gpu_prelu_fwd_pd_t::gpu_prelu_fwd_pd_t; + + DECLARE_COMMON_PD_T("dpcpp:ref:any", ref_prelu_fwd_t); + + status_t init(engine_t *engine) { + using namespace data_type; + + const memory_desc_wrapper data_d(src_md(0)); + const memory_desc_wrapper weights_d(weights_md(0)); + const memory_desc_wrapper dst_d(dst_md(0)); + + const bool ok = is_fwd() && set_default_formats() + && (src_md(0)->format_desc.blocking.inner_nblks == 0) + && (weights_md(0)->format_desc.blocking.inner_nblks == 0); + + if (!ok) return status::unimplemented; + return init_conf(); + } + + status_t init_conf(); + sycl_prelu_conf_t conf_; + }; + + status_t init(engine_t *engine) override; + status_t execute(const exec_ctx_t &ctx) const override { + return execute_forward(ctx); + } + +private: + status_t execute_forward(const exec_ctx_t &ctx) const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } + compute::kernel_t kernel_; +}; + +struct ref_prelu_bwd_t : public sycl_gpu_primitive_t { + using sycl_gpu_primitive_t::sycl_gpu_primitive_t; + + struct pd_t : public gpu_prelu_bwd_pd_t { + using gpu_prelu_bwd_pd_t::gpu_prelu_bwd_pd_t; + + DECLARE_COMMON_PD_T("dpcpp:ref:any", ref_prelu_bwd_t); + + status_t init(engine_t *engine) { + using namespace data_type; + const memory_desc_wrapper data_d(src_md(0)); + const memory_desc_wrapper weights_d(weights_md(0)); + const memory_desc_wrapper diff_data_d(diff_src_md(0)); + const memory_desc_wrapper diff_weights_d(diff_weights_md(0)); + const memory_desc_wrapper diff_dst_d(diff_dst_md(0)); + + const bool ok = !is_fwd() && set_default_formats() + && (src_md(0)->format_desc.blocking.inner_nblks == 0) + && (weights_md(0)->format_desc.blocking.inner_nblks == 0) + && diff_src_md(0)->data_type == src_md(0)->data_type + && diff_weights_md(0)->data_type + == weights_md(0)->data_type; + + if (!ok) return status::unimplemented; + return init_conf(); + } + + status_t init_conf(); + sycl_prelu_conf_t conf_; + }; + + status_t init(engine_t *engine) override; + status_t execute(const exec_ctx_t &ctx) const override { + return execute_backward(ctx); + } + +private: + status_t execute_backward(const exec_ctx_t &ctx) const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } + compute::kernel_t kernel_; +}; + +} // namespace sycl +} // namespace gpu +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/gpu/sycl/sycl_primitive_conf.hpp b/src/gpu/sycl/sycl_primitive_conf.hpp index 3037a054fe4..8f4e6071c62 100644 --- a/src/gpu/sycl/sycl_primitive_conf.hpp +++ b/src/gpu/sycl/sycl_primitive_conf.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2022 Intel Corporation +* Copyright 2022-2023 Intel Corporation * * Licensed under the 
Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #ifndef GPU_SYCL_SYCL_PRIMITIVE_CONF_HPP #define GPU_SYCL_SYCL_PRIMITIVE_CONF_HPP +#include "common/broadcast_strategy.hpp" #include "gpu/sycl/sycl_post_ops.hpp" #include "gpu/sycl/sycl_types.hpp" @@ -43,6 +44,30 @@ struct sycl_binary_conf_t { sycl_post_ops_t post_ops; }; +struct sycl_prelu_conf_t { + prop_kind_t prop_kind; + sycl_md_t data_md; + sycl_md_t dst_md; + sycl_md_t weights_md; + sycl_md_t diff_data_md; + sycl_md_t diff_dst_md; + sycl_md_t diff_weights_md; + dim_t work_amount; + dim_t work_amount_wei; + dim_t work_amount_src; + dim_t work_load; + bool reduce_diff_weights = 0; + int mask; + float sum; + broadcasting_strategy_t bcast_type; + int ndims; + int block_size; + int wg_size; + size_t n_thr; + size_t i_thr; +}; + +CHECK_SYCL_KERNEL_ARG_TYPE(sycl_prelu_conf_t); CHECK_SYCL_KERNEL_ARG_TYPE(sycl_binary_conf_t); } // namespace sycl diff --git a/src/gpu/sycl/sycl_types.hpp b/src/gpu/sycl/sycl_types.hpp index edadb7fcb0c..9c41d16db1d 100644 --- a/src/gpu/sycl/sycl_types.hpp +++ b/src/gpu/sycl/sycl_types.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2022 Intel Corporation +* Copyright 2022-2023 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -133,6 +133,12 @@ struct sycl_md_t { #undef CHECK_AND_ASSIGN } + template <typename... Args> + dim_t off(Args... args) const { + dims_t pos = {args...}; + return off_v(pos, false); + } + dim_t off_v(const dims_t pos, bool is_pos_padded = false) const { dims_t pos_copy = {0}; for (int d = 0; d < ndims(); ++d)