From d4d58cf3b72abd295acc5ca9e56289f9d88852c0 Mon Sep 17 00:00:00 2001
From: Svetlozar Georgiev <55534064+sgeor255@users.noreply.github.com>
Date: Fri, 9 Aug 2024 21:31:32 +0100
Subject: [PATCH] gpu: amd: enable SYCL kernels (#2024)

Co-authored-by: Denis Samoilov
---
 cmake/options.cmake                      | 16 ++++++
 examples/CMakeLists.txt                  |  4 ++
 src/CMakeLists.txt                       |  4 ++
 src/gpu/amd/README.md                    |  5 ++
 src/gpu/generic/CMakeLists.txt           |  5 +-
 src/gpu/generic/sycl/CMakeLists.txt      |  4 ++
 src/gpu/generic/sycl/README.md           | 63 ++++++++++++++++++++++
 src/gpu/gpu_batch_normalization_list.cpp |  5 +-
 src/gpu/gpu_binary_list.cpp              |  5 +-
 src/gpu/gpu_convolution_list.cpp         |  5 +-
 src/gpu/gpu_eltwise_list.cpp             |  5 +-
 src/gpu/gpu_impl_list.hpp                |  3 +-
 src/gpu/gpu_layer_normalization_list.cpp |  2 +-
 src/gpu/gpu_lrn_list.cpp                 |  5 +-
 src/gpu/gpu_pooling_list.cpp             |  5 +-
 src/gpu/gpu_prelu_list.cpp               |  2 +-
 src/gpu/gpu_reorder_list.cpp             |  4 +-
 src/gpu/gpu_resampling_list.cpp          |  5 +-
 src/gpu/gpu_shuffle_list.cpp             |  2 +-
 src/gpu/gpu_softmax_list.cpp             |  5 +-
 src/gpu/gpu_sum_list.cpp                 |  5 +-
 src/gpu/nvidia/README.md                 | 68 ++----------------------
 22 files changed, 146 insertions(+), 81 deletions(-)
 create mode 100644 src/gpu/generic/sycl/README.md

diff --git a/cmake/options.cmake b/cmake/options.cmake
index cd1c8be3c56..71e58d81d7e 100644
--- a/cmake/options.cmake
+++ b/cmake/options.cmake
@@ -159,6 +159,17 @@ set(ONEDNN_ENABLE_GEMM_KERNELS_ISA "ALL" CACHE STRING
     SSE41 < AVX2 < AVX512 < AMX (or ALL). It means that if user selects, e.g.
     AVX2 ISA, SSE41 kernels will also present at build time.")
 
+set(DNNL_AMD_SYCL_KERNELS_TARGET_ARCH "" CACHE STRING
+    "Specifies the target architecture (e.g. gfx90a when compiling on AMD MI210)
+    to be used for compiling generic SYCL kernels for the AMD vendor.
+    When this option is set to a valid architecture (see the LLVM target column in
+    https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/system-requirements.html#supported-gpus
+    for supported architectures), the generic SYCL kernels will be enabled for
+    the AMD vendor. If not set, the SYCL kernels will not be compiled.
+    Warning: This option is temporary and will be removed as soon as the compiler
+    stops requiring the target architecture to be specified.
+    After removing the option, the generic SYCL kernels will always be enabled
+    for the AMD vendor.")
+
 # =============
 # Optimizations
 # =============
@@ -327,6 +338,11 @@ else()
     set(DNNL_WITH_SYCL false)
 endif()
 
+if(DNNL_SYCL_HIP AND NOT "${DNNL_AMD_SYCL_KERNELS_TARGET_ARCH}" STREQUAL "")
+    add_definitions(-DDNNL_AMD_ENABLE_SYCL_KERNELS=1)
+    set(DNNL_AMD_ENABLE_SYCL_KERNELS TRUE)
+endif()
+
 # =============
 # Miscellaneous
 # =============
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 5a25d585a2b..cfdf8318812 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -27,6 +27,10 @@ if(DNNL_SYCL_CUDA)
     append(CMAKE_CXX_FLAGS "-Wno-linker-warnings")
 endif()
 
+if(DNNL_AMD_ENABLE_SYCL_KERNELS)
+    append(CMAKE_CXX_FLAGS "-fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=${DNNL_AMD_SYCL_KERNELS_TARGET_ARCH}")
+endif()
+
 # propagate sanitizer flags
 append(CMAKE_C_FLAGS "${CMAKE_CCXX_SANITIZER_FLAGS}")
 append(CMAKE_CXX_FLAGS "${CMAKE_CCXX_SANITIZER_FLAGS}")
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 34dd79441de..b32851872d5 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -143,6 +143,10 @@ if(DNNL_WITH_SYCL)
         append(CMAKE_SHARED_LINKER_FLAGS "-fsycl-targets=nvptx64-nvidia-cuda")
         append(CMAKE_STATIC_LINKER_FLAGS "-fsycl-targets=nvptx64-nvidia-cuda")
     endif()
+    if(DNNL_AMD_ENABLE_SYCL_KERNELS)
+        append(CMAKE_SHARED_LINKER_FLAGS "-fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=${DNNL_AMD_SYCL_KERNELS_TARGET_ARCH}")
+        append(CMAKE_STATIC_LINKER_FLAGS "-fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=${DNNL_AMD_SYCL_KERNELS_TARGET_ARCH}")
+    endif()
 endif()
 
 if(ONEDNN_BUILD_GRAPH)
diff --git a/src/gpu/amd/README.md b/src/gpu/amd/README.md
index 27f34f901f8..c4fd1025946 100644
--- a/src/gpu/amd/README.md
+++ b/src/gpu/amd/README.md
@@ -343,3 +343,8 @@ The `miopenTransform` function is the equivalent of oneDNN reorder function.
 * Per dimension scaling is not supported (a single alpha and beta value is
   accepted by the transform tensor function).
 * Supported data types: `f32`
+
+### Other Primitives
+
+Some missing primitives/features are supported through
+[generic SYCL kernels](../generic/sycl/README.md).
diff --git a/src/gpu/generic/CMakeLists.txt b/src/gpu/generic/CMakeLists.txt
index 252244e4100..df1c2fcfaf8 100644
--- a/src/gpu/generic/CMakeLists.txt
+++ b/src/gpu/generic/CMakeLists.txt
@@ -19,8 +19,9 @@ file(GLOB SOURCES
     ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
 )
 
-# The generic SYCL kernels are currently enabled for NVIDIA vendor only.
-if(DNNL_SYCL_CUDA)
+# - Always enable the generic SYCL kernels for the NVIDIA vendor.
+# - Enable them for the AMD vendor only if a target architecture has been
+#   specified.
+if(DNNL_SYCL_CUDA OR DNNL_AMD_ENABLE_SYCL_KERNELS)
     add_subdirectory(sycl)
 endif()
diff --git a/src/gpu/generic/sycl/CMakeLists.txt b/src/gpu/generic/sycl/CMakeLists.txt
index d8ac9461e26..6effe7c7faa 100644
--- a/src/gpu/generic/sycl/CMakeLists.txt
+++ b/src/gpu/generic/sycl/CMakeLists.txt
@@ -27,6 +27,10 @@ if(DNNL_SYCL_CUDA)
     append(CMAKE_CXX_FLAGS "-Wno-linker-warnings")
 endif()
 
+if(DNNL_AMD_ENABLE_SYCL_KERNELS)
+    append(CMAKE_CXX_FLAGS "-fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=${DNNL_AMD_SYCL_KERNELS_TARGET_ARCH}")
+endif()
+
 set(OBJ_LIB ${LIB_PACKAGE_NAME}_gpu_generic_sycl)
 add_library(${OBJ_LIB} OBJECT ${SOURCES})
 set_property(GLOBAL APPEND PROPERTY DNNL_LIB_DEPS
diff --git a/src/gpu/generic/sycl/README.md b/src/gpu/generic/sycl/README.md
new file mode 100644
index 00000000000..c5441fe7fb9
--- /dev/null
+++ b/src/gpu/generic/sycl/README.md
@@ -0,0 +1,63 @@
+# Supported Primitives
+
+## Batch Normalization
+
+The implementation supports both forward and backward directions.
+
+* Supported formats: `NCDHW`, `NDHWC`, `NCHW`, `NHWC`, `NCW`, `NWC`, `NC`
+
+## Eltwise
+
+The implementation supports both forward and backward directions.
+
+* Supported algorithms: `abs`, `clip`, `clip_v2`, `elu`, `exp`, `gelu_erf`,
+`gelu_tanh`, `hardsigmoid`, `hardswish`, `linear`, `log`, `logistic`, `mish`,
+`pow`, `relu`, `round`, `soft_relu`, `sqrt`, `square`, `swish` and `tanh`
+* Supported formats: `NCDHW`, `NDHWC`, `NCHW`, `NHWC`, `NCW`, `NWC`, `NC`, `N`
+
+## LRN
+
+The implementation supports both forward and backward directions.
+
+* Supported formats: `NCDHW`, `NDHWC`, `NCHW`, `NHWC`, `NCW`, `NWC`, `NC`
+
+## Pooling
+
+The implementation supports both forward and backward directions.
+
+* Supported formats: `NCDHW`, `NDHWC`, `NCHW`, `NHWC`, `NCW`, `NWC`
+
+## PReLU
+
+The implementation supports both forward and backward propagations.
+
+* Supported formats: `NCDHW`, `NDHWC`, `NCHW`, `NHWC`, `NCW`, `NWC`, `NC`
+
+* Forward pass supports `f32`, `f16`, `bf16`, `s8` and `u8` data types
+* Backward pass supports `f32` and `bf16` data types
+
+## Reorder
+
+* Format support limitations: blocked formats are not supported
+* Supported data types: `f32`, `bf16`, `f16`, `s8`, `u8`
+
+## Resampling
+
+The implementation supports both forward and backward directions.
+
+* Supported formats: `NCDHW`, `NDHWC`, `NCHW`, `NHWC`, `NCW`, `NWC`
+
+## Softmax/LogSoftmax
+
+The implementation supports both forward and backward directions.
+
+* Supported formats: `NCDHW`, `NDHWC`, `NCHW`, `NHWC`, `NCW`, `NWC`, `NC`
+
+## Shuffle
+
+The implementation supports both forward and backward propagations.
+
+* Supported formats: `NCDHW`, `NDHWC`, `NCHW`, `NHWC`, `NCW`, `NWC`, `NC`
+
+* Forward pass supports `f32`, `f16`, `bf16` and `s8` data types.
+* Backward pass supports `f32` and `bf16` data types.
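Editor's note: for a sense of how the kernels documented above are reached, here is a minimal usage sketch (not part of the patch) built on oneDNN's public C++ API. It creates an eltwise ReLU primitive, one of the algorithms listed in the new README, on the first GPU engine; in a build configured with, e.g., `-DDNNL_AMD_SYCL_KERNELS_TARGET_ARCH=gfx90a`, an AMD device could dispatch to these generic SYCL kernels. The tensor shape and engine index are arbitrary.

```cpp
// Illustrative only -- not part of this patch. Assumes oneDNN built with a
// SYCL GPU runtime; the shape and engine index below are arbitrary.
#include <unordered_map>
#include "oneapi/dnnl/dnnl.hpp"

int main() {
    dnnl::engine eng(dnnl::engine::kind::gpu, 0); // first GPU device
    dnnl::stream strm(eng);

    // Plain (non-blocked) NHWC layout, matching the formats listed above.
    dnnl::memory::desc md({1, 16, 64, 64}, dnnl::memory::data_type::f32,
            dnnl::memory::format_tag::nhwc);

    // ReLU is one of the supported eltwise algorithms.
    auto pd = dnnl::eltwise_forward::primitive_desc(eng,
            dnnl::prop_kind::forward_inference, dnnl::algorithm::eltwise_relu,
            md, md, /*alpha=*/0.f);

    dnnl::memory src(md, eng), dst(md, eng);
    dnnl::eltwise_forward(pd).execute(
            strm, {{DNNL_ARG_SRC, src}, {DNNL_ARG_DST, dst}});
    strm.wait();
    return 0;
}
```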
diff --git a/src/gpu/gpu_batch_normalization_list.cpp b/src/gpu/gpu_batch_normalization_list.cpp
index e37b4edd65a..6f3a68f032b 100644
--- a/src/gpu/gpu_batch_normalization_list.cpp
+++ b/src/gpu/gpu_batch_normalization_list.cpp
@@ -30,7 +30,6 @@
 #endif
 
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA
-#include "gpu/generic/sycl/ref_batch_normalization.hpp"
 #include "gpu/nvidia/cudnn_batch_normalization.hpp"
 #endif
 
@@ -38,6 +37,10 @@
 #include "gpu/amd/miopen_batch_normalization.hpp"
 #endif
 
+#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA || DNNL_GPU_VENDOR == DNNL_VENDOR_AMD
+#include "gpu/generic/sycl/ref_batch_normalization.hpp"
+#endif
+
 namespace dnnl {
 namespace impl {
 namespace gpu {
diff --git a/src/gpu/gpu_binary_list.cpp b/src/gpu/gpu_binary_list.cpp
index 965b9fc51fe..57c9a8084aa 100644
--- a/src/gpu/gpu_binary_list.cpp
+++ b/src/gpu/gpu_binary_list.cpp
@@ -23,7 +23,6 @@
 #endif
 
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA
-#include "gpu/generic/sycl/ref_binary.hpp"
 #include "gpu/nvidia/cudnn_binary.hpp"
 #endif
 
@@ -31,6 +30,10 @@
 #include "gpu/amd/miopen_binary.hpp"
 #endif
 
+#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA || DNNL_GPU_VENDOR == DNNL_VENDOR_AMD
+#include "gpu/generic/sycl/ref_binary.hpp"
+#endif
+
 namespace dnnl {
 namespace impl {
 namespace gpu {
diff --git a/src/gpu/gpu_convolution_list.cpp b/src/gpu/gpu_convolution_list.cpp
index f4a00690735..4268100759f 100644
--- a/src/gpu/gpu_convolution_list.cpp
+++ b/src/gpu/gpu_convolution_list.cpp
@@ -29,7 +29,6 @@
 #endif
 
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA
-#include "gpu/generic/sycl/ref_convolution.hpp"
 #include "gpu/nvidia/cudnn_convolution.hpp"
 #endif
 
@@ -37,6 +36,10 @@
 #include "gpu/amd/miopen_convolution.hpp"
 #endif
 
+#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA || DNNL_GPU_VENDOR == DNNL_VENDOR_AMD
+#include "gpu/generic/sycl/ref_convolution.hpp"
+#endif
+
 namespace dnnl {
 namespace impl {
 namespace gpu {
diff --git a/src/gpu/gpu_eltwise_list.cpp b/src/gpu/gpu_eltwise_list.cpp
index 9250e1c43a6..c609dfd9ca3 100644
--- a/src/gpu/gpu_eltwise_list.cpp
+++ b/src/gpu/gpu_eltwise_list.cpp
@@ -22,7 +22,6 @@
 #endif
 
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA
-#include "gpu/generic/sycl/ref_eltwise.hpp"
 #include "gpu/nvidia/cudnn_eltwise.hpp"
 #endif
 
@@ -30,6 +29,10 @@
 #include "gpu/amd/miopen_eltwise.hpp"
 #endif
 
+#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA || DNNL_GPU_VENDOR == DNNL_VENDOR_AMD
+#include "gpu/generic/sycl/ref_eltwise.hpp"
+#endif
+
 namespace dnnl {
 namespace impl {
 namespace gpu {
diff --git a/src/gpu/gpu_impl_list.hpp b/src/gpu/gpu_impl_list.hpp
index 992ac436428..2292d5a933b 100644
--- a/src/gpu/gpu_impl_list.hpp
+++ b/src/gpu/gpu_impl_list.hpp
@@ -77,7 +77,8 @@ namespace gpu {
 // NOTE: Support for the standalone GENERIC vendor has not been added yet.
 #if defined(DNNL_WITH_SYCL) \
         && ((DNNL_GPU_VENDOR == DNNL_VENDOR_GENERIC) \
-                || (DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA))
+                || (DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA) \
+                || (DNNL_AMD_ENABLE_SYCL_KERNELS == 1))
 #define DNNL_GPU_GENERIC_SYCL_ONLY(...) __VA_ARGS__
 #else
 #define DNNL_GPU_GENERIC_SYCL_ONLY(...)
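Editor's note: the `DNNL_GPU_GENERIC_SYCL_ONLY` macro above compiles its arguments only when the guard holds, which this patch extends to AMD builds with `DNNL_AMD_ENABLE_SYCL_KERNELS`. Below is a hedged sketch of the pattern; everything in it except the macro itself (the entry type, list, and names) is a hypothetical stand-in for oneDNN's real implementation-list machinery.

```cpp
// Hypothetical stand-ins: only DNNL_GPU_GENERIC_SYCL_ONLY comes from the
// patch; the fallback definition here mirrors the enabled case above.
#ifndef DNNL_GPU_GENERIC_SYCL_ONLY
#define DNNL_GPU_GENERIC_SYCL_ONLY(...) __VA_ARGS__
#endif

struct impl_entry_t { const char *name; };

static const impl_entry_t softmax_impls[] = {
        {"vendor_specific_softmax"}, // e.g. a cuDNN- or MIOpen-backed entry
        // This entry is compiled in only when the generic SYCL kernels are
        // enabled (GENERIC or NVIDIA vendor, or AMD with the new option);
        // otherwise the macro expands to nothing and the entry disappears.
        DNNL_GPU_GENERIC_SYCL_ONLY({"generic_sycl_ref_softmax"}, )
        {nullptr}, // list terminator
};
```

The ordering matters: vendor-specific entries precede the generic SYCL entry, so the generic kernels act as a fallback rather than overriding tuned vendor implementations.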
diff --git a/src/gpu/gpu_layer_normalization_list.cpp b/src/gpu/gpu_layer_normalization_list.cpp
index 719f8df0129..31a5004c95f 100644
--- a/src/gpu/gpu_layer_normalization_list.cpp
+++ b/src/gpu/gpu_layer_normalization_list.cpp
@@ -24,7 +24,7 @@
 #include "gpu/intel/ocl/vectorized_lnorm.hpp"
 #endif
 
-#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA
+#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA || DNNL_GPU_VENDOR == DNNL_VENDOR_AMD
 #include "gpu/generic/sycl/ref_layer_normalizations.hpp"
 #endif
 
diff --git a/src/gpu/gpu_lrn_list.cpp b/src/gpu/gpu_lrn_list.cpp
index 40a6dd688c7..9c3ee48ccc3 100644
--- a/src/gpu/gpu_lrn_list.cpp
+++ b/src/gpu/gpu_lrn_list.cpp
@@ -21,7 +21,6 @@
 #endif
 
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA
-#include "gpu/generic/sycl/ref_lrn.hpp"
 #include "gpu/nvidia/cudnn_lrn.hpp"
 #endif
 
@@ -29,6 +28,10 @@
 #include "gpu/amd/miopen_lrn.hpp"
 #endif
 
+#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA || DNNL_GPU_VENDOR == DNNL_VENDOR_AMD
+#include "gpu/generic/sycl/ref_lrn.hpp"
+#endif
+
 namespace dnnl {
 namespace impl {
 namespace gpu {
diff --git a/src/gpu/gpu_pooling_list.cpp b/src/gpu/gpu_pooling_list.cpp
index 12da2459cc6..779dbb10958 100644
--- a/src/gpu/gpu_pooling_list.cpp
+++ b/src/gpu/gpu_pooling_list.cpp
@@ -26,7 +26,6 @@
 #endif
 
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA
-#include "gpu/generic/sycl/ref_pooling.hpp"
 #include "gpu/nvidia/cudnn_pooling.hpp"
 #endif
 
@@ -34,6 +33,10 @@
 #include "gpu/amd/miopen_pooling.hpp"
 #endif
 
+#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA || DNNL_GPU_VENDOR == DNNL_VENDOR_AMD
+#include "gpu/generic/sycl/ref_pooling.hpp"
+#endif
+
 namespace dnnl {
 namespace impl {
 namespace gpu {
diff --git a/src/gpu/gpu_prelu_list.cpp b/src/gpu/gpu_prelu_list.cpp
index a649197a6d2..f60dfa8d5d7 100644
--- a/src/gpu/gpu_prelu_list.cpp
+++ b/src/gpu/gpu_prelu_list.cpp
@@ -22,7 +22,7 @@
 #include "gpu/intel/ocl/ref_prelu.hpp"
 #endif
 
-#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA
+#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA || DNNL_GPU_VENDOR == DNNL_VENDOR_AMD
 #include "gpu/generic/sycl/ref_prelu.hpp"
 #endif
 
diff --git a/src/gpu/gpu_reorder_list.cpp b/src/gpu/gpu_reorder_list.cpp
index be52bad91f5..47af4d2832a 100644
--- a/src/gpu/gpu_reorder_list.cpp
+++ b/src/gpu/gpu_reorder_list.cpp
@@ -28,7 +28,6 @@
 #endif
 
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA
-#include "gpu/generic/sycl/ref_reorder.hpp"
 #include "gpu/nvidia/cudnn_reorder.hpp"
 #endif
 
@@ -36,6 +35,9 @@
 #include "gpu/amd/miopen_reorder.hpp"
 #endif
 
+#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA || DNNL_GPU_VENDOR == DNNL_VENDOR_AMD
+#include "gpu/generic/sycl/ref_reorder.hpp"
+#endif
 namespace dnnl {
 namespace impl {
 namespace gpu {
diff --git a/src/gpu/gpu_resampling_list.cpp b/src/gpu/gpu_resampling_list.cpp
index e9a623c0438..bfbc068be95 100644
--- a/src/gpu/gpu_resampling_list.cpp
+++ b/src/gpu/gpu_resampling_list.cpp
@@ -22,10 +22,13 @@
 #endif
 
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA
-#include "gpu/generic/sycl/ref_resampling.hpp"
 #include "gpu/nvidia/cudnn_resampling.hpp"
 #endif
 
+#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA || DNNL_GPU_VENDOR == DNNL_VENDOR_AMD
+#include "gpu/generic/sycl/ref_resampling.hpp"
+#endif
+
 namespace dnnl {
 namespace impl {
 namespace gpu {
diff --git a/src/gpu/gpu_shuffle_list.cpp b/src/gpu/gpu_shuffle_list.cpp
index fe3ef9c2e9c..ee08ee002ca 100644
--- a/src/gpu/gpu_shuffle_list.cpp
+++ b/src/gpu/gpu_shuffle_list.cpp
@@ -21,7 +21,7 @@
 #include "gpu/intel/ocl/shuffle_by_reorder.hpp"
 #endif
 
-#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA
+#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA || DNNL_GPU_VENDOR == DNNL_VENDOR_AMD
 #include "gpu/generic/sycl/ref_shuffle.hpp"
 #endif
 
diff --git a/src/gpu/gpu_softmax_list.cpp b/src/gpu/gpu_softmax_list.cpp
index 63a00ddfbc4..1e10f9709df 100644
--- a/src/gpu/gpu_softmax_list.cpp
+++ b/src/gpu/gpu_softmax_list.cpp
@@ -23,7 +23,6 @@
 #endif
 
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA
-#include "gpu/generic/sycl/ref_softmax.hpp"
 #include "gpu/nvidia/cudnn_softmax.hpp"
 #endif
 
@@ -31,6 +30,10 @@
 #include "gpu/amd/miopen_softmax.hpp"
 #endif
 
+#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA || DNNL_GPU_VENDOR == DNNL_VENDOR_AMD
+#include "gpu/generic/sycl/ref_softmax.hpp"
+#endif
+
 namespace dnnl {
 namespace impl {
 namespace gpu {
diff --git a/src/gpu/gpu_sum_list.cpp b/src/gpu/gpu_sum_list.cpp
index 1703a6b3909..ceb8966f9cf 100644
--- a/src/gpu/gpu_sum_list.cpp
+++ b/src/gpu/gpu_sum_list.cpp
@@ -30,9 +30,12 @@
 #endif
 
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA
+#include "gpu/nvidia/cudnn_sum.hpp"
+#endif
+
+#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA || DNNL_GPU_VENDOR == DNNL_VENDOR_AMD
 #include "gpu/generic/sycl/ref_sum.hpp"
 #include "gpu/generic/sycl/ref_sum_many_inputs.hpp"
-#include "gpu/nvidia/cudnn_sum.hpp"
 #endif
 
 namespace dnnl {
diff --git a/src/gpu/nvidia/README.md b/src/gpu/nvidia/README.md
index 45e2be0aebc..597adfaef37 100644
--- a/src/gpu/nvidia/README.md
+++ b/src/gpu/nvidia/README.md
@@ -115,13 +115,6 @@ normalization.
   `RELU`.
 * Backward pass supports `f32` and `bf16` data types.
 
-
-#### Using SYCL Kernels
-
-The implementation supports both forward and backward directions.
-
-* Supported formats: `NCDHW`, `NDHWC`, `NCHW`, `NHWC`, `NCW`, `NWC`, `NC`
-
 ##### Forward direction
 * Supported data types for source and destination: `f32`, `bf16`, `f16`, `s8`
 * Supported data types for mean and variance: `f32`
@@ -210,15 +203,6 @@ limitations when using Nvidia backend for eltwise primitive:
   not supported for `s8`.
 * Backward pass supports `f32` and `bf16` data types.
 
-#### Using SYCL Kernels
-
-The implementation supports both forward and backward directions.
-
-* Supported algorithms: `abs`, `clip`, `clip_v2`, `elu`, `exp`, `gelu_erf`,
-`gelu_tanh`, `hardsigmoid`, `hardswish`, `linear`, `log`, `logistic`, `mish`,
-`pow`, `relu`, `round`, `soft_relu`, `sqrt`, `square`,`swish` and `tanh`
-* Supported formats: `NCDHW`, `NDHWC`, `NCHW`, `NHWC`, `NCW`, `NWC`, `NC`, `N`
-
 ##### Forward Direction
 * Supported data types: `f32`, `bf16`, `f16`, `s32`, `s8` and `u8`
 * Supported post-ops: `binary`
@@ -284,12 +268,6 @@ backward propagation respectively.
 * cuDNN supports NCHW tensor formats for all valid dimensions. However, it
   does not support the NHWC tensor format for above 5 dimensions.
 
-#### Using SYCL Kernels
-
-The implementation supports both forward and backward directions.
-
-* Supported formats: `NCDHW`, `NDHWC`, `NCHW`, `NHWC`, `NCW`, `NWC`, `NC`
-
 ##### Forward Direction
 * Supported data types: `f32`, `bf16`, `f16`
 * Supported algorithms: `ACROSS`, `WITHIN`
@@ -332,12 +310,6 @@ backward propagation respectively.
 
 * Supported data type are `f32`, `f16`, `bf16` and `s8`.
 
-#### Using SYCL Kernels
-
-The implementation supports both forward and backward directions.
-
-* Supported formats: `NCDHW`, `NDHWC`, `NCHW`, `NHWC`, `NCW`, `NWC`
-
 ##### Forward Direction
 * Supported data types: `f32`, `bf16`, `f16`, `s8`, `u8` and `s32`
 * Supported post-ops: `binary`, `eltwise_linear`
@@ -347,17 +319,6 @@ The implementation supports both forward and backward directions.
 * Supported data types: `f32`, `bf16`, `f16`
 * Supported algorithms: `max`, `avg_p`, `avg_np`
 
-### PReLU
-
-The PReLU primitive (Leaky ReLU with a trainable alpha parameter) is implemented
-using SYCL kernels. The primitive supports both forward and backward
-propagations.
-
-* Supported formats: `NCDHW`, `NDHWC`, `NCHW`, `NHWC`, `NCW`, `NWC`, `NC`
-
-* Forward pass supports `f32`, `f16`, `bf16`, `s8` and `u8` data types
-* Backward pass supports `f32` and `bf16` data types
-
 ### Layer Normalization
 
 The Primitive layer normalization is implemented through SYCL kernels.The implementation supports both forward and backward directions.
@@ -377,7 +338,7 @@ The Primitive layer normalization is implemented through SYCL kernels.The implem
 ### Reorder
 
 The `cudnnTransform` function is the equivalent of oneDNN reorder function.
-However, there are some limitations when using SYCL_API-DNN reorder on Nvidia
+However, there are some limitations when using reorder on Nvidia
 GPU:
 
 * Per dimension scaling is not supported (a single alpha and beta value is
@@ -417,12 +378,6 @@ where all the data locations are normalized between `-1 <= (xi, yi) <= 1`.
 oneDNN, a dummy memory for `d_grid` is created and is deleted when the
 destructor of the primitive is called.
 
-#### Using SYCL Kernels
-
-The implementation supports both forward and backward directions.
-
-* Supported formats: `NCDHW`, `NDHWC`, `NCHW`, `NHWC`, `NCW`, `NWC`
-
 ##### Forward Direction
 * Supported data types: `f32`, `bf16`, `f16`, `s8`, `u8` and `s32`
 * Supported post-ops: `sum`, `eltwise`, `binary`
@@ -448,12 +403,6 @@ changed to `CUDNN_SOFTMAX_LOG`.
 * Forward pass supports `f32`, `f16`, `bf16` and `s8` data types.
 * Backward pass supports `f32` and `bf16` data types.
 
-#### Using SYCL Kernels
-
-The implementation supports both forward and backward directions.
-
-* Supported formats: `NCDHW`, `NDHWC`, `NCHW`, `NHWC`, `NCW`, `NWC`, `NC`
-
 ##### Forward Direction
 * Supported data types: `f32`, `bf16`, `f16`, `u8` and `s8`
 * Supported scales: common scales for `s8` and `u8` data types
@@ -466,18 +415,7 @@ The implementation supports both forward and backward directions.
 
 The sum operation uses the reorder primitive to sum tensors, so the same
 limitation as reorder applies here.
 
-### Shuffle
-
-The shuffle primitive is implemented using SYCL kernels.
-This primitive supports both forward and backward propagations.
-
-* Supported formats: `NCDHW`, `NDHWC`, `NCHW`, `NHWC`, `NCW`, `NWC`, `NC`
-
-* Forward pass supports `f32`, `f16`, `bf16` and `s8` data types.
-* Backward pass supports `f32` and `bf16` data types.
-
 ### Other primitives
 
-Rest primitives not listed above are not supported by Nvidia backend. This is
-likely due to either missed functionality in cuDNN or cuBLAS, or lack of
-priority in supporting of such functionality.
+Some missing primitives/features are supported through
+[generic SYCL kernels](../generic/sycl/README.md).
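Editor's note: to confirm which implementation a given build actually selects, the dispatched implementation name can be queried through the public API. `impl_info_str()` is a real oneDNN query, but the strings it returns are implementation-defined, so treat this as a verification sketch rather than a guaranteed output.

```cpp
// Sketch: print the implementation picked for a softmax primitive on the
// first GPU device; with the generic SYCL kernels enabled the reported name
// typically reflects a reference ("ref") SYCL implementation.
#include <iostream>
#include "oneapi/dnnl/dnnl.hpp"

int main() {
    dnnl::engine eng(dnnl::engine::kind::gpu, 0);
    dnnl::memory::desc md({8, 128}, dnnl::memory::data_type::f32,
            dnnl::memory::format_tag::nc);
    auto pd = dnnl::softmax_forward::primitive_desc(eng,
            dnnl::prop_kind::forward_inference,
            dnnl::algorithm::softmax_accurate, md, md, /*axis=*/1);
    std::cout << pd.impl_info_str() << "\n";
    return 0;
}
```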