From 9a4f95e345c87d4b00696c291d6fc55016520097 Mon Sep 17 00:00:00 2001
From: Svetlozar Georgiev
Date: Wed, 7 Aug 2024 16:44:26 +0100
Subject: [PATCH] gpu: amd: Enable SYCL kernels

---
 cmake/options.cmake                      | 13 +++++
 examples/CMakeLists.txt                  |  4 ++
 src/CMakeLists.txt                       |  4 ++
 src/gpu/amd/README.md                    |  5 ++
 src/gpu/generic/CMakeLists.txt           |  7 ++-
 src/gpu/generic/sycl/CMakeLists.txt      |  4 ++
 src/gpu/generic/sycl/README.md           | 95 ++++++++++++++++++++++++++++
 src/gpu/gpu_batch_normalization_list.cpp |  5 +-
 src/gpu/gpu_binary_list.cpp              |  5 +-
 src/gpu/gpu_convolution_list.cpp         |  5 +-
 src/gpu/gpu_eltwise_list.cpp             |  5 +-
 src/gpu/gpu_impl_list.hpp                |  3 +-
 src/gpu/gpu_layer_normalization_list.cpp |  2 +-
 src/gpu/gpu_lrn_list.cpp                 |  5 +-
 src/gpu/gpu_pooling_list.cpp             |  5 +-
 src/gpu/gpu_prelu_list.cpp               |  2 +-
 src/gpu/gpu_reorder_list.cpp             |  4 +-
 src/gpu/gpu_resampling_list.cpp          |  5 +-
 src/gpu/gpu_shuffle_list.cpp             |  2 +-
 src/gpu/gpu_softmax_list.cpp             |  5 +-
 src/gpu/gpu_sum_list.cpp                 |  5 +-
 src/gpu/nvidia/README.md                 | 94 +++++++++++++-------------
 22 files changed, 204 insertions(+), 80 deletions(-)
 create mode 100644 src/gpu/generic/sycl/README.md

diff --git a/cmake/options.cmake b/cmake/options.cmake
index cd1c8be3c56..a87ac7a481b 100644
--- a/cmake/options.cmake
+++ b/cmake/options.cmake
@@ -159,6 +159,14 @@ set(ONEDNN_ENABLE_GEMM_KERNELS_ISA "ALL" CACHE STRING
     SSE41 < AVX2 < AVX512 < AMX (or ALL). It means that if user selects, e.g.
     AVX2 ISA, SSE41 kernels will also present at build time.")
 
+set(DNNL_AMD_SYCL_KERNELS_TARGET_ARCH "" CACHE STRING
+    "Specifies the target architecture (e.g. gfx90a when compiling on AMD MI210)
+    to be used for compiling generic SYCL kernels for AMD vendor.
+    When this option is set to a valid architecture (see LLVM target column in
+    https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/system-requirements.html#supported-gpus
+    for supported architectures), the generic SYCL kernels will be enabled for AMD
+    vendor. If not set, the SYCL kernels will not be compiled.")
+
 # =============
 # Optimizations
 # =============
@@ -327,6 +335,11 @@ else()
     set(DNNL_WITH_SYCL false)
 endif()
 
+if(DNNL_SYCL_HIP AND NOT "${DNNL_AMD_SYCL_KERNELS_TARGET_ARCH}" STREQUAL "")
+    add_definitions(-DDNNL_AMD_ENABLE_SYCL_KERNELS=1)
+    set(DNNL_AMD_ENABLE_SYCL_KERNELS TRUE)
+endif()
+
 # =============
 # Miscellaneous
 # =============
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 5a25d585a2b..cfdf8318812 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -27,6 +27,10 @@ if(DNNL_SYCL_CUDA)
     append(CMAKE_CXX_FLAGS "-Wno-linker-warnings")
 endif()
 
+if (DNNL_AMD_ENABLE_SYCL_KERNELS)
+    append(CMAKE_CXX_FLAGS "-fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=${DNNL_AMD_SYCL_KERNELS_TARGET_ARCH}")
+endif()
+
 # propagate sanitizer flags
 append(CMAKE_C_FLAGS "${CMAKE_CCXX_SANITIZER_FLAGS}")
 append(CMAKE_CXX_FLAGS "${CMAKE_CCXX_SANITIZER_FLAGS}")
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 34dd79441de..b32851872d5 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -143,6 +143,10 @@ if(DNNL_WITH_SYCL)
         append(CMAKE_SHARED_LINKER_FLAGS "-fsycl-targets=nvptx64-nvidia-cuda")
         append(CMAKE_STATIC_LINKER_FLAGS "-fsycl-targets=nvptx64-nvidia-cuda")
     endif()
+    if(DNNL_AMD_ENABLE_SYCL_KERNELS)
+        append(CMAKE_SHARED_LINKER_FLAGS "-fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=${DNNL_AMD_SYCL_KERNELS_TARGET_ARCH}")
+        append(CMAKE_STATIC_LINKER_FLAGS "-fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=${DNNL_AMD_SYCL_KERNELS_TARGET_ARCH}")
+    endif()
 endif()
 
 if(ONEDNN_BUILD_GRAPH)
diff --git a/src/gpu/amd/README.md b/src/gpu/amd/README.md
index 27f34f901f8..c4fd1025946 100644
--- a/src/gpu/amd/README.md
+++ b/src/gpu/amd/README.md
@@ -343,3 +343,8 @@ The `miopenTransform` function is the equivalent of oneDNN reorder function.
 * Per dimension scaling is not supported (a single alpha and beta value is
   accepted by the transform tensor function).
 * Supported data types: `f32`
+
+### Other Primitives
+
+Some missing primitives/features are supported through
+[generic SYCL kernels](../generic/sycl/README.md).
diff --git a/src/gpu/generic/CMakeLists.txt b/src/gpu/generic/CMakeLists.txt
index 252244e4100..0c3ded76e25 100644
--- a/src/gpu/generic/CMakeLists.txt
+++ b/src/gpu/generic/CMakeLists.txt
@@ -19,11 +19,16 @@ file(GLOB SOURCES
     ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
 )
 
-# The generic SYCL kernels are currently enabled for NVIDIA vendor only.
 if(DNNL_SYCL_CUDA)
     add_subdirectory(sycl)
 endif()
 
+# Only enable the generic SYCL kernels for AMD vendor
+# if a target architecture has been specified.
+if(DNNL_AMD_ENABLE_SYCL_KERNELS)
+    add_subdirectory(sycl)
+endif()
+
 set(OBJ_LIB ${LIB_PACKAGE_NAME}_gpu_generic)
 add_library(${OBJ_LIB} OBJECT ${SOURCES})
 set_property(GLOBAL APPEND PROPERTY DNNL_LIB_DEPS
diff --git a/src/gpu/generic/sycl/CMakeLists.txt b/src/gpu/generic/sycl/CMakeLists.txt
index d8ac9461e26..6effe7c7faa 100644
--- a/src/gpu/generic/sycl/CMakeLists.txt
+++ b/src/gpu/generic/sycl/CMakeLists.txt
@@ -27,6 +27,10 @@ if(DNNL_SYCL_CUDA)
     append(CMAKE_CXX_FLAGS "-Wno-linker-warnings")
 endif()
 
+if(DNNL_AMD_ENABLE_SYCL_KERNELS)
+    append(CMAKE_CXX_FLAGS "-fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=${DNNL_AMD_SYCL_KERNELS_TARGET_ARCH}")
+endif()
+
 set(OBJ_LIB ${LIB_PACKAGE_NAME}_gpu_generic_sycl)
 add_library(${OBJ_LIB} OBJECT ${SOURCES})
 set_property(GLOBAL APPEND PROPERTY DNNL_LIB_DEPS
diff --git a/src/gpu/generic/sycl/README.md b/src/gpu/generic/sycl/README.md
new file mode 100644
index 00000000000..c5441fe7fb9
--- /dev/null
+++ b/src/gpu/generic/sycl/README.md
@@ -0,0 +1,95 @@
+# Supported Primitives
+
+## Batch Normalization
+
+The implementation supports both forward and backward directions.
+
+* Supported formats: `NCDHW`, `NDHWC`, `NCHW`, `NHWC`, `NCW`, `NWC`, `NC`
+
+## Eltwise
+
+The implementation supports both forward and backward directions.
+
+* Supported algorithms: `abs`, `clip`, `clip_v2`, `elu`, `exp`, `gelu_erf`,
+`gelu_tanh`, `hardsigmoid`, `hardswish`, `linear`, `log`, `logistic`, `mish`,
+`pow`, `relu`, `round`, `soft_relu`, `sqrt`, `square`, `swish` and `tanh`
+* Supported formats: `NCDHW`, `NDHWC`, `NCHW`, `NHWC`, `NCW`, `NWC`, `NC`, `N`
+
+## LRN
+
+The implementation supports both forward and backward directions.
+
+* Supported formats: `NCDHW`, `NDHWC`, `NCHW`, `NHWC`, `NCW`, `NWC`, `NC`
+
+## Pooling
+
+The implementation supports both forward and backward directions.
+
+* Supported formats: `NCDHW`, `NDHWC`, `NCHW`, `NHWC`, `NCW`, `NWC`
+
+## PReLU
+
+The implementation supports both forward and backward propagations.
+
+* Supported formats: `NCDHW`, `NDHWC`, `NCHW`, `NHWC`, `NCW`, `NWC`, `NC`
+
+* Forward pass supports `f32`, `f16`, `bf16`, `s8` and `u8` data types
+* Backward pass supports `f32` and `bf16` data types
+
+## Reorder
+
+* Format support limitations: blocked formats are not supported
+* Supported data types: `f32`, `bf16`, `f16`, `s8`, `u8`
+
+## Resampling
+
+The implementation supports both forward and backward directions.
+
+* Supported formats: `NCDHW`, `NDHWC`, `NCHW`, `NHWC`, `NCW`, `NWC`
+
+## Softmax/LogSoftmax
+
+The implementation supports both forward and backward directions.
+
+* Supported formats: `NCDHW`, `NDHWC`, `NCHW`, `NHWC`, `NCW`, `NWC`, `NC`
+
+## Shuffle
+
+The implementation supports both forward and backward propagations.
+
+* Supported formats: `NCDHW`, `NDHWC`, `NCHW`, `NHWC`, `NCW`, `NWC`, `NC`
+
+* Forward pass supports `f32`, `f16`, `bf16` and `s8` data types.
+* Backward pass supports `f32` and `bf16` data types.
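+
+## Usage Sketch
+
+The kernels above are reached through the regular oneDNN API once a build
+enables them (for AMD, for example, a DPC++ build configured with
+`-DDNNL_GPU_VENDOR=AMD` and `-DDNNL_AMD_SYCL_KERNELS_TARGET_ARCH=gfx90a`).
+A minimal sketch, assuming an `f32` eltwise ReLU in `NHWC` layout; the
+shapes and GPU engine index 0 are placeholder values:
+
+```cpp
+#include "oneapi/dnnl/dnnl.hpp"
+
+int main() {
+    // SYCL-based GPU engine and its stream.
+    dnnl::engine eng(dnnl::engine::kind::gpu, 0);
+    dnnl::stream strm(eng);
+
+    // f32 tensor in NHWC, one of the layouts listed above.
+    dnnl::memory::desc md({1, 8, 16, 16}, dnnl::memory::data_type::f32,
+            dnnl::memory::format_tag::nhwc);
+    dnnl::memory src(md, eng), dst(md, eng);
+
+    // Eltwise ReLU, forward inference.
+    auto pd = dnnl::eltwise_forward::primitive_desc(eng,
+            dnnl::prop_kind::forward_inference,
+            dnnl::algorithm::eltwise_relu, md, md, 0.f, 0.f);
+    dnnl::eltwise_forward(pd).execute(
+            strm, {{DNNL_ARG_SRC, src}, {DNNL_ARG_DST, dst}});
+    strm.wait();
+    return 0;
+}
+```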
diff --git a/src/gpu/gpu_batch_normalization_list.cpp b/src/gpu/gpu_batch_normalization_list.cpp
index e37b4edd65a..6f3a68f032b 100644
--- a/src/gpu/gpu_batch_normalization_list.cpp
+++ b/src/gpu/gpu_batch_normalization_list.cpp
@@ -30,7 +30,6 @@
 #endif
 
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA
-#include "gpu/generic/sycl/ref_batch_normalization.hpp"
 #include "gpu/nvidia/cudnn_batch_normalization.hpp"
 #endif
 
@@ -38,6 +37,10 @@
 #include "gpu/amd/miopen_batch_normalization.hpp"
 #endif
 
+#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA || DNNL_GPU_VENDOR == DNNL_VENDOR_AMD
+#include "gpu/generic/sycl/ref_batch_normalization.hpp"
+#endif
+
 namespace dnnl {
 namespace impl {
 namespace gpu {
diff --git a/src/gpu/gpu_binary_list.cpp b/src/gpu/gpu_binary_list.cpp
index 965b9fc51fe..57c9a8084aa 100644
--- a/src/gpu/gpu_binary_list.cpp
+++ b/src/gpu/gpu_binary_list.cpp
@@ -23,7 +23,6 @@
 #endif
 
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA
-#include "gpu/generic/sycl/ref_binary.hpp"
 #include "gpu/nvidia/cudnn_binary.hpp"
 #endif
 
@@ -31,6 +30,10 @@
 #include "gpu/amd/miopen_binary.hpp"
 #endif
 
+#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA || DNNL_GPU_VENDOR == DNNL_VENDOR_AMD
+#include "gpu/generic/sycl/ref_binary.hpp"
+#endif
+
 namespace dnnl {
 namespace impl {
 namespace gpu {
diff --git a/src/gpu/gpu_convolution_list.cpp b/src/gpu/gpu_convolution_list.cpp
index f4a00690735..4268100759f 100644
--- a/src/gpu/gpu_convolution_list.cpp
+++ b/src/gpu/gpu_convolution_list.cpp
@@ -29,7 +29,6 @@
 #endif
 
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA
-#include "gpu/generic/sycl/ref_convolution.hpp"
 #include "gpu/nvidia/cudnn_convolution.hpp"
 #endif
 
@@ -37,6 +36,10 @@
 #include "gpu/amd/miopen_convolution.hpp"
 #endif
 
+#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA || DNNL_GPU_VENDOR == DNNL_VENDOR_AMD
+#include "gpu/generic/sycl/ref_convolution.hpp"
+#endif
+
 namespace dnnl {
 namespace impl {
 namespace gpu {
diff --git a/src/gpu/gpu_eltwise_list.cpp b/src/gpu/gpu_eltwise_list.cpp
index 9250e1c43a6..c609dfd9ca3 100644
--- a/src/gpu/gpu_eltwise_list.cpp
+++ b/src/gpu/gpu_eltwise_list.cpp
@@ -22,7 +22,6 @@
 #endif
 
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA
-#include "gpu/generic/sycl/ref_eltwise.hpp"
 #include "gpu/nvidia/cudnn_eltwise.hpp"
 #endif
 
@@ -30,6 +29,10 @@
 #include "gpu/amd/miopen_eltwise.hpp"
 #endif
 
+#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA || DNNL_GPU_VENDOR == DNNL_VENDOR_AMD
+#include "gpu/generic/sycl/ref_eltwise.hpp"
+#endif
+
 namespace dnnl {
 namespace impl {
 namespace gpu {
diff --git a/src/gpu/gpu_impl_list.hpp b/src/gpu/gpu_impl_list.hpp
index 992ac436428..2292d5a933b 100644
--- a/src/gpu/gpu_impl_list.hpp
+++ b/src/gpu/gpu_impl_list.hpp
@@ -77,7 +77,8 @@ namespace gpu {
 // NOTE: Support for the standalone GENERIC vendor has not been added yet.
 #if defined(DNNL_WITH_SYCL) \
         && ((DNNL_GPU_VENDOR == DNNL_VENDOR_GENERIC) \
-                || (DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA))
+                || (DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA) \
+                || (DNNL_AMD_ENABLE_SYCL_KERNELS == 1))
 #define DNNL_GPU_GENERIC_SYCL_ONLY(...) __VA_ARGS__
 #else
 #define DNNL_GPU_GENERIC_SYCL_ONLY(...)
diff --git a/src/gpu/gpu_layer_normalization_list.cpp b/src/gpu/gpu_layer_normalization_list.cpp
index 719f8df0129..31a5004c95f 100644
--- a/src/gpu/gpu_layer_normalization_list.cpp
+++ b/src/gpu/gpu_layer_normalization_list.cpp
@@ -24,7 +24,7 @@
 #include "gpu/intel/ocl/vectorized_lnorm.hpp"
 #endif
 
-#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA
+#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA || DNNL_GPU_VENDOR == DNNL_VENDOR_AMD
 #include "gpu/generic/sycl/ref_layer_normalizations.hpp"
 #endif
 
diff --git a/src/gpu/gpu_lrn_list.cpp b/src/gpu/gpu_lrn_list.cpp
index 40a6dd688c7..9c3ee48ccc3 100644
--- a/src/gpu/gpu_lrn_list.cpp
+++ b/src/gpu/gpu_lrn_list.cpp
@@ -21,7 +21,6 @@
 #endif
 
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA
-#include "gpu/generic/sycl/ref_lrn.hpp"
 #include "gpu/nvidia/cudnn_lrn.hpp"
 #endif
 
@@ -29,6 +28,10 @@
 #include "gpu/amd/miopen_lrn.hpp"
 #endif
 
+#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA || DNNL_GPU_VENDOR == DNNL_VENDOR_AMD
+#include "gpu/generic/sycl/ref_lrn.hpp"
+#endif
+
 namespace dnnl {
 namespace impl {
 namespace gpu {
diff --git a/src/gpu/gpu_pooling_list.cpp b/src/gpu/gpu_pooling_list.cpp
index 12da2459cc6..779dbb10958 100644
--- a/src/gpu/gpu_pooling_list.cpp
+++ b/src/gpu/gpu_pooling_list.cpp
@@ -26,7 +26,6 @@
 #endif
 
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA
-#include "gpu/generic/sycl/ref_pooling.hpp"
 #include "gpu/nvidia/cudnn_pooling.hpp"
 #endif
 
@@ -34,6 +33,10 @@
 #include "gpu/amd/miopen_pooling.hpp"
 #endif
 
+#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA || DNNL_GPU_VENDOR == DNNL_VENDOR_AMD
+#include "gpu/generic/sycl/ref_pooling.hpp"
+#endif
+
 namespace dnnl {
 namespace impl {
 namespace gpu {
diff --git a/src/gpu/gpu_prelu_list.cpp b/src/gpu/gpu_prelu_list.cpp
index a649197a6d2..f60dfa8d5d7 100644
--- a/src/gpu/gpu_prelu_list.cpp
+++ b/src/gpu/gpu_prelu_list.cpp
@@ -22,7 +22,7 @@
 #include "gpu/intel/ocl/ref_prelu.hpp"
 #endif
 
-#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA
+#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA || DNNL_GPU_VENDOR == DNNL_VENDOR_AMD
 #include "gpu/generic/sycl/ref_prelu.hpp"
 #endif
 
diff --git a/src/gpu/gpu_reorder_list.cpp b/src/gpu/gpu_reorder_list.cpp
index be52bad91f5..47af4d2832a 100644
--- a/src/gpu/gpu_reorder_list.cpp
+++ b/src/gpu/gpu_reorder_list.cpp
@@ -28,7 +28,6 @@
 #endif
 
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA
-#include "gpu/generic/sycl/ref_reorder.hpp"
 #include "gpu/nvidia/cudnn_reorder.hpp"
 #endif
 
@@ -36,6 +35,9 @@
 #include "gpu/amd/miopen_reorder.hpp"
 #endif
 
+#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA || DNNL_GPU_VENDOR == DNNL_VENDOR_AMD
+#include "gpu/generic/sycl/ref_reorder.hpp"
+#endif
 namespace dnnl {
 namespace impl {
 namespace gpu {
diff --git a/src/gpu/gpu_resampling_list.cpp b/src/gpu/gpu_resampling_list.cpp
index e9a623c0438..bfbc068be95 100644
--- a/src/gpu/gpu_resampling_list.cpp
+++ b/src/gpu/gpu_resampling_list.cpp
@@ -22,10 +22,13 @@
 #endif
 
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA
-#include "gpu/generic/sycl/ref_resampling.hpp"
 #include "gpu/nvidia/cudnn_resampling.hpp"
 #endif
 
+#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA || DNNL_GPU_VENDOR == DNNL_VENDOR_AMD
+#include "gpu/generic/sycl/ref_resampling.hpp"
+#endif
+
 namespace dnnl {
 namespace impl {
 namespace gpu {
diff --git a/src/gpu/gpu_shuffle_list.cpp b/src/gpu/gpu_shuffle_list.cpp
index fe3ef9c2e9c..ee08ee002ca 100644
--- a/src/gpu/gpu_shuffle_list.cpp
+++ b/src/gpu/gpu_shuffle_list.cpp
@@ -21,7 +21,7 @@
 #include "gpu/intel/ocl/shuffle_by_reorder.hpp"
 #endif
 
-#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA
+#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA || DNNL_GPU_VENDOR == DNNL_VENDOR_AMD
 #include "gpu/generic/sycl/ref_shuffle.hpp"
 #endif
 
diff --git a/src/gpu/gpu_softmax_list.cpp b/src/gpu/gpu_softmax_list.cpp
index 63a00ddfbc4..1e10f9709df 100644
--- a/src/gpu/gpu_softmax_list.cpp
+++ b/src/gpu/gpu_softmax_list.cpp
@@ -23,7 +23,6 @@
 #endif
 
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA
-#include "gpu/generic/sycl/ref_softmax.hpp"
 #include "gpu/nvidia/cudnn_softmax.hpp"
 #endif
 
@@ -31,6 +30,10 @@
 #include "gpu/amd/miopen_softmax.hpp"
 #endif
 
+#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA || DNNL_GPU_VENDOR == DNNL_VENDOR_AMD
+#include "gpu/generic/sycl/ref_softmax.hpp"
+#endif
+
 namespace dnnl {
 namespace impl {
 namespace gpu {
diff --git a/src/gpu/gpu_sum_list.cpp b/src/gpu/gpu_sum_list.cpp
index 1703a6b3909..ceb8966f9cf 100644
--- a/src/gpu/gpu_sum_list.cpp
+++ b/src/gpu/gpu_sum_list.cpp
@@ -30,9 +30,12 @@
 #endif
 
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA
+#include "gpu/nvidia/cudnn_sum.hpp"
+#endif
+
+#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA || DNNL_GPU_VENDOR == DNNL_VENDOR_AMD
 #include "gpu/generic/sycl/ref_sum.hpp"
 #include "gpu/generic/sycl/ref_sum_many_inputs.hpp"
-#include "gpu/nvidia/cudnn_sum.hpp"
 #endif
 
 namespace dnnl {
diff --git a/src/gpu/nvidia/README.md b/src/gpu/nvidia/README.md
index 45e2be0aebc..597adfaef37 100644
--- a/src/gpu/nvidia/README.md
+++ b/src/gpu/nvidia/README.md
@@ -115,13 +115,6 @@ normalization.
 `RELU`.
 * Backward pass supports `f32` and `bf16` data types.
 
-
-#### Using SYCL Kernels
-
-The implementation supports both forward and backward directions.
-
-* Supported formats: `NCDHW`, `NDHWC`, `NCHW`, `NHWC`, `NCW`, `NWC`, `NC`
-
 ##### Forward direction
 * Supported data types for source and destination: `f32`, `bf16`, `f16`, `s8`
 * Supported data types for mean and variance: `f32`
@@ -210,15 +203,6 @@ limitations when using Nvidia backend for eltwise primitive:
   not supported for `s8`.
 * Backward pass supports `f32` and `bf16` data types.
 
-#### Using SYCL Kernels
-
-The implementation supports both forward and backward directions.
-
-* Supported algorithms: `abs`, `clip`, `clip_v2`, `elu`, `exp`, `gelu_erf`,
-`gelu_tanh`, `hardsigmoid`, `hardswish`, `linear`, `log`, `logistic`, `mish`,
-`pow`, `relu`, `round`, `soft_relu`, `sqrt`, `square`,`swish` and `tanh`
-* Supported formats: `NCDHW`, `NDHWC`, `NCHW`, `NHWC`, `NCW`, `NWC`, `NC`, `N`
-
 ##### Forward Direction
 * Supported data types: `f32`, `bf16`, `f16`, `s32`, `s8` and `u8`
 * Supported post-ops: `binary`
@@ -284,12 +268,6 @@ backward propagation respectively.
 * cuDNN supports NCHW tensor formats for all valid dimensions. However, it
   does not support the NHWC tensor format for above 5 dimensions.
 
-#### Using SYCL Kernels
-
-The implementation supports both forward and backward directions.
-
-* Supported formats: `NCDHW`, `NDHWC`, `NCHW`, `NHWC`, `NCW`, `NWC`, `NC`
-
 ##### Forward Direction
 * Supported data types: `f32`, `bf16`, `f16`
 * Supported algorithms: `ACROSS`, `WITHIN`
@@ -332,12 +310,6 @@ backward propagation respectively.
 
 * Supported data type are `f32`, `f16`, `bf16` and `s8`.
 
-#### Using SYCL Kernels
-
-The implementation supports both forward and backward directions.
-
-* Supported formats: `NCDHW`, `NDHWC`, `NCHW`, `NHWC`, `NCW`, `NWC`
-
 ##### Forward Direction
 * Supported data types: `f32`, `bf16`, `f16`, `s8`, `u8` and `s32`
 * Supported post-ops: `binary`, `eltwise_linear`
@@ -347,17 +319,6 @@ The implementation supports both forward and backward directions.
 * Supported data types: `f32`, `bf16`, `f16`
 * Supported algorithms: `max`, `avg_p`, `avg_np`
 
-### PReLU
-
-The PReLU primitive (Leaky ReLU with a trainable alpha parameter) is implemented
-using SYCL kernels. The primitive supports both forward and backward
-propagations.
-
-* Supported formats: `NCDHW`, `NDHWC`, `NCHW`, `NHWC`, `NCW`, `NWC`, `NC`
-
-* Forward pass supports `f32`, `f16`, `bf16`, `s8` and `u8` data types
-* Backward pass supports `f32` and `bf16` data types
-
 ### Layer Normalization
 
 The Primitive layer normalization is implemented through SYCL kernels.The implementation supports both forward and backward directions.
@@ -377,7 +338,7 @@ The Primitive layer normalization is implemented through SYCL kernels.The implem
 ### Reorder
 
 The `cudnnTransform` function is the equivalent of oneDNN reorder function.
-However, there are some limitations when using SYCL_API-DNN reorder on Nvidia
+However, there are some limitations when using reorder on Nvidia
 GPU:
 
 * Per dimension scaling is not supported (a single alpha and beta value is
@@ -417,12 +378,6 @@ where all the data locations are normalized between `-1 <= (xi, yi) <= 1`.
   oneDNN, a dummy memory for `d_grid` is created and is deleted when the
   destructor of the primitive is called.
 
-#### Using SYCL Kernels
-
-The implementation supports both forward and backward directions.
-
-* Supported formats: `NCDHW`, `NDHWC`, `NCHW`, `NHWC`, `NCW`, `NWC`
-
 ##### Forward Direction
 * Supported data types: `f32`, `bf16`, `f16`, `s8`, `u8` and `s32`
 * Supported post-ops: `sum`, `eltwise`, `binary`
@@ -448,12 +403,6 @@ changed to `CUDNN_SOFTMAX_LOG`.
 * Forward pass supports `f32`, `f16`, `bf16` and `s8` data types.
 * Backward pass supports `f32` and `bf16` data types.
 
-#### Using SYCL Kernels
-
-The implementation supports both forward and backward directions.
-
-* Supported formats: `NCDHW`, `NDHWC`, `NCHW`, `NHWC`, `NCW`, `NWC`, `NC`
-
 ##### Forward Direction
 * Supported data types: `f32`, `bf16`, `f16`, `u8` and `s8`
 * Supported scales: common scales for `s8` and `u8` data types
@@ -466,18 +415,33 @@ The implementation supports both forward and backward directions.
 The sum operation uses the reorder primitive to sum tensors, so the same
 limitation as reorder applies here.
 
-### Shuffle
-
-The shuffle primitive is implemented using SYCL kernels.
-This primitive supports both forward and backward propagations.
-
-* Supported formats: `NCDHW`, `NDHWC`, `NCHW`, `NHWC`, `NCW`, `NWC`, `NC`
-
-* Forward pass supports `f32`, `f16`, `bf16` and `s8` data types.
-* Backward pass supports `f32` and `bf16` data types.
-
 ### Other primitives
 
-Rest primitives not listed above are not supported by Nvidia backend. This is
-likely due to either missed functionality in cuDNN or cuBLAS, or lack of
-priority in supporting of such functionality.
+Some missing primitives/features are supported through
+[generic SYCL kernels](../generic/sycl/README.md).
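+
+As an illustrative sketch (not an exhaustive reference), such a fallback
+primitive is created like any other oneDNN primitive; the shuffle below,
+with placeholder `NHWC`/`f32` sizes, is served by the generic SYCL kernels:
+
+```cpp
+#include "oneapi/dnnl/dnnl.hpp"
+
+int main() {
+    dnnl::engine eng(dnnl::engine::kind::gpu, 0);
+    dnnl::stream strm(eng);
+
+    dnnl::memory::desc md({1, 8, 4, 4}, dnnl::memory::data_type::f32,
+            dnnl::memory::format_tag::nhwc);
+    dnnl::memory src(md, eng), dst(md, eng);
+
+    // Shuffle the 8 channels (axis 1) in groups of 4.
+    auto pd = dnnl::shuffle_forward::primitive_desc(eng,
+            dnnl::prop_kind::forward_inference, md, md,
+            /*axis=*/1, /*group_size=*/4);
+    dnnl::shuffle_forward(pd).execute(
+            strm, {{DNNL_ARG_SRC, src}, {DNNL_ARG_DST, dst}});
+    strm.wait();
+    return 0;
+}
+```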