gpu: amd: Enable SYCL kernels

oneapi-src · Aug 7, 2024 · fe16e2e · fe16e2e
1 parent 2b97e32
commit fe16e2e
Show file tree

Hide file tree

Showing 19 changed files with 75 additions and 53 deletions.
diff --git a/cmake/options.cmake b/cmake/options.cmake
@@ -159,13 +159,13 @@ set(ONEDNN_ENABLE_GEMM_KERNELS_ISA "ALL" CACHE STRING
       SSE41 < AVX2 < AVX512 < AMX (or ALL). It means that if user selects, e.g.
       AVX2 ISA, SSE41 kernels will also present at build time.")
 
-set(DNNL_HIP_TARGET_ARCH "" CACHE STRING
-    "Specifies the target offload architecture. Required when compiling
-    sycl kernels for AMD target backend.")
-if("${DNNL_HIP_TARGET_ARCH}" STREQUAL "")
-    add_definitions(-DDNNL_HIP_DISABLE_SYCL_KERNELS=1)
-    set(DNNL_HIP_DISABLE_SYCL_KERNELS TRUE)
-endif()
+set(DNNL_AMD_SYCL_KERNELS_TARGET_ARCH "" CACHE STRING
+    "Specifies the target architecture (e.g. gfx90a when compiling on AMD MI210)
+    to be used for compiling generic SYCL kernels for AMD vendor.
+    When this option is set to a valid architecture (see LLVM target column in
+    https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/system-requirements.html#supported-gpus
+    for supported architectures), the generic SYCL kernels will be enabled for AMD
+    vendor. If not set, the SYCL kernels will not be compiled.")
 
 # =============
 # Optimizations
@@ -317,12 +317,8 @@ endif()
 
 if(DNNL_GPU_RUNTIME STREQUAL "DPCPP" OR DNNL_GPU_RUNTIME STREQUAL "SYCL")
     set(DNNL_GPU_SYCL true)
-    set(DNNL_SYCL_INTEL OFF)
     set(DNNL_SYCL_CUDA OFF)
     set(DNNL_SYCL_HIP OFF)
-    if(DNNL_GPU_VENDOR STREQUAL "INTEL")
-        set(DNNL_SYCL_INTEL ON)
-    endif()
     if(DNNL_GPU_VENDOR STREQUAL "NVIDIA")
         set(DNNL_SYCL_CUDA ON)
     endif()
@@ -339,6 +335,11 @@ else()
     set(DNNL_WITH_SYCL false)
 endif()
 
+if(DNNL_SYCL_HIP AND NOT "${DNNL_AMD_SYCL_KERNELS_TARGET_ARCH}" STREQUAL "")
+    add_definitions(-DDNNL_AMD_ENABLE_SYCL_KERNELS=1)
+    set(DNNL_AMD_ENABLE_SYCL_KERNELS TRUE)
+endif()
+
 # =============
 # Miscellaneous
 # =============

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
@@ -27,9 +27,8 @@ if(DNNL_SYCL_CUDA)
     append(CMAKE_CXX_FLAGS "-Wno-linker-warnings")
 endif()
 
-if(DNNL_SYCL_HIP AND NOT DNNL_HIP_DISABLE_SYCL_KERNELS)
-    append(CMAKE_CXX_FLAGS "-fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=${DNNL_HIP_TARGET_ARCH}")
-    append(CMAKE_CXX_FLAGS "-Wno-linker-warnings")
+if (DNNL_AMD_ENABLE_SYCL_KERNELS)
+    append(CMAKE_CXX_FLAGS "-fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=${DNNL_AMD_SYCL_KERNELS_TARGET_ARCH}")
 endif()
 
 # propagate sanitizer flags

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -143,9 +143,9 @@ if(DNNL_WITH_SYCL)
         append(CMAKE_SHARED_LINKER_FLAGS "-fsycl-targets=nvptx64-nvidia-cuda")
         append(CMAKE_STATIC_LINKER_FLAGS "-fsycl-targets=nvptx64-nvidia-cuda")
     endif()
-    if(DNNL_SYCL_HIP AND NOT DNNL_HIP_DISABLE_SYCL_KERNELS)
-        append(CMAKE_SHARED_LINKER_FLAGS "-fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=${DNNL_HIP_TARGET_ARCH}")
-        append(CMAKE_STATIC_LINKER_FLAGS "-fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=${DNNL_HIP_TARGET_ARCH}")
+    if(DNNL_AMD_ENABLE_SYCL_KERNELS)
+        append(CMAKE_SHARED_LINKER_FLAGS "-fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=${DNNL_AMD_SYCL_KERNELS_TARGET_ARCH}")
+        append(CMAKE_STATIC_LINKER_FLAGS "-fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=${DNNL_AMD_SYCL_KERNELS_TARGET_ARCH}")
     endif()
 endif()
 

diff --git a/src/gpu/generic/CMakeLists.txt b/src/gpu/generic/CMakeLists.txt
@@ -19,17 +19,13 @@ file(GLOB SOURCES
     ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
     )
 
-if(DNNL_SYCL_INTEL)
-    add_subdirectory(sycl)
-endif()
-
 if(DNNL_SYCL_CUDA)
     add_subdirectory(sycl)
 endif()
 
-# Only enable the generic SYCL kernels on AMD backend
-# if architecture has been specified
-if(DNNL_SYCL_HIP AND NOT DNNL_HIP_DISABLE_SYCL_KERNELS)
+# Only enable the generic SYCL kernels for AMD vendor
+# if target architecture has been specified
+if(DNNL_AMD_ENABLE_SYCL_KERNELS)
     add_subdirectory(sycl)
 endif()
 

diff --git a/src/gpu/generic/sycl/CMakeLists.txt b/src/gpu/generic/sycl/CMakeLists.txt
@@ -27,8 +27,8 @@ if(DNNL_SYCL_CUDA)
     append(CMAKE_CXX_FLAGS "-Wno-linker-warnings")
 endif()
 
-if(DNNL_SYCL_HIP AND NOT DNNL_HIP_DISABLE_SYCL_KERNELS)
-    append(CMAKE_CXX_FLAGS "-fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=${DNNL_HIP_TARGET_ARCH}")
+if(DNNL_AMD_ENABLE_SYCL_KERNELS)
+    append(CMAKE_CXX_FLAGS "-fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=${DNNL_AMD_SYCL_KERNELS_TARGET_ARCH}")
 endif()
 
 set(OBJ_LIB ${LIB_PACKAGE_NAME}_gpu_generic_sycl)

diff --git a/src/gpu/gpu_batch_normalization_list.cpp b/src/gpu/gpu_batch_normalization_list.cpp
@@ -16,8 +16,6 @@
 
 #include "gpu/gpu_impl_list.hpp"
 
-#include "gpu/generic/sycl/ref_batch_normalization.hpp"
-
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
 #include "gpu/intel/ocl/bnorm/gen9_batch_normalization.hpp"
 #include "gpu/intel/ocl/bnorm/nhwc_batch_normalization.hpp"
@@ -39,6 +37,10 @@
 #include "gpu/amd/miopen_batch_normalization.hpp"
 #endif
 
+#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA || DNNL_GPU_VENDOR == DNNL_VENDOR_AMD
+#include "gpu/generic/sycl/ref_batch_normalization.hpp"
+#endif
+
 namespace dnnl {
 namespace impl {
 namespace gpu {

diff --git a/src/gpu/gpu_binary_list.cpp b/src/gpu/gpu_binary_list.cpp
@@ -16,8 +16,6 @@
 
 #include "gpu/gpu_impl_list.hpp"
 
-#include "gpu/generic/sycl/ref_binary.hpp"
-
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
 #include "gpu/intel/ocl/gen9_binary.hpp"
 #include "gpu/intel/ocl/multi_po_reorder_binary.hpp"
@@ -32,6 +30,10 @@
 #include "gpu/amd/miopen_binary.hpp"
 #endif
 
+#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA || DNNL_GPU_VENDOR == DNNL_VENDOR_AMD
+#include "gpu/generic/sycl/ref_binary.hpp"
+#endif
+
 namespace dnnl {
 namespace impl {
 namespace gpu {

diff --git a/src/gpu/gpu_convolution_list.cpp b/src/gpu/gpu_convolution_list.cpp
@@ -16,8 +16,6 @@
 
 #include "gpu/gpu_impl_list.hpp"
 
-#include "gpu/generic/sycl/ref_convolution.hpp"
-
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
 #include "gpu/intel/jit/binary_format.hpp"
 #include "gpu/intel/jit/conv/gen_convolution.hpp"
@@ -38,6 +36,10 @@
 #include "gpu/amd/miopen_convolution.hpp"
 #endif
 
+#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA || DNNL_GPU_VENDOR == DNNL_VENDOR_AMD
+#include "gpu/generic/sycl/ref_convolution.hpp"
+#endif
+
 namespace dnnl {
 namespace impl {
 namespace gpu {

diff --git a/src/gpu/gpu_eltwise_list.cpp b/src/gpu/gpu_eltwise_list.cpp
@@ -16,8 +16,6 @@
 
 #include "gpu/gpu_impl_list.hpp"
 
-#include "gpu/generic/sycl/ref_eltwise.hpp"
-
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
 #include "gpu/intel/ocl/gen9_eltwise.hpp"
 #include "gpu/intel/ocl/ref_eltwise.hpp"
@@ -31,6 +29,10 @@
 #include "gpu/amd/miopen_eltwise.hpp"
 #endif
 
+#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA || DNNL_GPU_VENDOR == DNNL_VENDOR_AMD
+#include "gpu/generic/sycl/ref_eltwise.hpp"
+#endif
+
 namespace dnnl {
 namespace impl {
 namespace gpu {

diff --git a/src/gpu/gpu_impl_list.hpp b/src/gpu/gpu_impl_list.hpp
@@ -77,10 +77,8 @@ namespace gpu {
 // NOTE: Support for the standalone GENERIC vendor has not been added yet.
 #if defined(DNNL_WITH_SYCL) \
         && ((DNNL_GPU_VENDOR == DNNL_VENDOR_GENERIC) \
-                || (DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL) \
                 || (DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA) \
-                || (DNNL_GPU_VENDOR == DNNL_VENDOR_AMD \
-                        && DNNL_HIP_DISABLE_SYCL_KERNELS != 1))
+                || (DNNL_AMD_ENABLE_SYCL_KERNELS == 1))
 #define DNNL_GPU_GENERIC_SYCL_ONLY(...) __VA_ARGS__
 #else
 #define DNNL_GPU_GENERIC_SYCL_ONLY(...)

diff --git a/src/gpu/gpu_layer_normalization_list.cpp b/src/gpu/gpu_layer_normalization_list.cpp
@@ -16,8 +16,6 @@
 
 #include "gpu/gpu_impl_list.hpp"
 
-#include "gpu/generic/sycl/ref_layer_normalizations.hpp"
-
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
 #include "gpu/intel/ocl/ref_layer_normalization.hpp"
 #include "gpu/intel/ocl/reusable_lnorm.hpp"
@@ -26,6 +24,10 @@
 #include "gpu/intel/ocl/vectorized_lnorm.hpp"
 #endif
 
+#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA || DNNL_GPU_VENDOR == DNNL_VENDOR_AMD
+#include "gpu/generic/sycl/ref_layer_normalizations.hpp"
+#endif
+
 namespace dnnl {
 namespace impl {
 namespace gpu {

diff --git a/src/gpu/gpu_lrn_list.cpp b/src/gpu/gpu_lrn_list.cpp
@@ -16,8 +16,6 @@
 
 #include "gpu/gpu_impl_list.hpp"
 
-#include "gpu/generic/sycl/ref_lrn.hpp"
-
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
 #include "gpu/intel/ocl/ref_lrn.hpp"
 #endif
@@ -30,6 +28,10 @@
 #include "gpu/amd/miopen_lrn.hpp"
 #endif
 
+#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA || DNNL_GPU_VENDOR == DNNL_VENDOR_AMD
+#include "gpu/generic/sycl/ref_lrn.hpp"
+#endif
+
 namespace dnnl {
 namespace impl {
 namespace gpu {

diff --git a/src/gpu/gpu_pooling_list.cpp b/src/gpu/gpu_pooling_list.cpp
@@ -18,8 +18,6 @@
 
 #include "gpu/gpu_impl_list.hpp"
 
-#include "gpu/generic/sycl/ref_pooling.hpp"
-
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
 #include "gpu/intel/jit/pooling/gen_pooling.hpp"
 #include "gpu/intel/ocl/gen9_global_pooling.hpp"
@@ -35,6 +33,10 @@
 #include "gpu/amd/miopen_pooling.hpp"
 #endif
 
+#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA || DNNL_GPU_VENDOR == DNNL_VENDOR_AMD
+#include "gpu/generic/sycl/ref_pooling.hpp"
+#endif
+
 namespace dnnl {
 namespace impl {
 namespace gpu {

diff --git a/src/gpu/gpu_prelu_list.cpp b/src/gpu/gpu_prelu_list.cpp
@@ -16,13 +16,16 @@
 
 #include "common/compiler_workarounds.hpp"
 
-#include "gpu/generic/sycl/ref_prelu.hpp"
 #include "gpu/gpu_impl_list.hpp"
 
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
 #include "gpu/intel/ocl/ref_prelu.hpp"
 #endif
 
+#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA || DNNL_GPU_VENDOR == DNNL_VENDOR_AMD
+#include "gpu/generic/sycl/ref_prelu.hpp"
+#endif
+
 namespace dnnl {
 namespace impl {
 namespace gpu {

diff --git a/src/gpu/gpu_reorder_list.cpp b/src/gpu/gpu_reorder_list.cpp
@@ -18,7 +18,6 @@
 
 #include "gpu/generic/cross_engine_reorder.hpp"
 #include "gpu/generic/direct_copy.hpp"
-#include "gpu/generic/sycl/ref_reorder.hpp"
 
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
 #include "gpu/intel/jit/reorder/gen_reorder.hpp"
@@ -36,6 +35,9 @@
 #include "gpu/amd/miopen_reorder.hpp"
 #endif
 
+#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA || DNNL_GPU_VENDOR == DNNL_VENDOR_AMD
+#include "gpu/generic/sycl/ref_reorder.hpp"
+#endif
 namespace dnnl {
 namespace impl {
 namespace gpu {

diff --git a/src/gpu/gpu_resampling_list.cpp b/src/gpu/gpu_resampling_list.cpp
@@ -16,8 +16,6 @@
 
 #include "gpu/gpu_impl_list.hpp"
 
-#include "gpu/generic/sycl/ref_resampling.hpp"
-
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
 #include "gpu/intel/ocl/ref_resampling.hpp"
 #include "gpu/intel/ocl/vectorized_resampling.hpp"
@@ -27,6 +25,10 @@
 #include "gpu/nvidia/cudnn_resampling.hpp"
 #endif
 
+#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA || DNNL_GPU_VENDOR == DNNL_VENDOR_AMD
+#include "gpu/generic/sycl/ref_resampling.hpp"
+#endif
+
 namespace dnnl {
 namespace impl {
 namespace gpu {

diff --git a/src/gpu/gpu_shuffle_list.cpp b/src/gpu/gpu_shuffle_list.cpp
@@ -16,13 +16,15 @@
 
 #include "gpu/gpu_impl_list.hpp"
 
-#include "gpu/generic/sycl/ref_shuffle.hpp"
-
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
 #include "gpu/intel/ocl/ref_shuffle.hpp"
 #include "gpu/intel/ocl/shuffle_by_reorder.hpp"
 #endif
 
+#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA || DNNL_GPU_VENDOR == DNNL_VENDOR_AMD
+#include "gpu/generic/sycl/ref_shuffle.hpp"
+#endif
+
 namespace dnnl {
 namespace impl {
 namespace gpu {

diff --git a/src/gpu/gpu_softmax_list.cpp b/src/gpu/gpu_softmax_list.cpp
@@ -16,8 +16,6 @@
 
 #include "gpu/gpu_impl_list.hpp"
 
-#include "gpu/generic/sycl/ref_softmax.hpp"
-
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
 #include "gpu/intel/ocl/gen9_softmax.hpp"
 #include "gpu/intel/ocl/reusable_softmax.hpp"
@@ -32,6 +30,10 @@
 #include "gpu/amd/miopen_softmax.hpp"
 #endif
 
+#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA || DNNL_GPU_VENDOR == DNNL_VENDOR_AMD
+#include "gpu/generic/sycl/ref_softmax.hpp"
+#endif
+
 namespace dnnl {
 namespace impl {
 namespace gpu {

diff --git a/src/gpu/gpu_sum_list.cpp b/src/gpu/gpu_sum_list.cpp
@@ -20,8 +20,6 @@
 #include "gpu/gpu_sum_pd.hpp"
 
 #include "gpu/generic/ref_sum.hpp"
-#include "gpu/generic/sycl/ref_sum.hpp"
-#include "gpu/generic/sycl/ref_sum_many_inputs.hpp"
 
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
 #include "gpu/intel/jit/gen9_simple_sum.hpp"
@@ -35,6 +33,11 @@
 #include "gpu/nvidia/cudnn_sum.hpp"
 #endif
 
+#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA || DNNL_GPU_VENDOR == DNNL_VENDOR_AMD
+#include "gpu/generic/sycl/ref_sum.hpp"
+#include "gpu/generic/sycl/ref_sum_many_inputs.hpp"
+#endif
+
 namespace dnnl {
 namespace impl {
 namespace gpu {